experimental: add fdl (Full Data Logging) translator

NSR needs logging that is different than our existing changelog in several ways: * Full data, not just metadata * Pre-op, not post-op * High performance * Supports the concept of time-bounded "terms" Others (for example EC) might need the same thing. This patch adds such a translator. It also adds code to dump the resulting journals, and to replay them using syncops, plus (very rudimentary) tests for all of the above. Change-Id: I29680a1b4e0a9e7d5a8497fef302c46434b86636 Signed-off-by: Jeff Darcy <jdarcy@redhat.com> Reviewed-on: http://review.gluster.org/12450 Smoke: Gluster Build System <jenkins@build.gluster.com> CentOS-regression: Gluster Build System <jenkins@build.gluster.com> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
author: Jeff Darcy <jdarcy@redhat.com> 2016-02-08 13:30:49 -0500
committer: Jeff Darcy <jdarcy@redhat.com> 2016-02-13 05:13:07 -0800
commit: c458433041aafb48ae6d6e5fcf3e1e737dc3fda3 (patch)
tree: 33a03ca0c1f5faf58419de2c4ff4532752ddfb07
parent: da33097c3d6492e3b468b4347e47c70828fb4320 (diff)
30 files changed, 2269 insertions, 110 deletions
diff --git a/api/src/gfapi.aliases b/api/src/gfapi.aliases
index 40b6ed21192..7181dd2f6e8 100644
--- a/api/src/gfapi.aliases
+++ b/api/src/gfapi.aliases
@@ -140,3 +140,4 @@ _priv_glfs_resolve _glfs_resolve$GFAPI_PRIVATE_3.7.0
 _priv_glfs_process_upcall_event _glfs_process_upcall_event$GFAPI_PRIVATE_3.7.0
 
 _pub_glfs_h_lookupat _glfs_h_lookupat$GFAPI_3.7.4
+_pub_glfs_ipc_xd _glfs_ipc_xd$GFAPI_4.0.0
diff --git a/api/src/gfapi.map b/api/src/gfapi.map
index d42ae2b97af..b35984a088c 100644
--- a/api/src/gfapi.map
+++ b/api/src/gfapi.map
@@ -167,3 +167,8 @@ GFAPI_3.7.4 {
 	global:
 		glfs_h_lookupat;
 } GFAPI_PRIVATE_3.7.0;
+
+GFAPI_4.0.0 {
+	global:
+		glfs_ipc_xd;
+} GFAPI_3.7.4;
diff --git a/api/src/glfs.c b/api/src/glfs.c
index b151936a6e8..037b579225f 100644
--- a/api/src/glfs.c
+++ b/api/src/glfs.c
@@ -1233,7 +1233,7 @@ invalid_fs:
 GFAPI_SYMVER_PUBLIC_DEFAULT(glfs_get_volfile, 3.6.0);
 
 int
-pub_glfs_ipc (struct glfs *fs, int opcode)
+pub_glfs_ipc_xd (struct glfs *fs, int opcode, dict_t *xd_in, dict_t **xd_out)
 {
 	xlator_t        *subvol = NULL;
         int             ret = -1;
@@ -1248,7 +1248,7 @@ pub_glfs_ipc (struct glfs *fs, int opcode)
 		goto out;
 	}
 
-	ret = syncop_ipc (subvol, opcode, NULL, NULL);
+	ret = syncop_ipc (subvol, opcode, xd_in, xd_out);
         DECODE_SYNCOP_ERR (ret);
 
 out:
@@ -1259,4 +1259,12 @@ invalid_fs:
         return ret;
 }
 
+GFAPI_SYMVER_PUBLIC_DEFAULT(glfs_ipc_xd, 4.0.0);
+
+int
+pub_glfs_ipc (struct glfs *fs, int opcode)
+{
+        return pub_glfs_ipc_xd (fs, opcode, NULL, NULL); /* no xdata */
+}
+
 GFAPI_SYMVER_PUBLIC_DEFAULT(glfs_ipc, 3.7.0);
diff --git a/configure.ac b/configure.ac
index 5d2fe342b74..29e36648aac 100644
--- a/configure.ac
+++ b/configure.ac
@@ -117,6 +117,8 @@ AC_CONFIG_FILES([Makefile
                 xlators/features/Makefile
                 xlators/features/arbiter/Makefile
                 xlators/features/arbiter/src/Makefile
+                xlators/experimental/fdl/Makefile
+                xlators/experimental/fdl/src/Makefile
                 xlators/features/changelog/Makefile
                 xlators/features/changelog/src/Makefile
                 xlators/features/changelog/lib/Makefile
diff --git a/glusterfs.spec.in b/glusterfs.spec.in
index 53a65ae4fed..9f04bc37e10 100644
--- a/glusterfs.spec.in
+++ b/glusterfs.spec.in
@@ -952,6 +952,7 @@ fi
 %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/barrier.so
 %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/cdc.so
 %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/changelog.so
+%{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/experimental/fdl.so
 %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/gfid-access.so
 %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/read-only.so
 %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/shard.so
@@ -1217,6 +1218,8 @@ fi
 /usr/lib/firewalld/services/glusterfs.xml
 %endif
 
+%{_sbindir}/gf_logdump
+%{_sbindir}/gf_recon
 
 %changelog
 * Sat Jan 16 2016 Niels de Vos <ndevos@redhat.com>
diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c
index bbaca1e7277..c980e7bc640 100644
--- a/glusterfsd/src/glusterfsd.c
+++ b/glusterfsd/src/glusterfsd.c
@@ -1199,6 +1199,26 @@ parse_opts (int key, char *arg, struct argp_state *state)
         return 0;
 }
 
+gf_boolean_t
+should_call_fini (glusterfs_ctx_t *ctx, xlator_t *trav)
+{
+        /*
+         * Decides, for cleanup_and_exit, whether to run trav's fini.
+         * glusterd keeps its previous behavior of calling every fini;
+         * in any other process mode only experimental/fdl is called.
+         */
+        gf_boolean_t    wanted = _gf_false;
+
+        if (trav->fini) {
+                if (ctx->process_mode == GF_GLUSTERD_PROCESS) {
+                        wanted = _gf_true;
+                } else if (strcmp (trav->type, "experimental/fdl") == 0) {
+                        wanted = _gf_true;
+                }
+        }
+
+        return wanted;
+}
 
 void
 cleanup_and_exit (int signum)
@@ -1271,20 +1291,17 @@ cleanup_and_exit (int signum)
 
         /*call fini for glusterd xlator */
         /* TODO : Invoke fini for rest of the xlators */
-        if (ctx->process_mode == GF_GLUSTERD_PROCESS) {
-
-                trav = NULL;
-                if (ctx->active)
-                        trav = ctx->active->top;
-                while (trav) {
-                        if (trav->fini) {
-                                THIS = trav;
-                                trav->fini (trav);
-                        }
-                        trav = trav->next;
+        trav = NULL;
+        if (ctx->active)
+                trav = ctx->active->top;
+        while (trav) {
+                if (should_call_fini(ctx,trav)) {
+                        THIS = trav;
+                        trav->fini (trav);
                 }
-
+                trav = trav->next;
         }
+
         exit(0);
 }
 
diff --git a/libglusterfs/src/Makefile.am b/libglusterfs/src/Makefile.am
index 46e2e021134..c6d93c925ac 100644
--- a/libglusterfs/src/Makefile.am
+++ b/libglusterfs/src/Makefile.am
@@ -83,7 +83,7 @@ y.tab.h: graph.y
 defaults.c: defaults-tmpl.c generator.py gen-defaults.py
 	$(PYTHON) $(srcdir)/gen-defaults.py $(srcdir)/defaults-tmpl.c > $@
 
-CLEANFILES = graph.lex.c y.tab.c y.tab.h defaults.c
+CLEANFILES = $(nodist_libglusterfs_la_SOURCES)
 
 if UNITTEST
 CLEANFILES += *.gcda *.gcno *_xunit.xml
diff --git a/libglusterfs/src/call-stub.h b/libglusterfs/src/call-stub.h
index 01621368ee9..82a49c1d7b9 100644
--- a/libglusterfs/src/call-stub.h
+++ b/libglusterfs/src/call-stub.h
@@ -17,12 +17,15 @@
 #include "stack.h"
 #include "list.h"
 
-typedef struct {
+typedef struct _call_stub {
 	struct list_head list;
 	char wind;
 	call_frame_t *frame;
 	glusterfs_fop_t fop;
         struct mem_pool *stub_mem_pool; /* pointer to stub mempool in ctx_t */
+        uint32_t jnl_meta_len;
+        uint32_t jnl_data_len;
+        void (*serialize) (struct _call_stub *, char *, char *);
 
 	union {
 		fop_lookup_t lookup;
diff --git a/libglusterfs/src/generator.py b/libglusterfs/src/generator.py
index 5e8f6c29cd4..8be68337baa 100644..100755
--- a/libglusterfs/src/generator.py
+++ b/libglusterfs/src/generator.py
@@ -2,6 +2,65 @@
 
 import string
 
+# ops format: 'fop-arg' name type stub-field [nosync]
+#             'cbk-arg' name type
+#             'extra'   name type arg-str
+#             'journal' fop-type
+#             'link'    inode iatt
+#
+# 'role' indicates the significance of this line to the code generator (sort of
+# our own type).
+#
+# For fop-arg, we first need to know the name and the type of the arg so that
+# we can generate SHORT_ARGS (for function calls) and LONG_ARGS (for
+# declarations).  For code that uses stubs, we also need to know the name of
+# the stub field, which might be different than the argument itself.  Lastly,
+# for code that uses syncops, we need to know whether whoever wrote the syncop
+# for this fop "forgot" to include this argument.  (Editorial: this kind of
+# creeping inconsistency is why we should have used code generation for stubs
+# and syncops as well as defaults all along.)  To address this need, we use the
+# optional 'nosync' field for arguments (e.g. mkdir.umask) that we should skip
+# in generated syncop code.
+#
+# 'cbk-arg' is like fop-arg but simpler and used for generating callbacks
+# instead of fop functions.
+#
+# 'extra' is also like fop-arg, but it's another hack for syncops.  This time
+# the problem is that some of what would normally be *callback* arguments are
+# instead created in the caller and passed to the syncop.  We handle that by
+# adding an entry at the appropriate place in the fop-arg list, with the name
+# and type to generate a declaration and an argument string to generate the
+# actual syncop call.
+#
+# The mere presence of a 'journal' item is sufficient for most of the journal
+# code to recognize that it should do something.  However, reconciliation also
+# needs to decide how reconciliation builds the arguments it needs to call down
+# to the syncop layer, based on what's in the journal.  To do that, we divide
+# ops into three types and store those types in the ops table.  In general,
+# these three types work as follows.
+#
+#    For an fd-op, the GFID in the journal is used (in the loc.gfid field) to
+#    look up an inode, then an anonymous fd is found/created for that inode.
+#
+#    For an inode-op, the GFID in the journal is used the same way, but no fd
+#    is needed.
+#
+#    For an entry-op, the *parent* GFID and name from the journal are used to
+#    look up an inode (via loc.pargfid and loc.name respectively).
+#
+# The only places this seems to fall down are link and create.  In link,
+# which is generally an entry-op, the source is looked up as though it's an
+# inode-op.  In create, we have an fd argument but it's really a return
+# argument so we get a fresh inode instead of looking one up.  Those two cases
+# need to be handled as special cases in the reconciliation code.
+#
+# 'link' is (hopefully) the last of the journal/syncop hacks.  Much like
+# 'extra', some values that are returned as callback arguments in the normal
+# case are handled differently for syncops.  For syncops that create objects
+# (e.g. mkdir) we need to link those objects into our inode table.  The 'inode'
+# and 'iatt' fields here give us the information we need to construct the
+# proper inode_link call(s).
+
 ops = {}
 
 ops['fgetxattr'] = (
@@ -13,19 +72,21 @@ ops['fgetxattr'] = (
 )
 
 ops['fsetxattr'] = (
-	('fop-arg',	'fd',			'fd_t *'),
-	('fop-arg',	'dict',			'dict_t *'),
-	('fop-arg',	'flags',		'int32_t'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'fd',			'fd_t *',			'fd'),
+	('fop-arg',	'dict',			'dict_t *',			'xattr'),
+	('fop-arg',	'flags',		'int32_t',			'flags'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'fd-op'),
 )
 
 ops['setxattr'] = (
-	('fop-arg',	'loc',			'loc_t *'),
-	('fop-arg',	'dict',			'dict_t *'),
-	('fop-arg',	'flags',		'int32_t'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'loc',			'loc_t *',			'loc'),
+	('fop-arg',	'dict',			'dict_t *',			'xattr'),
+	('fop-arg',	'flags',		'int32_t',			'flags'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'inode-op'),
 )
 
 ops['statfs'] = (
@@ -73,16 +134,17 @@ ops['flush'] = (
 )
 
 ops['writev'] = (
-	('fop-arg',	'fd',			'fd_t *'),
-	('fop-arg',	'vector',		'struct iovec *'),
+	('fop-arg',	'fd',			'fd_t *',			'fd'),
+	('fop-arg',	'vector',		'struct iovec *',	'vector'),
 	('fop-arg',	'count',		'int32_t'),
-	('fop-arg',	'off',			'off_t'),
-	('fop-arg',	'flags',		'uint32_t'),
+	('fop-arg',	'off',			'off_t',			'offset'),
+	('fop-arg',	'flags',		'uint32_t',			'flags'),
 	('fop-arg',	'iobref',		'struct iobref *'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'prebuf',		'struct iatt *'),
 	('cbk-arg',	'postbuf',		'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'fd-op'),
 )
 
 ops['readv'] = (
@@ -108,96 +170,111 @@ ops['open'] = (
 )
 
 ops['create'] = (
-	('fop-arg',	'loc',			'loc_t *'),
-	('fop-arg',	'flags',		'int32_t'),
-	('fop-arg',	'mode',			'mode_t'),
-	('fop-arg',	'umask',		'mode_t'),
-	('fop-arg',	'fd',			'fd_t *'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'loc',			'loc_t *',			'loc'),
+	('fop-arg',	'flags',		'int32_t',			'flags'),
+	('fop-arg',	'mode',			'mode_t',			'mode'),
+	('fop-arg',	'umask',		'mode_t',			'umask',	'nosync'),
+	('fop-arg',	'fd',			'fd_t *',			'fd'),
+	('extra',	'iatt',			'struct iatt',		'&iatt'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'fd',			'fd_t *'),
 	('cbk-arg',	'inode',		'inode_t *'),
 	('cbk-arg',	'buf',			'struct iatt *'),
 	('cbk-arg',	'preparent',	'struct iatt *'),
 	('cbk-arg',	'postparent',	'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'entry-op'),
+	('link',	'loc.inode',	'&iatt'),
 )
 
 ops['link'] = (
-	('fop-arg',	'oldloc',		'loc_t *'),
-	('fop-arg',	'newloc',		'loc_t *'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'oldloc',		'loc_t *',			'loc'),
+	('fop-arg',	'newloc',		'loc_t *',			'loc2'),
+	('extra',	'iatt',			'struct iatt',		'&iatt'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'inode',		'inode_t *'),
 	('cbk-arg',	'buf',			'struct iatt *'),
 	('cbk-arg',	'preparent',	'struct iatt *'),
 	('cbk-arg',	'postparent',	'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'entry-op'),
 )
 
 ops['rename'] = (
-	('fop-arg',	'oldloc',		'loc_t *'),
-	('fop-arg',	'newloc',		'loc_t *'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'oldloc',		'loc_t *',			'loc'),
+	('fop-arg',	'newloc',		'loc_t *',			'loc2'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'buf',			'struct iatt *'),
 	('cbk-arg',	'preoldparent',	'struct iatt *'),
 	('cbk-arg',	'postoldparent','struct iatt *'),
 	('cbk-arg',	'prenewparent',	'struct iatt *'),
 	('cbk-arg',	'postnewparent','struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'entry-op'),
 )
 
 ops['symlink'] = (
-	('fop-arg',	'linkpath',		'const char *'),
-	('fop-arg',	'loc',			'loc_t *'),
-	('fop-arg',	'umask',		'mode_t'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'linkpath',		'const char *',		'linkname'),
+	('fop-arg',	'loc',			'loc_t *',			'loc'),
+	('fop-arg',	'umask',		'mode_t',			'mode',		'nosync'),
+	('extra',	'iatt',			'struct iatt',		'&iatt'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'inode',		'inode_t *'),
 	('cbk-arg',	'buf',			'struct iatt *'),
 	('cbk-arg',	'preparent',	'struct iatt *'),
 	('cbk-arg',	'postparent',	'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'entry-op'),
 )
 
 ops['rmdir'] = (
-	('fop-arg',	'loc',			'loc_t *'),
-	('fop-arg',	'flags',		'int32_t'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'loc',			'loc_t *',			'loc'),
+	('fop-arg',	'flags',		'int32_t',			'flags'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'preparent',	'struct iatt *'),
 	('cbk-arg',	'postparent',	'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'entry-op'),
 )
 
 ops['unlink'] = (
-	('fop-arg',	'loc',			'loc_t *'),
-	('fop-arg',	'flags',		'int32_t'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'loc',			'loc_t *',			'loc'),
+	('fop-arg',	'flags',		'int32_t',			'flags',	'nosync'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'preparent',	'struct iatt *'),
 	('cbk-arg',	'postparent',	'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'entry-op'),
 )
 
 ops['mkdir'] = (
-	('fop-arg',	'loc',			'loc_t *'),
-	('fop-arg',	'mode',			'mode_t'),
-	('fop-arg',	'umask',		'mode_t'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'loc',			'loc_t *',			'loc'),
+	('fop-arg',	'mode',			'mode_t',			'mode'),
+	('fop-arg',	'umask',		'mode_t',			'umask',	'nosync'),
+	('extra',	'iatt',			'struct iatt',		'&iatt'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'inode',		'inode_t *'),
 	('cbk-arg',	'buf',			'struct iatt *'),
 	('cbk-arg',	'preparent',	'struct iatt *'),
 	('cbk-arg',	'postparent',	'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'entry-op'),
+	('link',	'loc.inode',	'&iatt'),
 )
 
 ops['mknod'] = (
-	('fop-arg',	'loc',			'loc_t *'),
-	('fop-arg',	'mode',			'mode_t'),
-	('fop-arg',	'rdev',			'dev_t'),
-	('fop-arg',	'umask',		'mode_t'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'loc',			'loc_t *',			'loc'),
+	('fop-arg',	'mode',			'mode_t',			'mode'),
+	('fop-arg',	'rdev',			'dev_t',			'rdev'),
+	('fop-arg',	'umask',		'mode_t',			'umask',	'nosync'),
+	('extra',	'iatt',			'struct iatt',		'&iatt'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'inode',		'inode_t *'),
 	('cbk-arg',	'buf',			'struct iatt *'),
 	('cbk-arg',	'preparent',	'struct iatt *'),
 	('cbk-arg',	'postparent',	'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'entry-op'),
 )
 
 ops['readlink'] = (
@@ -217,12 +294,13 @@ ops['access'] = (
 )
 
 ops['ftruncate'] = (
-	('fop-arg',	'fd',			'fd_t *'),
-	('fop-arg',	'offset',		'off_t'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'fd',			'fd_t *',				'fd'),
+	('fop-arg',	'offset',		'off_t',				'offset'),
+	('fop-arg',	'xdata',		'dict_t *',				'xdata'),
 	('cbk-arg',	'prebuf',		'struct iatt *'),
 	('cbk-arg',	'postbuf',		'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'fd-op'),
 )
 
 ops['getxattr'] = (
@@ -234,35 +312,39 @@ ops['getxattr'] = (
 )
 
 ops['xattrop'] = (
-	('fop-arg',	'loc',			'loc_t *'),
-	('fop-arg',	'flags',		'gf_xattrop_flags_t'),
-	('fop-arg',	'dict',			'dict_t *'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'loc',			'loc_t *',				'loc'),
+	('fop-arg',	'flags',		'gf_xattrop_flags_t',	'optype'),
+	('fop-arg',	'dict',			'dict_t *',				'xattr'),
+	('fop-arg',	'xdata',		'dict_t *',				'xdata'),
 	('cbk-arg',	'dict',			'dict_t *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'inode-op'),
 )
 
 ops['fxattrop'] = (
-	('fop-arg',	'fd',			'fd_t *'),
-	('fop-arg',	'flags',		'gf_xattrop_flags_t'),
-	('fop-arg',	'dict',			'dict_t *'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'fd',			'fd_t *',				'fd'),
+	('fop-arg',	'flags',		'gf_xattrop_flags_t',	'optype'),
+	('fop-arg',	'dict',			'dict_t *',				'xattr'),
+	('fop-arg',	'xdata',		'dict_t *',				'xdata'),
 	('cbk-arg',	'dict',			'dict_t *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'fd-op'),
 )
 
 ops['removexattr'] = (
-	('fop-arg',	'loc',			'loc_t *'),
-	('fop-arg',	'name',			'const char *'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'loc',			'loc_t *',			'loc'),
+	('fop-arg',	'name',			'const char *',		'name'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'inode-op'),
 )
 
 ops['fremovexattr'] = (
-	('fop-arg',	'fd',			'fd_t *'),
-	('fop-arg',	'name',			'const char *'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'fd',			'fd_t *',			'fd'),
+	('fop-arg',	'name',			'const char *',		'name'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'fd-op'),
 )
 
 ops['lk'] = (
@@ -341,22 +423,26 @@ ops['readdirp'] = (
 )
 
 ops['setattr'] = (
-	('fop-arg',	'loc',			'loc_t *'),
-	('fop-arg',	'stbuf',		'struct iatt *'),
-	('fop-arg',	'valid',		'int32_t'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'loc',			'loc_t *',			'loc'),
+	('fop-arg',	'stbuf',		'struct iatt *',	'stat'),
+	('fop-arg',	'valid',		'int32_t',			'valid'),
+	('extra',	'preop',		'struct iatt',		'&preop'),
+	('extra',	'postop',		'struct iatt',		'&postop'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'statpre',		'struct iatt *'),
 	('cbk-arg',	'statpost',		'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'inode-op'),
 )
 
 ops['truncate'] = (
-	('fop-arg',	'loc',			'loc_t *'),
-	('fop-arg',	'offset',		'off_t'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'loc',			'loc_t *',			'loc'),
+	('fop-arg',	'offset',		'off_t',			'offset'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'prebuf',		'struct iatt *'),
 	('cbk-arg',	'postbuf',		'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'inode-op'),
 )
 
 ops['stat'] = (
@@ -378,45 +464,51 @@ ops['lookup'] = (
 )
 
 ops['fsetattr'] = (
-	('fop-arg',	'fd',			'fd_t *'),
-	('fop-arg',	'stbuf',		'struct iatt *'),
-	('fop-arg',	'valid',		'int32_t'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'fd',			'fd_t *',			'fd'),
+	('fop-arg',	'stbuf',		'struct iatt *',	'stat'),
+	('fop-arg',	'valid',		'int32_t',			'valid'),
+	('extra',	'preop',		'struct iatt',		'&preop'),
+	('extra',	'postop',		'struct iatt',		'&postop'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'statpre',		'struct iatt *'),
 	('cbk-arg',	'statpost',		'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'fd-op'),
 )
 
 ops['fallocate'] = (
-	('fop-arg',	'fd',			'fd_t *'),
-	('fop-arg',	'keep_size',	'int32_t'),
-	('fop-arg',	'offset',		'off_t'),
-	('fop-arg',	'len',			'size_t'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'fd',			'fd_t *',			'fd'),
+	('fop-arg',	'keep_size',	'int32_t',			'mode'),
+	('fop-arg',	'offset',		'off_t',			'offset'),
+	('fop-arg',	'len',			'size_t',			'size'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'pre',			'struct iatt *'),
 	('cbk-arg',	'post',			'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'fd-op'),
 )
 
 ops['discard'] = (
-	('fop-arg',	'fd',			'fd_t *'),
-	('fop-arg',	'offset',		'off_t'),
-	('fop-arg',	'len',			'size_t'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'fd',			'fd_t *',			'fd'),
+	('fop-arg',	'offset',		'off_t',			'offset'),
+	('fop-arg',	'len',			'size_t',			'size'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'pre',			'struct iatt *'),
 	('cbk-arg',	'post',			'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'fd-op'),
 )
 
 ops['zerofill'] = (
-	('fop-arg',	'fd',			'fd_t *'),
-	('fop-arg',	'offset',		'off_t'),
+	('fop-arg',	'fd',			'fd_t *',			'fd'),
+	('fop-arg',	'offset',		'off_t',			'offset'),
 	# As e.g. fallocate/discard (above) "len" should really be a size_t.
-	('fop-arg',	'len',			'off_t'),
-	('fop-arg',	'xdata',		'dict_t *'),
+	('fop-arg',	'len',			'off_t',			'size'),
+	('fop-arg',	'xdata',		'dict_t *',			'xdata'),
 	('cbk-arg',	'pre',			'struct iatt *'),
 	('cbk-arg',	'post',			'struct iatt *'),
 	('cbk-arg',	'xdata',		'dict_t *'),
+	('journal',	'fd-op'),
 )
 
 ops['ipc'] = (
@@ -460,6 +552,11 @@ def get_subs (names, types):
 
 def generate (tmpl, name, subs):
 	text = tmpl.replace("@NAME@",name)
+	if name == "writev":
+		# More spurious inconsistency.
+		text = text.replace("@UPNAME@","WRITE")
+	else:
+		text = text.replace("@UPNAME@",name.upper())
 	for old, new in subs[name].iteritems():
 		text = text.replace(old,new)
 	# TBD: reindent/reformat the result for maximum readability.
diff --git a/libglusterfs/src/iobuf.c b/libglusterfs/src/iobuf.c
index a4d36691cd0..d1eb0acaf5e 100644
--- a/libglusterfs/src/iobuf.c
+++ b/libglusterfs/src/iobuf.c
@@ -1014,7 +1014,7 @@ int
 iobref_merge (struct iobref *to, struct iobref *from)
 {
         int           i = 0;
-        int           ret = -1;
+        int           ret = 0;
         struct iobuf *iobuf = NULL;
 
         GF_VALIDATE_OR_GOTO ("iobuf", to, out);
diff --git a/libglusterfs/src/syscall.c b/libglusterfs/src/syscall.c
index eb0c1cf983a..d412b4d656d 100644
--- a/libglusterfs/src/syscall.c
+++ b/libglusterfs/src/syscall.c
@@ -588,7 +588,7 @@ sys_fallocate(int fd, int mode, off_t offset, off_t len)
         return posix_fallocate(fd, offset, len);
 #endif
 
-#if defined(F_ALLOCATECONFIG) && defined(GF_DARWIN_HOST_OS)
+#if defined(F_ALLOCATECONTIG) && defined(GF_DARWIN_HOST_OS)
         /* C conversion from C++ implementation for OSX by Mozilla Foundation */
         if (mode) {
                 /* keep size not supported */
diff --git a/tests/features/fdl-overflow.t b/tests/features/fdl-overflow.t
new file mode 100644
index 00000000000..d7633a7ca7d
--- /dev/null
+++ b/tests/features/fdl-overflow.t
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+log_base=$($CLI --print-logdir)
+log_id=${B0}/${V0}-0
+log_id=${log_id:1}     # Remove initial slash
+log_id=${log_id//\//-} # Replace remaining slashes with dashes
+
+_check_sizes () {
+	# Pass only if there are exactly two meta and two data journals, and
+	# the data journal sizes add up to what dd wrote (details below).
+	local n=0 sz name total_sz=0
+
+	# We don't care about the sizes of the meta files.  That would be
+	# embedding too much of the implementation into the test.
+	n=$(ls ${log_base}/${log_id}-meta-*.jnl | wc -l)
+	[ $n = 2 ] || return 1
+
+	# We *do* care about the sizes of the data files, which should exactly
+	# reflect the amount of data written via dd.
+	n=0
+	while read sz name; do
+		G_LOG "found journal ${name} size ${sz}MB"
+		n=$((n+1))
+		total_sz=$((total_sz+sz))
+	done < <(du -sm ${log_base}/${log_id}-data-*.jnl)
+	[ $n = 2 ] || return 1
+	# On our CentOS and NetBSD regression-test systems, but not on my Fedora
+	# development system, each file ends up being slightly larger than its
+	# data size because of metadata, and 'du' rounds that up to a full extra
+	# megabyte.  We'll allow either result, because what we're really
+	# looking for is a complete failure to roll over from one file to
+	# another at the appropriate size.
+	[ $total_sz = 20 -o $total_sz = $((n+20)) ] || return 1
+
+	return 0
+}
+
+check_sizes () {
+	set -x			# trace _check_sizes to make failures debuggable
+	_check_sizes
+	local ret=$?		# save the status before 'set +x' overwrites $?
+	set +x
+	return $ret
+}
+
+if [ x"$OSTYPE" = x"NetBSD" ]; then
+        CREAT_OFLAG="creat,"
+else
+        CREAT_OFLAG=""
+fi
+
+TEST rm -f ${log_base}/${log_id}-*.jnl  # stale journals would skew the counts
+TEST glusterd
+TEST pidof glusterd
+
+# Get a simple volume set up and mounted with FDL active.
+TEST $CLI volume create $V0 ${H0}:${B0}/${V0}-0
+TEST $CLI volume set $V0 changelog.changelog off
+TEST $CLI volume set $V0 features.fdl on
+TEST $CLI volume start $V0
+TEST $GFS -s $H0 --volfile-id $V0 $M0
+
+# Generate some I/O and unmount/stop so we can see log sizes.
+TEST dd if=/dev/zero of=$M0/twentyMB bs=1048576 count=20 \
+     oflag=${CREAT_OFLAG}sync
+TEST umount $M0
+TEST $CLI volume stop $V0
+
+TEST _check_sizes
+
+cleanup
diff --git a/tests/features/fdl.t b/tests/features/fdl.t
new file mode 100644
index 00000000000..34d6d78228a
--- /dev/null
+++ b/tests/features/fdl.t
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+log_base=$($CLI --print-logdir)
+log_id=${B0}/${V0}-0
+log_id=${log_id:1}     # Remove initial slash
+log_id=${log_id//\//-} # Replace remaining slashes with dashes
+FDL_META_FILE=${log_base}/${log_id}-meta-1.jnl
+FDL_DATA_FILE=${log_base}/${log_id}-data-1.jnl
+
+check_logfile() {	# true if at least $2 dumped lines match pattern $1
+	[ $(gf_logdump $FDL_META_FILE $FDL_DATA_FILE | grep -c $1) -ge $2 ]
+}
+
+if [ x"$OSTYPE" = x"NetBSD" ]; then
+        CREAT_OFLAG="creat,"
+else
+        CREAT_OFLAG=""
+fi
+
+TEST rm -f $FDL_META_FILE $FDL_DATA_FILE
+TEST glusterd
+TEST pidof glusterd
+
+# Get a simple volume set up and mounted with FDL active.
+TEST $CLI volume create $V0 ${H0}:${B0}/${V0}-0
+TEST $CLI volume set $V0 changelog.changelog off
+TEST $CLI volume set $V0 features.fdl on
+TEST $CLI volume start $V0
+TEST $GFS -s $H0 --volfile-id $V0 $M0
+
+# Generate some I/O and unmount.
+TEST mkdir -p $M0/abc/def
+TEST dd if=/dev/zero of=$M0/abc/def/ghi bs=128 count=2 \
+     oflag=${CREAT_OFLAG}sync
+TEST chmod 314 $M0/abc/def/ghi
+TEST rm -rf $M0/abc
+TEST umount $M0
+
+# Check that gf_logdump works, and shows the ops we just issued.  There will be
+# more SETATTR ops than the one corresponding to our chmod, because some are
+# issued internally.  We have to guess a bit about where the log will be.
+TEST check_logfile GF_FOP_MKDIR 2
+TEST check_logfile GF_FOP_CREATE 1
+TEST check_logfile GF_FOP_WRITE 2
+TEST check_logfile GF_FOP_SETATTR 1
+TEST check_logfile GF_FOP_UNLINK 1
+TEST check_logfile GF_FOP_RMDIR 2
+
+cleanup
diff --git a/tests/features/recon.t b/tests/features/recon.t
new file mode 100644
index 00000000000..7dda2a680e8
--- /dev/null
+++ b/tests/features/recon.t
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+. $(dirname $0)/../include.rc
+. $(dirname $0)/../volume.rc
+
+log_base=$($CLI --print-logdir)
+log_id=${B0}/${V0}-0
+log_id=${log_id:1}     # Remove initial slash
+log_id=${log_id//\//-} # Replace remaining slashes with dashes
+FDL_META_FILE=${log_base}/${log_id}-meta-1.jnl
+FDL_DATA_FILE=${log_base}/${log_id}-data-1.jnl
+
+tmpdir=$(mktemp -d -t ${0##*/}.XXXXXX)
+trap "rm -rf $tmpdir" EXIT
+
+write_file () {	# put the fixed test payload into the file named by $1
+	printf '%s\n' "peekaboo" > $1
+}
+
+TEST rm -f $FDL_META_FILE $FDL_DATA_FILE
+TEST glusterd
+TEST pidof glusterd
+
+# Get a simple volume set up and mounted with FDL active.
+TEST $CLI volume create $V0 ${H0}:${B0}/${V0}-0
+TEST $CLI volume set $V0 features.fdl on
+TEST $CLI volume start $V0
+TEST $GFS -s $H0 --volfile-id $V0 $M0
+
+# Generate some I/O and then copy off the journal files for later.
+TEST mkdir -p $M0/abc/def
+TEST write_file $M0/abc/def/ghi
+#EST chmod 314 $M0/abc/def/ghi
+cp ${FDL_META_FILE} ${FDL_DATA_FILE} ${tmpdir}
+
+# Get back to an empty state and unmount.
+TEST rm -rf $M0/abc
+TEST umount $M0
+
+# Make sure we really are in an empty state.  Otherwise the tests below could
+# pass just because we never cleaned up in the first place.
+TEST [ ! -d ${B0}/${V0}-0/abc ]
+
+# Create a stub volfile.
+vol_file=${GLUSTERD_WORKDIR}/vols/${V0}/${V0}.${H0}.${log_id}.vol
+vol_id_line=$(grep volume-id ${vol_file})
+cat > ${tmpdir}/recon.vol << EOF
+volume recon-posix
+    type storage/posix
+    option directory ${B0}/${V0}-0
+${vol_id_line}
+end-volume
+EOF
+
+TEST gf_recon ${tmpdir}/recon.vol ${tmpdir}/$(basename ${FDL_META_FILE}) \
+				  ${tmpdir}/$(basename ${FDL_DATA_FILE})
+
+TEST [ -d ${B0}/${V0}-0/abc/def ]
+EXPECT "peekaboo" cat ${B0}/${V0}-0/abc/def/ghi
+# TBD: test permissions, xattrs
+
+cleanup
diff --git a/tests/include.rc b/tests/include.rc
index 139bc03ac8c..21a69465797 100644
--- a/tests/include.rc
+++ b/tests/include.rc
@@ -136,7 +136,7 @@ function G_LOG()
         return
      fi
      local g_log_string;
-     g_log_string="++++++++++ G_LOG:$0: TEST: $1 $@ ++++++++++"
+     g_log_string="++++++++++ G_LOG:$0: TEST: $@ ++++++++++"
      g_log_string="`date -u +["%F %T.%6N"]`:$g_log_string"
      local g_log_filename
      for  g_log_filename in `find $g_log_logdir/ -type f -name \*.log`;
@@ -541,10 +541,10 @@ function cleanup()
         fi >&2
 
         # tar logs at the start and end of every test
-        if [ -n $LOGDIR ]
+        if [ -n "$LOGDIR" -a -z "$STOP_WASTING_SPACE" ]
         then
                 tarname=$(basename $0 .t)
-                tar -rvf ${LOGDIR}/${tarname}.tar ${LOGDIR}/* \
+                tar -rf ${LOGDIR}/${tarname}.tar ${LOGDIR}/* \
                         --exclude="*.tar" \
                         && \
                 find $LOGDIR/* -maxdepth 0 -name '*.tar' -prune \
diff --git a/xlators/experimental/Makefile.am b/xlators/experimental/Makefile.am
index 06f04a193c8..a31512203f6 100644
--- a/xlators/experimental/Makefile.am
+++ b/xlators/experimental/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = nsr-client nsr-server
+SUBDIRS = nsr-client nsr-server fdl
 
 CLEANFILES =
diff --git a/xlators/experimental/fdl/Makefile.am b/xlators/experimental/fdl/Makefile.am
new file mode 100644
index 00000000000..a985f42a877
--- /dev/null
+++ b/xlators/experimental/fdl/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/experimental/fdl/src/Makefile.am b/xlators/experimental/fdl/src/Makefile.am
new file mode 100644
index 00000000000..a05fc797b0a
--- /dev/null
+++ b/xlators/experimental/fdl/src/Makefile.am
@@ -0,0 +1,42 @@
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/experimental
+xlator_LTLIBRARIES = fdl.la
+
+noinst_HEADERS = jnl-types.h
+
+nodist_fdl_la_SOURCES = fdl.c
+fdl_la_LDFLAGS = -module -avoid-version
+fdl_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+sbin_PROGRAMS = gf_logdump gf_recon
+gf_logdump_SOURCES = logdump.c
+nodist_gf_logdump_SOURCES = libfdl.c
+gf_logdump_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la\
+                   $(top_builddir)/api/src/libgfapi.la
+
+# Eventually recon(ciliation) code will move elsewhere, but for now it's
+# easier to have it next to the similar logdump code.
+gf_recon_SOURCES = recon.c
+nodist_gf_recon_SOURCES = librecon.c
+gf_recon_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la\
+                   $(top_builddir)/api/src/libgfapi.la
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+	      -I$(top_srcdir)/api/src -fPIC \
+	      -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) \
+	      -DDATADIR=\"$(localstatedir)\"
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+noinst_PYTHON = gen_fdl.py gen_dumper.py gen_recon.py
+EXTRA_DIST = fdl-tmpl.c dump-tmpl.c recon-tmpl.c
+
+# Every generated source (fdl.c, libfdl.c, librecon.c) must be removed by
+# "make clean", otherwise stale generated code survives template changes.
+CLEANFILES = $(nodist_fdl_la_SOURCES) $(nodist_gf_logdump_SOURCES) \
+	     $(nodist_gf_recon_SOURCES)
+
+fdl.c: fdl-tmpl.c gen_fdl.py
+	$(PYTHON) $(srcdir)/gen_fdl.py $(srcdir)/fdl-tmpl.c > $@
+
+libfdl.c: dump-tmpl.c gen_dumper.py
+	$(PYTHON) $(srcdir)/gen_dumper.py $(srcdir)/dump-tmpl.c > $@
+
+librecon.c: recon-tmpl.c gen_recon.py
+	$(PYTHON) $(srcdir)/gen_recon.py $(srcdir)/recon-tmpl.c > $@
diff --git a/xlators/experimental/fdl/src/dump-tmpl.c b/xlators/experimental/fdl/src/dump-tmpl.c
new file mode 100644
index 00000000000..cac1071a9c1
--- /dev/null
+++ b/xlators/experimental/fdl/src/dump-tmpl.c
@@ -0,0 +1,156 @@
+#pragma fragment PROLOG
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glfs.h"
+#include "iatt.h"
+#include "xlator.h"
+#include "jnl-types.h"
+
+#pragma fragment DICT
+        /*
+         * Dump a dict argument.  The metadata stream holds a run of entries,
+         * each laid out as <int key_len><key bytes><int data_len><data bytes>,
+         * and the run ends with a key_len of zero.  Only the key and the
+         * value length are printed; the value bytes are skipped.
+         * NOTE(review): the key is printed with %s, so it is presumably
+         * stored NUL-terminated - confirm against the serializer.
+         */
+        {
+                int key_len, data_len;
+                char *key_ptr;
+                printf ("@ARGNAME@ = dict {\n");
+                for (;;) {
+                        key_len = *((int *)new_meta);
+                        new_meta += sizeof(int);
+                        if (!key_len) {
+                                break;
+                        }
+                        key_ptr = new_meta;
+                        new_meta += key_len;
+                        data_len = *((int *)new_meta);
+                        new_meta += sizeof(int) + data_len;
+                        printf (" %s = <%d bytes>\n", key_ptr, data_len);
+                }
+                printf ("}\n");
+        }
+
+#pragma fragment DOUBLE
+        /*
+         * Dump one 64-bit scalar from the metadata stream and step past it.
+         * The value is passed twice because the per-type format string
+         * supplied by the generator contains two conversions.
+         */
+        printf ("@ARGNAME@ = @FORMAT@\n", *((uint64_t *)new_meta),
+                *((uint64_t *)new_meta));
+        new_meta += sizeof(uint64_t);
+
+#pragma fragment GFID
+        /* Dump one gfid (formatted by uuid_utoa) and skip its 16 bytes. */
+        printf ("@ARGNAME@ = <gfid %s>\n", uuid_utoa(*((uuid_t *)new_meta)));
+        new_meta += 16;
+
+#pragma fragment INTEGER
+        /*
+         * Dump one 32-bit scalar from the metadata stream and step past it.
+         * The value is passed twice because the per-type format string
+         * supplied by the generator contains two conversions.
+         */
+        printf ("@ARGNAME@ = @FORMAT@\n", *((uint32_t *)new_meta),
+                *((uint32_t *)new_meta));
+        new_meta += sizeof(uint32_t);
+
+#pragma fragment LOC
+        /*
+         * Dump a loc argument: a 16-byte gfid, a 16-byte parent gfid, then a
+         * one-byte flag.  When the flag is non-zero a NUL-terminated name
+         * follows and is printed; otherwise no name is present.
+         */
+        printf ("@ARGNAME@ = loc {\n");
+        printf ("  gfid = %s\n", uuid_utoa(*((uuid_t *)new_meta)));
+        new_meta += 16;
+        printf ("  pargfid = %s\n", uuid_utoa(*((uuid_t *)new_meta)));
+        new_meta += 16;
+        if (*(new_meta++)) {
+                printf ("  name = %s\n", new_meta);
+                new_meta += (strlen(new_meta) + 1);
+        }
+        printf ("}\n");
+
+#pragma fragment STRING
+        /*
+         * Dump a string argument: a one-byte flag, followed (only when the
+         * flag is non-zero) by a NUL-terminated string.  Nothing is printed
+         * when the flag is zero.
+         */
+        if (*(new_meta++)) {
+                printf ("@ARGNAME@ = %s\n", new_meta);
+                new_meta += (strlen(new_meta) + 1);
+        }
+
+#pragma fragment VECTOR
+        /*
+         * Dump a vector argument.  Its byte count is a size_t in the metadata
+         * stream; the payload itself lives in the data stream, which is
+         * advanced by that count without being printed.
+         */
+        {
+                size_t len = *((size_t *)new_meta);
+                new_meta += sizeof(len);
+                printf ("@ARGNAME@ = <%zu bytes>\n", len);
+                new_data += len;
+        }
+
+#pragma fragment IATT
+        /*
+         * Dump the journaled subset of an iatt: an ia_prot_t followed by six
+         * 32-bit words (uid, gid, two words for atime, two words for mtime).
+         * The protection bits are rendered as a 12-character flag string:
+         * suid/sgid/sticky first, then rwx for owner, group and other.
+         */
+        {
+                ia_prot_t *myprot = ((ia_prot_t *)new_meta);
+                printf ("@ARGNAME@ = iatt {\n");
+                printf ("  ia_prot = %c%c%c",
+                        myprot->suid ? 'S' : '-',
+                        myprot->sgid ? 'S' : '-',
+                        myprot->sticky ? 'T' : '-');
+                printf ("%c%c%c",
+                        myprot->owner.read ? 'r' : '-',
+                        myprot->owner.write ? 'w' : '-',
+                        myprot->owner.exec ? 'x' : '-');
+                printf ("%c%c%c",
+                        myprot->group.read ? 'r' : '-',
+                        myprot->group.write ? 'w' : '-',
+                        myprot->group.exec ? 'x' : '-');
+                printf ("%c%c%c\n",
+                        myprot->other.read ? 'r' : '-',
+                        myprot->other.write ? 'w' : '-',
+                        myprot->other.exec ? 'x' : '-');
+                new_meta += sizeof(ia_prot_t);
+                /* The six integers start right after the protection bits. */
+                uint32_t *myints = (uint32_t *)new_meta;
+                printf ("  ia_uid = %u\n", myints[0]);
+                printf ("  ia_gid = %u\n", myints[1]);
+                printf ("  ia_atime = %u.%09u\n", myints[2], myints[3]);
+                printf ("  ia_mtime = %u.%09u\n", myints[4], myints[5]);
+                new_meta += sizeof(*myints) * 6;
+        }
+
+#pragma fragment FOP
+/*
+ * Per-fop dump function; one copy is generated for every journaled fop.
+ * The generator replaces the function-body placeholder with one type-specific
+ * fragment per fop argument.  Those fragments advance the local new_meta and
+ * new_data cursors, which are written back to the caller's pointers on exit.
+ */
+void
+fdl_dump_@NAME@ (char **old_meta, char **old_data)
+{
+        char    *new_meta	= *old_meta;
+        char	*new_data	= *old_data;
+
+        /* TBD: word size/endianness */
+@FUNCTION_BODY@
+
+        *old_meta = new_meta;
+        *old_data = new_data;
+}
+
+#pragma fragment CASE
+        /* One switch arm per journaled fop: print a banner, then dump it. */
+        case GF_FOP_@UPNAME@:
+                printf ("=== GF_FOP_@UPNAME@\n");
+                fdl_dump_@NAME@ (&new_meta, &new_data);
+                break;
+
+#pragma fragment EPILOG
+/*
+ * Dump one journal event.
+ *
+ * Reads the event header at *old_meta, dispatches on its fop_type to the
+ * matching generated per-fop dump function, and advances *old_meta and
+ * *old_data past whatever was consumed.
+ *
+ * Returns 1 when the fop type was recognized and 0 otherwise.  In the
+ * unrecognized case only the header has been consumed.
+ */
+int
+fdl_dump (char **old_meta, char **old_data)
+{
+        char            *new_meta       = *old_meta;
+        char            *new_data       = *old_data;
+        static glfs_t   *fs             = NULL;
+        int             recognized      = 1;
+        event_header_t  *eh;
+
+        /*
+         * We don't really call anything else in GFAPI, but this is the most
+         * convenient way to satisfy all of the spurious dependencies on how it
+         * or glusterfsd initialize (e.g. setting up THIS).
+         */
+        if (!fs) {
+                fs = glfs_new ("dummy");
+        }
+
+        eh = (event_header_t *)new_meta;
+        new_meta += sizeof (*eh);
+
+        /* TBD: check event_type instead of assuming NEW_REQUEST */
+
+        switch (eh->fop_type) {
+@SWITCH_BODY@
+
+        default:
+                printf ("unknown fop %u\n", eh->fop_type);
+                recognized = 0;
+        }
+
+        /* Publish the advanced cursors back to the caller. */
+        *old_meta = new_meta;
+        *old_data = new_data;
+        return recognized;
+}
diff --git a/xlators/experimental/fdl/src/fdl-tmpl.c b/xlators/experimental/fdl/src/fdl-tmpl.c
new file mode 100644
index 00000000000..8fcc6a8d6ff
--- /dev/null
+++ b/xlators/experimental/fdl/src/fdl-tmpl.c
@@ -0,0 +1,506 @@
+/*
+  Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
+  This file is part of GlusterFS.
+
+  This file is licensed to you under your choice of the GNU Lesser
+  General Public License, version 3 or any later version (LGPLv3 or
+  later), or the GNU General Public License, version 2 (GPLv2), in all
+  cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include "call-stub.h"
+#include "iatt.h"
+#include "defaults.h"
+#include "syscall.h"
+#include "xlator.h"
+#include "jnl-types.h"
+
+/* TBD: make tunable */
+#define META_FILE_SIZE  (1 << 20)
+#define DATA_FILE_SIZE  (1 << 24)
+
+/*
+ * Memory-accounting types private to this translator, numbered after the
+ * common ones.  gf_fdl_mt_end marks the end of the list and is what
+ * mem_acct_init uses to size the accounting table.
+ */
+enum gf_fdl {
+        gf_fdl_mt_fdl_private_t = gf_common_mt_end + 1,
+        gf_fdl_mt_end
+};
+
+/* State for one journal file (there is one for metadata, one for data). */
+typedef struct {
+        char            *type;          /* "meta" or "data"; used in the file
+                                         * name and in log messages */
+        off_t           size;           /* size the file is extended to and
+                                         * the length of the mapping */
+        char            *path;          /* allocated file name, NULL if closed */
+        int             fd;             /* open descriptor, -1 if closed */
+        void *          ptr;            /* start of the mmap'ed region, or NULL */
+        off_t           max_offset;     /* bytes written so far; the file is
+                                         * truncated to this length on close */
+} log_obj_t;
+
+/* Per-translator state, stored in this->private. */
+typedef struct {
+        struct list_head        reqs;           /* queue of stubs awaiting
+                                                 * journaling */
+        pthread_mutex_t         req_lock;       /* protects reqs */
+        pthread_cond_t          req_cond;       /* wakes the worker thread */
+        char                    *log_dir;       /* value of "log-path" option */
+        pthread_t               worker;         /* thread running fdl_worker */
+        gf_boolean_t            should_stop;    /* set by fdl_fini to ask the
+                                                 * worker to exit */
+        gf_boolean_t            change_term;    /* set by fdl_ipc to ask the
+                                                 * worker to start a new term */
+        log_obj_t               meta_log;       /* metadata journal */
+        log_obj_t               data_log;       /* data journal */
+        int                     term;           /* current (latest) term */
+        int                     first_term;     /* term the worker started in */
+} fdl_private_t;
+
+/*
+ * Hand a fop over to the journal worker: append its stub to the request
+ * queue while holding req_lock, then signal req_cond so the worker (which
+ * waits on that condition) notices the new entry.
+ */
+void
+fdl_enqueue (xlator_t *this, call_stub_t *stub)
+{
+        fdl_private_t   *fdl    = this->private;
+        pthread_mutex_t *qlock  = &fdl->req_lock;
+
+        pthread_mutex_lock (qlock);
+        {
+                list_add_tail (&stub->list, &fdl->reqs);
+        }
+        pthread_mutex_unlock (qlock);
+
+        /* The signal is sent after the lock has been dropped. */
+        pthread_cond_signal (&fdl->req_cond);
+}
+
+#pragma generate
+
+/*
+ * Create, size and map the journal file for one term.
+ *
+ * The file is named <log_dir>/<ident>-<type>-<term>.jnl ("fubar" replaces
+ * the ident when no log_ident was given).  It is created/truncated, extended
+ * to obj->size bytes and mapped shared for writing.
+ *
+ * On success obj->path, obj->fd and obj->ptr describe the new journal,
+ * obj->max_offset is reset to zero and the mapping address is returned.
+ * On any failure the descriptor and path are released and NULL is returned;
+ * callers only ever test the result against NULL.
+ */
+char *
+fdl_open_term_log (xlator_t *this, log_obj_t *obj, int term)
+{
+        fdl_private_t   *priv   = this->private;
+        int             ret;
+        char *          ptr     = NULL;
+
+        /*
+         * Use .jnl instead of .log so that we don't get test info (mistakenly)
+         * appended to our journal files.
+         */
+        if (this->ctx->cmd_args.log_ident) {
+                ret = gf_asprintf (&obj->path, "%s/%s-%s-%d.jnl",
+                                   priv->log_dir, this->ctx->cmd_args.log_ident,
+                                   obj->type, term);
+        }
+        else {
+                ret = gf_asprintf (&obj->path, "%s/fubar-%s-%d.jnl",
+                                   priv->log_dir, obj->type, term);
+        }
+        if ((ret <= 0) || !obj->path) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to construct log-file path");
+                goto err;
+        }
+
+        /* off_t is not necessarily long; cast to match the %ld conversion. */
+        gf_log (this->name, GF_LOG_INFO, "opening %s (size %ld)",
+                obj->path, (long) obj->size);
+
+        obj->fd = open (obj->path, O_RDWR|O_CREAT|O_TRUNC, 0666);
+        if (obj->fd < 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to open log file (%s)", strerror(errno));
+                goto err;
+        }
+
+#if !defined(GF_BSD_HOST_OS)
+        /*
+         * NetBSD can just go die in a fire.  Even though it claims to support
+         * fallocate/posix_fallocate they don't actually *do* anything so the
+         * file size remains zero.  Then mmap succeeds anyway, but any access
+         * to the mmap'ed region will segfault.  It would be acceptable for
+         * fallocate to do what it says, for mmap to fail, or for access to
+         * extend the file.  NetBSD managed to hit the trifecta of Getting
+         * Everything Wrong, and debugging in that environment to get this far
+         * has already been painful enough (systems I worked on in 1990 were
+         * better that way).  We'll fall through to the lseek/write method, and
+         * performance will be worse, and TOO BAD.
+         */
+        if (sys_fallocate(obj->fd,0,0,obj->size) < 0)
+#endif
+        {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "failed to fallocate space for log file");
+                /*
+                 * Have to do this the ugly page-faulty way.  If even this
+                 * fails the file stays short and touching the mapping beyond
+                 * its end would fault, so give up instead of mapping it.
+                 */
+                if ((sys_lseek (obj->fd, obj->size-1, SEEK_SET) < 0) ||
+                    (sys_write (obj->fd, "", 1) != 1)) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to extend log file (%s)",
+                                strerror(errno));
+                        goto err;
+                }
+        }
+
+        ptr = mmap (NULL, obj->size, PROT_WRITE, MAP_SHARED, obj->fd, 0);
+        if (ptr == MAP_FAILED) {
+                gf_log (this->name, GF_LOG_ERROR, "failed to mmap log (%s)",
+                        strerror(errno));
+                /*
+                 * MAP_FAILED is not NULL.  Returning it would make every
+                 * caller believe the journal had been opened.
+                 */
+                ptr = NULL;
+                goto err;
+        }
+
+        obj->ptr = ptr;
+        obj->max_offset = 0;
+        return ptr;
+
+err:
+        if (obj->fd >= 0) {
+                sys_close (obj->fd);
+                obj->fd = (-1);
+        }
+        if (obj->path) {
+                GF_FREE (obj->path);
+                obj->path = NULL;
+        }
+        return NULL;
+}
+
+/*
+ * Tear down one journal: unmap it, cut the file down to the number of bytes
+ * actually written (max_offset), close the descriptor and free the path.
+ * Each step is skipped when the corresponding resource is not held, so this
+ * is safe to call on a journal that was never (or only partly) opened.
+ */
+void
+fdl_close_term_log (xlator_t *this, log_obj_t *obj)
+{
+        fdl_private_t   *priv           = this->private;
+        int             jnl_fd          = obj->fd;
+
+        if (obj->ptr != NULL) {
+                (void) munmap (obj->ptr, obj->size);
+                obj->ptr = NULL;
+        }
+
+        if (jnl_fd >= 0) {
+                gf_log (this->name, GF_LOG_INFO,
+                        "truncating term %d %s journal to %ld",
+                        priv->term, obj->type, obj->max_offset);
+                /* A failed truncate is only worth a warning. */
+                if (sys_ftruncate(jnl_fd,obj->max_offset) < 0) {
+                        gf_log (this->name, GF_LOG_WARNING,
+                                "failed to truncate journal (%s)",
+                                strerror(errno));
+                }
+                sys_close (jnl_fd);
+                obj->fd = (-1);
+        }
+
+        if (obj->path != NULL) {
+                GF_FREE (obj->path);
+                obj->path = NULL;
+        }
+}
+
+/*
+ * Move on to the next term: close both current journals, bump the term
+ * number and open a fresh meta/data pair for it.
+ *
+ * *meta_ptr and *data_ptr receive the new mapping addresses.  Returns
+ * _gf_true when both journals were opened, _gf_false as soon as one open
+ * fails (in which case *data_ptr is left untouched if the meta open failed).
+ */
+gf_boolean_t
+fdl_change_term (xlator_t *this, char **meta_ptr, char **data_ptr)
+{
+        fdl_private_t   *priv           = this->private;
+        log_obj_t       *meta           = &priv->meta_log;
+        log_obj_t       *data           = &priv->data_log;
+
+        fdl_close_term_log (this, meta);
+        fdl_close_term_log (this, data);
+
+        priv->term += 1;
+
+        *meta_ptr = fdl_open_term_log (this, meta, priv->term);
+        if (*meta_ptr == NULL) {
+                return _gf_false;
+        }
+
+        *data_ptr = fdl_open_term_log (this, data, priv->term);
+        return (*data_ptr != NULL) ? _gf_true : _gf_false;
+}
+
+/*
+ * Journal worker thread.
+ *
+ * Opens the meta and data journals for the first term, then loops: take the
+ * next queued stub, start a new term if either journal lacks room for it,
+ * serialize it into the mapped journals, msync the touched pages, and
+ * finally resume the stub so the fop can proceed.
+ *
+ * While the queue is empty the thread honours two requests posted by other
+ * threads: should_stop (exit) and change_term (rotate the journals).  Both
+ * flags are examined *before* going to sleep as well as after every wakeup,
+ * so a request posted (and signalled) while the worker was busy with a stub
+ * is acted upon instead of being slept through.
+ *
+ * err_label records whether req_lock is currently held, so that the single
+ * exit path knows whether to release it.  The thread always returns NULL.
+ */
+void *
+fdl_worker (void *arg)
+{
+        xlator_t        *this           = arg;
+        fdl_private_t   *priv           = this->private;
+        call_stub_t     *stub;
+        char *          meta_ptr        = NULL;
+        off_t           *meta_offset    = &priv->meta_log.max_offset;
+        char *          data_ptr        = NULL;
+        off_t           *data_offset    = &priv->data_log.max_offset;
+        unsigned long   base_as_ul;
+        void *          msync_ptr;
+        size_t          msync_len;
+        gf_boolean_t    recycle;
+        void            *err_label      = &&err_unlocked;
+
+        priv->meta_log.type = "meta";
+        priv->meta_log.size = META_FILE_SIZE;
+        priv->meta_log.path = NULL;
+        priv->meta_log.fd = (-1);
+        priv->meta_log.ptr = NULL;
+
+        priv->data_log.type = "data";
+        priv->data_log.size = DATA_FILE_SIZE;
+        priv->data_log.path = NULL;
+        priv->data_log.fd = (-1);
+        priv->data_log.ptr = NULL;
+
+        /* TBD: initial term should come from persistent storage (e.g. etcd) */
+        priv->first_term = ++(priv->term);
+        meta_ptr = fdl_open_term_log (this, &priv->meta_log, priv->term);
+        if (!meta_ptr) {
+                goto *err_label;
+        }
+        data_ptr = fdl_open_term_log (this, &priv->data_log, priv->term);
+        if (!data_ptr) {
+                fdl_close_term_log (this, &priv->meta_log);
+                goto *err_label;
+        }
+
+        for (;;) {
+                pthread_mutex_lock (&priv->req_lock);
+                err_label = &&err_locked;
+                while (list_empty(&priv->reqs)) {
+                        /*
+                         * Look at the control flags first and sleep only if
+                         * there is really nothing to do.  Checking only after
+                         * pthread_cond_wait returns would miss a flag whose
+                         * signal arrived while we were not waiting.
+                         */
+                        if (priv->should_stop) {
+                                goto *err_label;
+                        }
+                        if (priv->change_term) {
+                                if (!fdl_change_term(this, &meta_ptr,
+                                                           &data_ptr)) {
+                                        goto *err_label;
+                                }
+                                priv->change_term = _gf_false;
+                                continue;
+                        }
+                        pthread_cond_wait (&priv->req_cond, &priv->req_lock);
+                }
+                stub = list_entry (priv->reqs.next, call_stub_t, list);
+                list_del_init (&stub->list);
+                pthread_mutex_unlock (&priv->req_lock);
+                err_label = &&err_unlocked;
+                /*
+                 * TBD: batch requests
+                 *
+                 * What we should do here is gather up *all* of the requests
+                 * that have accumulated since we were last at this point,
+                 * blast them all out in one big writev, and then dispatch them
+                 * all before coming back for more.  That maximizes throughput,
+                 * at some cost to latency (due to queuing effects at the log
+                 * stage).  Note that we're likely to be above io-threads, so
+                 * the dispatch itself will be parallelized (at further cost to
+                 * latency).  For now, we just do the simplest thing and handle
+                 * one request all the way through before fetching the next.
+                 *
+                 * So, why mmap/msync instead of writev/fdatasync?  Because it's
+                 * faster.  Much faster.  So much faster that I half-suspect
+                 * cheating, but it's more convenient for now than having to
+                 * ensure that everything's page-aligned for O_DIRECT (the only
+                 * alternative that still might avoid ridiculous levels of
+                 * local-FS overhead).
+                 *
+                 * TBD: check that msync really does get our data to disk.
+                 */
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "logging %u+%u bytes for op %d",
+                        stub->jnl_meta_len, stub->jnl_data_len, stub->fop);
+                /*
+                 * Start a new term when this request would run off the end of
+                 * either journal.
+                 * NOTE(review): a single request larger than a whole journal
+                 * file still would not fit after rotation; nothing here
+                 * guards against that.
+                 */
+                recycle = _gf_false;
+                if ((*meta_offset + stub->jnl_meta_len) > priv->meta_log.size) {
+                        recycle = _gf_true;
+                }
+                if ((*data_offset + stub->jnl_data_len) > priv->data_log.size) {
+                        recycle = _gf_true;
+                }
+                if (recycle && !fdl_change_term(this,&meta_ptr,&data_ptr)) {
+                        goto *err_label;
+                }
+                /*
+                 * Re-read the mapping addresses; a term change requested via
+                 * ipc may have replaced them since the last request.
+                 */
+                meta_ptr = priv->meta_log.ptr;
+                data_ptr = priv->data_log.ptr;
+                gf_log (this->name, GF_LOG_DEBUG, "serializing to %p/%p",
+                        meta_ptr + *meta_offset, data_ptr + *data_offset);
+                stub->serialize (stub, meta_ptr + *meta_offset,
+                                       data_ptr + *data_offset);
+                if (stub->jnl_meta_len > 0) {
+                        /*
+                         * msync wants a page-aligned start address: round the
+                         * start down to a 4KiB boundary and grow the length by
+                         * the bytes that rounding skipped back over.
+                         */
+                        base_as_ul = (unsigned long) (meta_ptr + *meta_offset);
+                        msync_ptr = (void *) (base_as_ul & ~0x0fff);
+                        msync_len = (size_t) (base_as_ul &  0x0fff);
+                        if (msync (msync_ptr, msync_len+stub->jnl_meta_len,
+                                              MS_SYNC) < 0) {
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "failed to log request meta (%s)",
+                                        strerror(errno));
+                        }
+                        *meta_offset += stub->jnl_meta_len;
+                }
+                if (stub->jnl_data_len > 0) {
+                        /* Same page-alignment treatment as for the meta log. */
+                        base_as_ul = (unsigned long) (data_ptr + *data_offset);
+                        msync_ptr = (void *) (base_as_ul & ~0x0fff);
+                        msync_len = (size_t) (base_as_ul &  0x0fff);
+                        if (msync (msync_ptr, msync_len+stub->jnl_data_len,
+                                              MS_SYNC) < 0) {
+                                gf_log (this->name, GF_LOG_WARNING,
+                                        "failed to log request data (%s)",
+                                        strerror(errno));
+                        }
+                        *data_offset += stub->jnl_data_len;
+                }
+                /* Journaled (or at least attempted): let the fop continue. */
+                call_resume (stub);
+        }
+
+err_locked:
+        pthread_mutex_unlock (&priv->req_lock);
+err_unlocked:
+        fdl_close_term_log (this, &priv->meta_log);
+        fdl_close_term_log (this, &priv->data_log);
+        return NULL;
+}
+
+/*
+ * ipc fop.  Two ops are answered locally:
+ *
+ *   FDL_IPC_CHANGE_TERM  flag a term change, signal the worker, unwind OK.
+ *   FDL_IPC_GET_TERMS    unwind with a dict holding the "first" and "last"
+ *                        term numbers (ENOMEM if the dict cannot be
+ *                        allocated, EIO if it cannot be filled in).
+ *
+ * Every other op is wound to the first child unchanged.  Always returns 0.
+ */
+int32_t
+fdl_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+        fdl_private_t   *priv           = this->private;
+        dict_t          *terms          = NULL;
+        int32_t         terms_err       = EIO;
+
+        if (op == FDL_IPC_CHANGE_TERM) {
+                gf_log (this->name, GF_LOG_INFO, "got CHANGE_TERM op");
+                priv->change_term = _gf_true;
+                pthread_cond_signal (&priv->req_cond);
+                STACK_UNWIND_STRICT (ipc, frame, 0, 0, NULL);
+                return 0;
+        }
+
+        if (op != FDL_IPC_GET_TERMS) {
+                /* Not one of ours. */
+                STACK_WIND_TAIL (frame,
+                                 FIRST_CHILD(this),
+                                 FIRST_CHILD(this)->fops->ipc,
+                                 op, xdata);
+                return 0;
+        }
+
+        gf_log (this->name, GF_LOG_INFO, "got GET_TERMS op");
+        terms = dict_new ();
+        if (!terms) {
+                terms_err = ENOMEM;
+        } else if ((dict_set_int32(terms,"first",priv->first_term) == 0) &&
+                   (dict_set_int32(terms,"last",priv->term) == 0)) {
+                /* Both values stored; anything less leaves EIO in place. */
+                terms_err = 0;
+        }
+
+        if (terms_err) {
+                STACK_UNWIND_STRICT (ipc, frame, -1, terms_err, NULL);
+        } else {
+                STACK_UNWIND_STRICT (ipc, frame, 0, 0, terms);
+        }
+        if (terms) {
+                dict_unref (terms);
+        }
+
+        return 0;
+}
+
+/*
+ * Translator init: allocate the private state, set up the request queue and
+ * its lock/condition, read the "log-path" option, install the hand-written
+ * ipc fop and start the journal worker thread.
+ *
+ * Returns 0 on success.  On failure everything set up so far is undone,
+ * this->private is left NULL and -1 is returned.
+ */
+int
+fdl_init (xlator_t *this)
+{
+        fdl_private_t   *priv   = NULL;
+
+        priv = GF_CALLOC (1, sizeof (*priv), gf_fdl_mt_fdl_private_t);
+        if (!priv) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to allocate fdl_private");
+                goto err;
+        }
+
+        INIT_LIST_HEAD (&priv->reqs);
+        if (pthread_mutex_init (&priv->req_lock, NULL) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to initialize req_lock");
+                goto err;
+        }
+        if (pthread_cond_init (&priv->req_cond, NULL) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to initialize req_cond");
+                goto err_mutex;
+        }
+
+        GF_OPTION_INIT ("log-path", priv->log_dir, path, err_cond);
+
+        /*
+         * fdl_worker reads this->private the moment it starts running, so
+         * the pointer has to be published *before* the thread is created.
+         */
+        this->private = priv;
+
+        if (pthread_create(&priv->worker,NULL,fdl_worker,this) != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "failed to start fdl_worker");
+                this->private = NULL;
+                goto err_cond;
+        }
+
+        /*
+         * The rest of the fop table is automatically generated, so this is a
+         * bit cleaner than messing with the generation to add a hand-written
+         * exception.
+         */
+        this->fops->ipc = fdl_ipc;
+
+        return 0;
+
+err_cond:
+        pthread_cond_destroy (&priv->req_cond);
+err_mutex:
+        pthread_mutex_destroy (&priv->req_lock);
+err:
+        if (priv) {
+                GF_FREE(priv);
+        }
+        return -1;
+}
+
+/*
+ * Translator teardown: ask the worker thread to stop, wait for it to exit,
+ * then release the synchronization objects and the private state.
+ */
+void
+fdl_fini (xlator_t *this)
+{
+        fdl_private_t   *priv   = this->private;
+
+        if (!priv) {
+                return;
+        }
+
+        /*
+         * The worker tests should_stop while holding req_lock.  Setting the
+         * flag and signalling under the same lock means the flag cannot
+         * change between the worker's queue check and its
+         * pthread_cond_wait, where the wakeup would otherwise be lost.
+         */
+        pthread_mutex_lock (&priv->req_lock);
+        priv->should_stop = _gf_true;
+        pthread_cond_signal (&priv->req_cond);
+        pthread_mutex_unlock (&priv->req_lock);
+
+        pthread_join (priv->worker, NULL);
+
+        /* The worker is gone, so nothing can be using these any more. */
+        pthread_cond_destroy (&priv->req_cond);
+        pthread_mutex_destroy (&priv->req_lock);
+
+        /* Don't leave a dangling pointer behind. */
+        this->private = NULL;
+        GF_FREE(priv);
+}
+
+/*
+ * Pick up a changed journal directory.  The key must be the one declared in
+ * the options table ("log-path"); a key that is not declared there can never
+ * match.  Always returns 0, including when the option lookup fails.
+ */
+int
+fdl_reconfigure (xlator_t *this, dict_t *options)
+{
+        fdl_private_t   *priv   = this->private;
+
+        GF_OPTION_RECONF ("log-path", priv->log_dir, options, path, out);
+        /* TBD: react if it changed */
+
+out:
+        return 0;
+}
+
+/*
+ * Set up memory accounting for this translator's allocation types.
+ * Returns -1 when "this" is NULL, otherwise the result of
+ * xlator_mem_acct_init (0 on success).
+ */
+int32_t
+mem_acct_init (xlator_t *this)
+{
+        int     ret = -1;
+
+        GF_VALIDATE_OR_GOTO ("fdl", this, out);
+
+        ret = xlator_mem_acct_init (this, gf_fdl_mt_end + 1);
+        if (ret != 0) {
+                /* One literal: the split version ran "init" into "failed". */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "Memory accounting init failed");
+        }
+out:
+        return ret;
+}
+
+/*
+ * Translator lifecycle entry points.  init/fini/reconfigure are implemented
+ * in this file; notify uses the stock default.
+ */
+class_methods_t class_methods = {
+        .init           = fdl_init,
+        .fini           = fdl_fini,
+        .reconfigure    = fdl_reconfigure,
+        .notify         = default_notify,
+};
+
+/*
+ * Options understood by this translator.  "log-path" is the directory in
+ * which the per-term journal files are created (read in fdl_init).
+ */
+struct volume_options options[] = {
+        { .key = {"log-path"},
+          .type = GF_OPTION_TYPE_PATH,
+          .default_value = DEFAULT_LOG_FILE_DIRECTORY,
+          .description = "Directory for FDL files."
+        },
+        { .key  = {NULL} },     /* end-of-table marker */
+};
+
+/* No per-fd or per-inode state is kept here, so the defaults are used. */
+struct xlator_cbks cbks = {
+        .release        = default_release,
+        .releasedir     = default_releasedir,
+        .forget         = default_forget,
+};
diff --git a/xlators/experimental/fdl/src/gen_dumper.py b/xlators/experimental/fdl/src/gen_dumper.py
new file mode 100755
index 00000000000..42db55d2cb3
--- /dev/null
+++ b/xlators/experimental/fdl/src/gen_dumper.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python
+
+import os
+import re
+import sys
+
+curdir = os.path.dirname (sys.argv[0])
+gendir = os.path.join (curdir, '../../../../libglusterfs/src')
+sys.path.append (gendir)
+from generator import ops, fop_subs, cbk_subs, generate
+
+# See the big header comment at the start of gen_fdl.py to see how the stages
+# fit together.  The big difference here is that *all* of the C code is in the
+# template file as labelled fragments, instead of as Python strings.  That
+# makes it much easier to edit in one place, with proper syntax highlighting
+# and indentation.
+#
+#   Stage 1 uses type-specific fragments to generate FUNCTION_BODY, instead of
+#   LEN_*_TEMPLATE and SERLZ_*_TEMPLATE to generate LEN_CODE and SER_CODE.
+#
+#   Stage 2 uses the FOP and CASE fragments instead of RECON_TEMPLATE and
+#   FOP_TEMPLATE.  The expanded FOP code (including FUNCTION_BODY substitution
+#   in the middle of each function) is emitted immediately; the expanded CASE
+#   code is saved for the next stage.
+#
+#   Stage 3 uses the PROLOG and EPILOG fragments, with the expanded CASE code
+#   in the middle of EPILOG, to generate the whole output file.
+#
+# Another way of looking at it is to consider how the fragments appear in
+# the final output:
+#
+#   PROLOG
+#   FOP (expanded for CREATE)
+#       FOP before FUNCTION_BODY
+#       LOC, INTEGER, GFID, etc. (one per arg, by type)
+#       FOP after FUNCTION_BODY
+#   FOP (expanded for WRITEV)
+#       FOP before FUNCTION_BODY
+#       GFID, VECTOR, etc. (one per arg, by type)
+#       FOP after FUNCTION_BODY
+#   (more FOPs)
+#   EPILOG
+#       EPILOG before CASE
+#       CASE statements (one per fop)
+#       EPILOG after CASE
+
+# Maps the C type of a fop argument (as spelled in the generator's fop
+# table) to a pair: the name of the template fragment that dumps it, and the
+# printf format substituted for @FORMAT@ in that fragment.  Fragments that
+# do not use @FORMAT@ get an empty string.
+typemap = {
+	'dict_t *':				( "DICT",		""),
+	'fd_t *':				( "GFID",		""),
+	'dev_t':				( "DOUBLE",		"%ld (0x%lx)"),
+	'gf_xattrop_flags_t':	( "INTEGER",	"%d (0x%x)"),
+	'int32_t':				( "INTEGER",	"%d (0x%x)"),
+	'mode_t':				( "INTEGER",	"%d (0x%x)"),
+	'off_t':				( "DOUBLE",		"%ld (0x%lx)"),
+	'size_t':				( "DOUBLE",		"%ld (0x%lx)"),
+	'uint32_t':				( "INTEGER",	"%d (0x%x)"),
+	'loc_t *':				( "LOC",		""),
+	'const char *':			( "STRING",		""),
+	'struct iovec *':		( "VECTOR",		""),
+	'struct iatt *':		( "IATT",		""),
+}
+
+def get_special_subs (args):
+	code = ""
+	for arg in args:
+		if (arg[0] != 'fop-arg') or (len(arg) < 4):
+			continue
+		recon_type, recon_fmt = typemap[arg[2]]
+		code += fragments[recon_type].replace("@ARGNAME@",arg[3])		\
+									 .replace("@FORMAT@",recon_fmt)
+	return code
+
+def gen_functions ():
+	code = ""
+	for name, value in ops.iteritems():
+		if "journal" not in [ x[0] for x in value ]:
+			continue
+		fop_subs[name]["@FUNCTION_BODY@"] = get_special_subs(value)
+		# Print the FOP fragment with @FUNCTION_BODY@ in the middle.
+		code += generate(fragments["FOP"],name,fop_subs)
+	return code
+
+def gen_cases ():
+	code = ""
+	for name, value in ops.iteritems():
+		if "journal" not in [ x[0] for x in value ]:
+			continue
+		# Add the CASE fragment for this fop.
+		code += generate(fragments["CASE"],name,fop_subs)
+	return code
+
+def load_fragments (path="recon-tmpl.c"):
+	pragma_re = re.compile('pragma fragment (.*)')
+	cur_symbol = None
+	cur_value = ""
+	result = {}
+	for line in open(path,"r").readlines():
+		m = pragma_re.search(line)
+		if m:
+			if cur_symbol:
+				result[cur_symbol] = cur_value
+			cur_symbol = m.group(1)
+			cur_value = ""
+		else:
+			cur_value += line
+	if cur_symbol:
+		result[cur_symbol] = cur_value
+	return result
+
+# Usage: gen_dumper.py <template file>.  The generated C file is written to
+# stdout: prolog, one dump function per journaled fop, then the epilog with
+# its switch body filled in.
+if __name__ == "__main__":
+	# "fragments" is deliberately global; the gen_* helpers above read it.
+	fragments = load_fragments(sys.argv[1])
+	print "/* BEGIN GENERATED CODE - DO NOT MODIFY */"
+	print fragments["PROLOG"]
+	print gen_functions()
+	print fragments["EPILOG"].replace("@SWITCH_BODY@",gen_cases())
+	print "/* END GENERATED CODE */"
diff --git a/xlators/experimental/fdl/src/gen_fdl.py b/xlators/experimental/fdl/src/gen_fdl.py
new file mode 100755
index 00000000000..7f6b1aaaeaa
--- /dev/null
+++ b/xlators/experimental/fdl/src/gen_fdl.py
@@ -0,0 +1,328 @@
+#!/usr/bin/python
+
+import os
+import sys
+
+curdir = os.path.dirname (sys.argv[0])
+gendir = os.path.join (curdir, '../../../../libglusterfs/src')
+sys.path.append (gendir)
+from generator import ops, fop_subs, cbk_subs, generate
+
+# Generation occurs in three stages.  In this case, it actually makes more
+# sense to discuss them in the *opposite* order of that in which they
+# actually happen.
+#
+#   Stage 3 is to insert all of the generated code into a file, replacing the
+#   "#pragma generate" that's already there.  The file can thus contain all
+#   sorts of stuff that's not specific to one fop, either before or after the
+#   generated code as appropriate.
+#
+#   Stage 2 is to generate all of the code *for a particular fop*, using a
+#   string-valued template plus a table of substitution values.  Most of these
+#   are built in to the generator itself.  However, we also add a couple that
+#   are specific to this particular translator - LEN_CODE and SER_CODE.  These
+#   are per-fop functions to get the length or the contents (respectively) of
+#   what we'll put in the log.  As with stage 3 allowing per-file boilerplate
+#   before and after generated code, this allows per-fop boilerplate before and
+#   after generated code.
+#
+#   Stage 1, therefore, is to create the LEN_CODE and SER_CODE substitutions for
+#   each fop, and put them in the same table where e.g. NAME and SHORT_ARGS
+#   already are.  We do this by looking at the fop-description table in the
+#   generator module, then doing our own template substitution to plug each
+#   specific argument name into another string-valued template.
+#
+# So, what does this leave us with in terms of variables and files?
+#
+#   For stage 1, we have a series of LEN_*_TEMPLATE and SERLZ_*_TEMPLATE
+#   strings, which are used to generate the length and serialization code for
+#   each argument type.
+#
+#   For stage 2, we have a bunch of *_TEMPLATE strings (no LEN_ or SERLZ_
+#   prefix), which are used (along with the output from stage 1) to generate
+#   whole functions.
+#
+#   For stage 3, we have a whole separate file (fdl_tmpl.c) into which we insert
+#   the collection of all functions defined in stage 2.
+
+
+# Stage 2 template for fdl_len_<fop>, which works out how many bytes of
+# journal metadata and data a stub needs and records both in the stub.
+# @LEN_CODE@ is replaced with the per-argument LEN_* snippets for the fop.
+LEN_TEMPLATE = """
+void
+fdl_len_@NAME@ (call_stub_t *stub)
+{
+        uint32_t    meta_len    = sizeof (event_header_t);
+		uint32_t	data_len	= 0;
+
+        /* TBD: global stuff, e.g. uid/gid */
+@LEN_CODE@
+
+		/* TBD: pad extension length */
+		stub->jnl_meta_len = meta_len;
+		stub->jnl_data_len = data_len;
+}
+"""
+
+# Stage 2 template for fdl_serialize_<fop>.  It fills in an event_header_t at
+# the start of meta_buf, then @SER_CODE@ (the per-argument SERLZ_* snippets)
+# writes each argument after the header.  'offset' counts the bytes written
+# after the header and ends up in the header's ext_length.
+SER_TEMPLATE = """
+void
+fdl_serialize_@NAME@ (call_stub_t *stub, char *meta_buf, char *data_buf)
+{
+		event_header_t	*eh;
+		unsigned long	offset = 0;
+
+        /* TBD: word size/endianness */
+		eh = (event_header_t *)meta_buf;
+		eh->event_type = NEW_REQUEST;
+		eh->fop_type = GF_FOP_@UPNAME@;
+		eh->request_id = 0;	// TBD
+		meta_buf += sizeof (*eh);
+@SER_CODE@
+		/* TBD: pad extension length */
+		eh->ext_length = offset;
+}
+"""
+
+# Stage 2 template for the fop callback, which just unwinds to the caller
+# with the results it was handed.
+CBK_TEMPLATE = """
+int32_t
+fdl_@NAME@_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno,
+                @LONG_ARGS@)
+{
+        STACK_UNWIND_STRICT (@NAME@, frame, op_ret, op_errno,
+                             @SHORT_ARGS@);
+        return 0;
+}
+"""
+
+# Stage 2 template for fdl_<fop>_continue, which winds the fop to the first
+# child with fdl_<fop>_cbk as its callback.
+# NOTE(review): FOP_TEMPLATE passes default_<fop>, not this function, when it
+# creates the stub - confirm whether anything else calls this.
+CONTINUE_TEMPLATE = """
+int32_t
+fdl_@NAME@_continue (call_frame_t *frame, xlator_t *this,
+                     @LONG_ARGS@)
+{
+        STACK_WIND (frame, fdl_@NAME@_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->@NAME@,
+                    @SHORT_ARGS@);
+        return 0;
+}
+
+"""
+
+# Stage 2 template for the fop entry point.  It wraps the call in a stub,
+# computes the journal lengths for it, attaches the matching serializer and
+# hands the stub to fdl_enqueue.
+# NOTE(review): the stub is dereferenced without a NULL check - confirm that
+# fop_<fop>_stub cannot fail here, or add error handling.
+FOP_TEMPLATE = """
+int32_t
+fdl_@NAME@ (call_frame_t *frame, xlator_t *this,
+            @LONG_ARGS@)
+{
+        call_stub_t     *stub;
+
+        stub = fop_@NAME@_stub (frame, default_@NAME@,
+                                @SHORT_ARGS@);
+		fdl_len_@NAME@ (stub);
+        stub->serialize = fdl_serialize_@NAME@;
+        fdl_enqueue (this, stub);
+
+        return 0;
+}
+"""
+
+# Length of a dict: per member, an int key length, the key plus its NUL, an
+# int value length and the value bytes; then one more int for the zero key
+# length that terminates the list.
+LEN_DICT_TEMPLATE = """
+		if (@SRC@) {
+			data_pair_t *memb;
+			for (memb = @SRC@->members_list; memb; memb = memb->next) {
+				meta_len += sizeof(int);
+				meta_len += strlen(memb->key) + 1;
+				meta_len += sizeof(int);
+				meta_len += memb->value->len;
+			}
+		}
+		meta_len += sizeof(int);
+"""
+
+# An fd is journaled as 16 bytes (see SERLZ_GFID_TEMPLATE, which copies the
+# gfid of the fd's inode).
+LEN_GFID_TEMPLATE = """
+        meta_len += 16;
+"""
+
+# Scalars are copied as-is, so their length is simply their size.
+LEN_INTEGER_TEMPLATE = """
+        meta_len += sizeof (@SRC@);
+"""
+
+# 16 for gfid, 16 for pargfid, 1 for flag, 0/1 for terminating NUL
+LEN_LOC_TEMPLATE = """
+        if (@SRC@.name) {
+                meta_len += (strlen (@SRC@.name) + 34);
+        } else {
+                meta_len += 33;
+        }
+"""
+
+# 1 for the presence flag, plus (if the string is present) the string and
+# its terminating NUL.  This has to agree byte-for-byte with what
+# SERLZ_STRING_TEMPLATE writes below.
+LEN_STRING_TEMPLATE = """
+        if (@SRC@) {
+                meta_len += (strlen (@SRC@) + 2);
+        } else {
+                meta_len += 1;
+        }
+"""
+
+# Only the total payload length goes into the metadata; the payload itself
+# is accounted for in the separate data length.
+LEN_VECTOR_TEMPLATE = """
+        meta_len += sizeof(size_t);
+        data_len += iov_length (@VEC@, @CNT@);
+"""
+
+LEN_IATT_TEMPLATE = """
+		meta_len += sizeof(@SRC@.ia_prot);
+		meta_len += sizeof(@SRC@.ia_uid);
+		meta_len += sizeof(@SRC@.ia_gid);
+		meta_len += sizeof(@SRC@.ia_atime);
+		meta_len += sizeof(@SRC@.ia_atime_nsec);
+		meta_len += sizeof(@SRC@.ia_mtime);
+		meta_len += sizeof(@SRC@.ia_mtime_nsec);
+"""
+
+# Each member is written as (key length including NUL, key, value length,
+# value); a zero key length terminates the list, and is all that is written
+# for a NULL dict.
+SERLZ_DICT_TEMPLATE = """
+        if (@SRC@) {
+			data_pair_t *memb;
+			for (memb = @SRC@->members_list; memb; memb = memb->next) {
+				*((int *)(meta_buf+offset)) = strlen(memb->key) + 1;
+				offset += sizeof(int);
+				strcpy (meta_buf+offset, memb->key);
+				offset += strlen(memb->key) + 1;
+				*((int *)(meta_buf+offset)) = memb->value->len;
+				offset += sizeof(int);
+				memcpy (meta_buf+offset, memb->value->data, memb->value->len);
+				offset += memb->value->len;
+			}
+        }
+		*((int *)(meta_buf+offset)) = 0;
+		offset += sizeof(int);
+"""
+
+SERLZ_GFID_TEMPLATE = """
+        memcpy (meta_buf+offset, @SRC@->inode->gfid, 16);
+        offset += 16;
+"""
+
+SERLZ_INTEGER_TEMPLATE = """
+        memcpy (meta_buf+offset, &@SRC@, sizeof(@SRC@));
+        offset += sizeof(@SRC@);
+"""
+
+SERLZ_LOC_TEMPLATE = """
+        memcpy (meta_buf+offset, @SRC@.gfid, 16);
+        offset += 16;
+        memcpy (meta_buf+offset, @SRC@.pargfid, 16);
+        offset += 16;
+        if (@SRC@.name) {
+                *(meta_buf+offset) = 1;
+				++offset;
+                strcpy (meta_buf+offset, @SRC@.name);
+                offset += (strlen (@SRC@.name) + 1);
+        } else {
+                *(meta_buf+offset) = 0;
+				++offset;
+        }
+"""
+
+# Flag byte, then (if present) the string *including* its NUL.  The offset
+# must step over the NUL as well: the replay side skips strlen + 1 bytes
+# after the flag, and without this the next field would overwrite the NUL.
+SERLZ_STRING_TEMPLATE = """
+        if (@SRC@) {
+                *(meta_buf+offset) = 1;
+				++offset;
+                strcpy (meta_buf+offset, @SRC@);
+                offset += (strlen (@SRC@) + 1);
+        } else {
+                *(meta_buf+offset) = 0;
+				++offset;
+        }
+"""
+
+# The total payload length is written to the metadata; the payload itself is
+# copied, element by element, into the data buffer.
+SERLZ_VECTOR_TEMPLATE = """
+        *((size_t *)(meta_buf+offset)) = iov_length (@VEC@, @CNT@);
+        offset += sizeof(size_t);
+        int32_t i;
+        for (i = 0; i < @CNT@; ++i) {
+                memcpy (data_buf, @VEC@[i].iov_base, @VEC@[i].iov_len);
+                data_buf += @VEC@[i].iov_len;
+        }
+"""
+
+# We don't need to save all of the fields - only those affected by chown,
+# chgrp, chmod, and utime.
+# NOTE(review): every field after ia_prot is stored through a uint32_t
+# pointer while the offset advances by the field's own size - this assumes
+# all six fields are 32 bits wide; confirm against struct iatt.
+SERLZ_IATT_TEMPLATE = """
+		*((ia_prot_t *)(meta_buf+offset)) = @SRC@.ia_prot;
+		offset += sizeof(@SRC@.ia_prot);
+		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_uid;
+		offset += sizeof(@SRC@.ia_uid);
+		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_gid;
+		offset += sizeof(@SRC@.ia_gid);
+		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_atime;
+		offset += sizeof(@SRC@.ia_atime);
+		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_atime_nsec;
+		offset += sizeof(@SRC@.ia_atime_nsec);
+		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_mtime;
+		offset += sizeof(@SRC@.ia_mtime);
+		*((uint32_t *)(meta_buf+offset)) = @SRC@.ia_mtime_nsec;
+		offset += sizeof(@SRC@.ia_mtime_nsec);
+"""
+
+typemap = {
+	'dict_t *':				( LEN_DICT_TEMPLATE,	SERLZ_DICT_TEMPLATE),
+	'fd_t *':				( LEN_GFID_TEMPLATE,	SERLZ_GFID_TEMPLATE),
+	'dev_t':				( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE),
+	'gf_xattrop_flags_t':	( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE),
+	'int32_t':				( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE),
+	'mode_t':				( LEN_INTEGER_TEMPLATE, SERLZ_INTEGER_TEMPLATE),
+	'off_t':				( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE),
+	'size_t':				( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE),
+	'uint32_t':				( LEN_INTEGER_TEMPLATE,	SERLZ_INTEGER_TEMPLATE),
+	'loc_t *':				( LEN_LOC_TEMPLATE,		SERLZ_LOC_TEMPLATE),
+	'const char *':			( LEN_STRING_TEMPLATE,	SERLZ_STRING_TEMPLATE),
+	'struct iatt *':		( LEN_IATT_TEMPLATE,	SERLZ_IATT_TEMPLATE),
+}
+
+def get_special_subs (args):
+	# Stage 1: build the length-computing and serializing code for one fop.
+	# Only 'fop-arg' entries with a fourth element (the stub field name) are
+	# journaled.  Returns (len_code, ser_code).
+	len_parts = []
+	ser_parts = []
+	for arg in args:
+		if (arg[0] == 'fop-arg') and (len(arg) >= 4):
+			field = arg[3]
+			if field == "vector":
+				# Deliberately obvious special case: the vector needs two
+				# substitutions (pointer and count) instead of one.
+				len_tmpl, ser_tmpl = LEN_VECTOR_TEMPLATE, SERLZ_VECTOR_TEMPLATE
+				subs = (("@VEC@","stub->args.vector"),
+						("@CNT@","stub->args.count"))
+			else:
+				# An unknown type raises KeyError on purpose.  The broken
+				# build tells whoever changed the stub code that this table
+				# needs a matching update.
+				len_tmpl, ser_tmpl = typemap[arg[2]]
+				subs = (("@SRC@","stub->args.%s" % field),)
+			for old, new in subs:
+				len_tmpl = len_tmpl.replace(old,new)
+				ser_tmpl = ser_tmpl.replace(old,new)
+			len_parts.append(len_tmpl)
+			ser_parts.append(ser_tmpl)
+	return "".join(len_parts), "".join(ser_parts)
+
+def gen_fdl ():
+	# Stage 2: print the five generated functions for every journaled fop,
+	# followed by the xlator_fops table that points at the entry points.
+	journaled = []
+	for name, value in ops.iteritems():
+		if "journal" in [ x[0] for x in value ]:
+			len_code, ser_code = get_special_subs(value)
+			# Drop the last character of each (the final snippet's newline).
+			fop_subs[name]["@LEN_CODE@"] = len_code[:-1]
+			fop_subs[name]["@SER_CODE@"] = ser_code[:-1]
+			for tmpl, table in (
+					(LEN_TEMPLATE, fop_subs),
+					(SER_TEMPLATE, fop_subs),
+					(CBK_TEMPLATE, cbk_subs),
+					(CONTINUE_TEMPLATE, fop_subs),
+					(FOP_TEMPLATE, fop_subs)):
+				print generate(tmpl,name,table)
+			journaled.append(name)
+	print "struct xlator_fops fops = {"
+	for ep in journaled:
+		print "\t.%s = fdl_%s," % (ep, ep)
+	print "};"
+
+# Stage 3: copy the template (argv[1]) to stdout, replacing the line that
+# contains "#pragma generate" with all of the generated code.
+with open(sys.argv[1],'r') as tmpl_file:
+	for l in tmpl_file:
+		if l.find('#pragma generate') != -1:
+			print "/* BEGIN GENERATED CODE - DO NOT MODIFY */"
+			gen_fdl()
+			print "/* END GENERATED CODE */"
+		else:
+			# Strip only the newline (print adds one back).  Slicing off the
+			# last character unconditionally would eat a real character from
+			# a final line that has no newline.
+			print l.rstrip('\n')
diff --git a/xlators/experimental/fdl/src/gen_recon.py b/xlators/experimental/fdl/src/gen_recon.py
new file mode 100755
index 00000000000..26318f92d88
--- /dev/null
+++ b/xlators/experimental/fdl/src/gen_recon.py
@@ -0,0 +1,191 @@
+#!/usr/bin/python
+
+import os
+import re
+import string
+import sys
+
+curdir = os.path.dirname (sys.argv[0])
+gendir = os.path.join (curdir, '../../../../libglusterfs/src')
+sys.path.append (gendir)
+from generator import ops, fop_subs, cbk_subs, generate
+
+# See the big header comment at the start of gen_fdl.py to see how the stages
+# fit together.  The big difference here is that *all* of the C code is in the
+# template file as labelled fragments, instead of as Python strings.  That
+# makes it much easier to edit in one place, with proper syntax highlighting
+# and indentation.
+#
+#   Stage 1 uses type-specific fragments to generate FUNCTION_BODY, instead of
+#   LEN_*_TEMPLATE and SERLZ_*_TEMPLATE to generate LEN_CODE and SER_CODE.
+#
+#   Stage 2 uses the FOP and CASE fragments instead of RECON_TEMPLATE and
+#   FOP_TEMPLATE.  The expanded FOP code (including FUNCTION_BODY substitution
+#   in the middle of each function) is emitted immediately; the expanded CASE
+#   code is saved for the next stage.
+#
+#   Stage 3 uses the PROLOG and EPILOG fragments, with the expanded CASE code
+#   in the middle of EPILOG, to generate the whole output file.
+#
+# Another way of looking at it is to consider how the fragments appear in
+# the final output:
+#
+#   PROLOG
+#   FOP (expanded for CREATE)
+#       FOP before FUNCTION_BODY
+#       LOC, INTEGER, GFID, etc. (one per arg, by type)
+#       FOP after FUNCTION_BODY
+#   FOP (expanded for WRITEV)
+#       FOP before FUNCTION_BODY
+#       GFID, VECTOR, etc. (one per arg, by type)
+#       FOP after FUNCTION_BODY
+#   (more FOPs)
+#   EPILOG
+#       EPILOG before CASE
+#       CASE statements (one per fop)
+#       EPILOG after CASE
+
+typemap = {
+	'dict_t *':				"DICT",
+	'fd_t *':				"FD",
+	'dev_t':				"DOUBLE",
+	'gf_xattrop_flags_t':	"INTEGER",
+	'int32_t':				"INTEGER",
+	'mode_t':				"INTEGER",
+	'off_t':				"DOUBLE",
+	'size_t':				"DOUBLE",
+	'uint32_t':				"INTEGER",
+	'loc_t *':				"LOC",
+	'const char *':			"STRING",
+	'struct iovec *':		"VECTOR",
+	'struct iatt *':		"IATT",
+	'struct iobref *':		"IOBREF",
+}
+
+def get_special_subs (name, args, fop_type):
+	# Build the per-fop pieces of a replay function from the fop description
+	# 'args'.  Returns a tuple of four strings:
+	#   code       - declarations and decoding code, one fragment per argument
+	#   links      - LINK fragments, one per 'link' entry
+	#   s_args_str - the argument list for the syncop call
+	#   cleanups   - the *_CLEANUP fragments, in reverse argument order
+	code = ""
+	cleanups = ""
+	links = ""
+	s_args = []
+	for arg in args:
+		if arg[0] == 'extra':
+			code += "\t%s %s;\n\n" % (arg[2], arg[1])
+			s_args.append(arg[3])
+			continue
+		if arg[0] == 'link':
+			links += fragments["LINK"].replace("@INODE_ARG@",arg[1])	\
+									  .replace("@IATT_ARG@",arg[2])
+			continue
+		if arg[0] != 'fop-arg':
+			continue
+		if (name, arg[1]) == ('writev', 'count'):
+			# Special case: just skip this.  We can't mark it as 'nosync'
+			# because of the way the translator and dumper generators look for
+			# that after 'stub-name' which we don't define.  Instead of adding a
+			# bunch of generic infrastructure for this one case, just pound it
+			# here.
+			continue
+		recon_type = typemap[arg[2]]
+		if (name == "create") and (arg[1] == "fd"):
+			# Special case: fd for create is new, not looked up.
+			recon_type = "NEW_FD"
+		elif (recon_type == "LOC") and (fop_type == "entry-op"):
+			# Need to treat this differently for inode vs. entry ops.
+			# Special case: link source is treated like inode-op.
+			if (name != "link") or (arg[1] != "oldloc"):
+				recon_type = "PARENT_LOC"
+		code += fragments[recon_type].replace("@ARGNAME@",arg[1])		\
+									 .replace("@ARGTYPE@",arg[2])
+		cleanup_key = recon_type + "_CLEANUP"
+		if cleanup_key in fragments:
+			new_frag = fragments[cleanup_key].replace("@ARGNAME@",arg[1])
+			# Cleanups must come out in the *reverse* of the order in which
+			# the arguments were processed.  Each argument's fragment points
+			# the error label at its own cleanup, and the cleanup labels fall
+			# through to one another.  In forward order, a failure on an
+			# early argument would fall through into the cleanup for a later
+			# argument whose variable has not been initialized yet, while a
+			# failure on a late argument would skip the earlier cleanups.
+			cleanups = new_frag + cleanups
+		if 'nosync' in arg[4:]:
+			code += "\t(void)%s;\n" % arg[1];
+			continue
+		if arg[2] in ("loc_t *", "struct iatt *"):
+			# These are passed as pointers to the syncop, but they're actual
+			# structures in the generated code.
+			s_args.append("&"+arg[1]);
+		else:
+			s_args.append(arg[1])
+	# We have to handle a couple of special cases here, because some n00b
+	# defined the syncops with a different argument order than the fops they're
+	# based on.
+	if name == 'writev':
+		# Swap 'flags' and 'iobref'.  Also, we need to add the iov count, which
+		# is not stored in or read from the journal.  There are other ways to
+		# do that, but this is the only place we need anything similar and we
+		# already have to treat it as a special case so this is simplest.
+		s_args_str = 'fd, &vector, 1, off, iobref, flags, xdata'
+	elif name == 'symlink':
+		# Swap 'linkpath' and 'loc'.
+		s_args_str = '&loc, linkpath, &iatt, xdata'
+	else:
+		s_args_str = string.join (s_args, ", ")
+	return code, links, s_args_str, cleanups
+
+# TBD: probably need to generate type-specific cleanup code as well - e.g.
+# fd_unref for an fd_t, loc_wipe for a loc_t, and so on.  All of these
+# generated CLEANUP fragments will go at the end of the function, with goto
+# labels.  Meanwhile, the error-checking part of each type-specific fragment
+# (e.g. LOC or FD) will need to update the indirect label that we jump to when
+# an error is detected.  This will probably get messy.
+def gen_functions ():
+	# Expand the FOP fragment for every fop that has a "journal" entry in
+	# its description, and return the expansions concatenated.  The second
+	# element of the first "journal" entry is the fop type handed to
+	# get_special_subs.
+	chunks = []
+	for name, value in ops.iteritems():
+		journal_types = [ x[1] for x in value if x[0] == "journal" ]
+		if journal_types:
+			subs = fop_subs[name]
+			(subs["@FUNCTION_BODY@"], subs["@LINKS@"],
+			 subs["@SYNCOP_ARGS@"], subs["@CLEANUPS@"]) = \
+				get_special_subs (name, value, journal_types[0])
+			# *During reconciliation* the writev vector is always a single
+			# element (in normal I/O it's not), so success there means the
+			# syncop returned that element's length.
+			subs["@SUCCESS_VALUE@"] = \
+				"vector.iov_len" if name == "writev" else "GFAPI_SUCCESS"
+			# FOP fragment with @FUNCTION_BODY@ in the middle.
+			chunks.append(generate(fragments["FOP"],name,fop_subs))
+	return "".join(chunks)
+
+def gen_cases ():
+	# One CASE fragment per journaled fop, concatenated, for use as the
+	# body of the dispatch switch in EPILOG.
+	return "".join([ generate(fragments["CASE"],name,fop_subs)
+					 for name, value in ops.iteritems()
+					 if "journal" in [ x[0] for x in value ] ])
+
+def load_fragments (path="recon-tmpl.c"):
+	# Split a template file into named fragments.  A line matching
+	# "pragma fragment NAME" starts a new fragment; every following line, up
+	# to the next such marker, is appended to it verbatim.  Lines before the
+	# first marker are discarded.  Returns a dict mapping NAME to its text.
+	pragma_re = re.compile('pragma fragment (.*)')
+	cur_symbol = None
+	cur_value = ""
+	result = {}
+	# Read through a context manager so the template is closed as soon as
+	# we're done with it, instead of whenever the handle gets collected.
+	with open(path,"r") as tmpl:
+		for line in tmpl:
+			m = pragma_re.search(line)
+			if m:
+				if cur_symbol:
+					result[cur_symbol] = cur_value
+				cur_symbol = m.group(1)
+				cur_value = ""
+			else:
+				cur_value += line
+	# Flush the fragment that was still open at end of file.
+	if cur_symbol:
+		result[cur_symbol] = cur_value
+	return result
+
+if __name__ == "__main__":
+	# argv[1] is the template file holding the labelled fragments.  The
+	# output (on stdout) is PROLOG, then one FOP expansion per journaled
+	# fop, then EPILOG with the CASE expansions replacing @SWITCH_BODY@.
+	fragments = load_fragments(sys.argv[1])
+	print "/* BEGIN GENERATED CODE - DO NOT MODIFY */"
+	print fragments["PROLOG"]
+	print gen_functions()
+	print fragments["EPILOG"].replace("@SWITCH_BODY@",gen_cases())
+	print "/* END GENERATED CODE */"
diff --git a/xlators/experimental/fdl/src/jnl-types.h b/xlators/experimental/fdl/src/jnl-types.h
new file mode 100644
index 00000000000..8cb39d01a25
--- /dev/null
+++ b/xlators/experimental/fdl/src/jnl-types.h
@@ -0,0 +1,14 @@
+/* Event type for a journaled request; the only one defined in this header. */
+#define NEW_REQUEST     (uint8_t)'N'
+
+/* Fixed-size header at the start of each journal metadata record. */
+typedef struct {
+        uint8_t         event_type;     /* e.g. NEW_REQUEST */
+        uint8_t         fop_type;       /* e.g. GF_FOP_SETATTR */
+        uint16_t        request_id;
+        uint32_t        ext_length;     /* bytes of metadata after the header */
+} event_header_t;
+
+/*
+ * Operation codes specific to this translator.
+ * NOTE(review): presumably these are the op values passed to the
+ * translator's ipc fop - confirm against the fdl template.
+ */
+enum {
+        FDL_IPC_BASE = 0xfeedbee5,       /* ... and they make honey */
+        FDL_IPC_CHANGE_TERM,
+        FDL_IPC_GET_TERMS,
+};
diff --git a/xlators/experimental/fdl/src/logdump.c b/xlators/experimental/fdl/src/logdump.c
new file mode 100644
index 00000000000..7c979c32a04
--- /dev/null
+++ b/xlators/experimental/fdl/src/logdump.c
@@ -0,0 +1,50 @@
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+extern int fdl_dump (char **, char **);
+
+/*
+ * Dump a journal.  argv[1] is the metadata file and argv[2] the data file;
+ * both are mapped read-only and handed to fdl_dump, which is called until
+ * it returns zero.  Exits with EXIT_FAILURE on a usage, open or mmap error.
+ */
+int
+main (int argc, char **argv)
+{
+        int     meta_fd         = (-1);
+        char    *meta_buf       = NULL;
+        int     data_fd         = (-1);
+        char    *data_buf       = NULL;
+
+        /* Without this, a missing argument means calling open(NULL). */
+        if (argc < 3) {
+                fprintf (stderr, "Usage: %s <meta-file> <data-file>\n",
+                         (argc > 0) ? argv[0] : "logdump");
+                return EXIT_FAILURE;
+        }
+
+        meta_fd = open (argv[1], O_RDONLY);
+        if (meta_fd < 0) {
+                perror ("open");
+                return EXIT_FAILURE;
+        }
+
+        /* TBD: get proper length */
+        meta_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, meta_fd, 0);
+        if (meta_buf == MAP_FAILED) {
+                perror ("mmap");
+                return EXIT_FAILURE;
+        }
+
+        data_fd = open (argv[2], O_RDONLY);
+        if (data_fd < 0) {
+                perror ("open");
+                return EXIT_FAILURE;
+        }
+
+        /* TBD: get proper length */
+        data_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, data_fd, 0);
+        if (data_buf == MAP_FAILED) {
+                perror ("mmap");
+                return EXIT_FAILURE;
+        }
+
+        /* fdl_dump advances both pointers; zero means nothing more to dump. */
+        for (;;) {
+                if (!fdl_dump(&meta_buf,&data_buf)) {
+                        break;
+                }
+        }
+
+        return EXIT_SUCCESS;
+}
diff --git a/xlators/experimental/fdl/src/recon-tmpl.c b/xlators/experimental/fdl/src/recon-tmpl.c
new file mode 100644
index 00000000000..523bda39418
--- /dev/null
+++ b/xlators/experimental/fdl/src/recon-tmpl.c
@@ -0,0 +1,305 @@
+#pragma fragment PROLOG
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "fd.h"
+#include "iatt.h"
+#include "syncop.h"
+#include "xlator.h"
+#include "glfs-internal.h"
+
+#include "jnl-types.h"
+
+#define GFAPI_SUCCESS 0
+
+/*
+ * Return a referenced inode for the given gfid, or NULL on failure.  The
+ * inode table of the active subvolume is tried first; on a miss, a fresh
+ * inode is looked up by gfid and linked into the table.
+ */
+inode_t *
+recon_get_inode (glfs_t *fs, uuid_t gfid)
+{
+        inode_t         *inode;
+        loc_t           loc     = {NULL,};
+        struct iatt     iatt;
+        int             ret;
+        inode_t         *newinode;
+
+        inode = inode_find (fs->active_subvol->itable, gfid);
+        if (inode) {
+                printf ("=== FOUND %s IN TABLE\n", uuid_utoa(gfid));
+                return inode;
+        }
+
+        loc.inode = inode_new (fs->active_subvol->itable);
+        if (!loc.inode) {
+                return NULL;
+        }
+        gf_uuid_copy (loc.inode->gfid, gfid);
+        gf_uuid_copy (loc.gfid, gfid);
+
+        printf ("=== DOING LOOKUP FOR %s\n", uuid_utoa(gfid));
+
+        ret = syncop_lookup (fs->active_subvol, &loc, &iatt,
+                             NULL, NULL, NULL);
+        if (ret != GFAPI_SUCCESS) {
+                fprintf (stderr, "syncop_lookup failed (%d)\n", ret);
+                /* Don't leak the reference we got from inode_new. */
+                inode_unref (loc.inode);
+                return NULL;
+        }
+
+        newinode = inode_link (loc.inode, NULL, NULL, &iatt);
+        if (newinode) {
+                inode_lookup (newinode);
+        }
+
+        /*
+         * inode_link returns its own reference to the linked inode, so the
+         * one taken by inode_new is ours to drop whether or not it worked.
+         */
+        inode_unref (loc.inode);
+
+        return newinode;
+}
+
+#pragma fragment DICT
+        /*
+         * Rebuild a dict from the metadata stream, which holds a sequence of
+         * (int key length, key, int value length, value) records ended by a
+         * zero key length.
+         */
+        dict_t  *@ARGNAME@;
+
+        @ARGNAME@ = dict_new();
+        if (!@ARGNAME@) {
+                goto *err_label;
+        }
+        err_label = &&cleanup_@ARGNAME@;
+
+        {
+                int     key_len, data_len;
+                char    *key_ptr;
+                int     garbage;
+                for (;;) {
+                        key_len = *((int *)new_meta);
+                        new_meta += sizeof(int);
+                        if (!key_len) {
+                                break;
+                        }
+                        key_ptr = new_meta;
+                        new_meta += key_len;
+                        data_len = *((int *)new_meta);
+                        new_meta += sizeof(int);
+                        /*
+                         * NOTE(review): presumably the key and value are
+                         * referenced in place rather than copied, so the
+                         * metadata buffer must outlive the dict - confirm.
+                         */
+                        garbage = dict_set_static_bin (@ARGNAME@, key_ptr,
+                                                       new_meta, data_len);
+                        /* TBD: check error from dict_set_static_bin */
+                        (void)garbage;
+                        new_meta += data_len;
+                }
+        }
+
+#pragma fragment DICT_CLEANUP
+cleanup_@ARGNAME@:
+        dict_unref (@ARGNAME@);
+
+#pragma fragment DOUBLE
+        /*
+         * Read one value of the argument's own type, then step over eight
+         * bytes regardless of how wide that type is.
+         * NOTE(review): the journal writer advances by sizeof the field, so
+         * the two only agree where the type is 64 bits wide - confirm.
+         */
+        @ARGTYPE@       @ARGNAME@       = *((@ARGTYPE@ *)new_meta);
+        new_meta += sizeof(uint64_t);
+
+#pragma fragment FD
+        /*
+         * An fd is journaled as the 16-byte gfid of its inode.  Find that
+         * inode, then get an anonymous fd for it.
+         */
+        inode_t *@ARGNAME@_ino;
+        fd_t    *@ARGNAME@;
+
+        @ARGNAME@_ino = recon_get_inode (fs, *((uuid_t *)new_meta));
+        new_meta += 16;
+        if (!@ARGNAME@_ino) {
+                goto *err_label;
+        }
+        /* From here on, a failure must at least release the inode. */
+        err_label = &&cleanup_@ARGNAME@_ino;
+
+        @ARGNAME@ = fd_anonymous (@ARGNAME@_ino);
+        if (!@ARGNAME@) {
+                goto *err_label;
+        }
+        err_label = &&cleanup_@ARGNAME@;
+
+#pragma fragment FD_CLEANUP
+cleanup_@ARGNAME@:
+        fd_unref (@ARGNAME@);
+cleanup_@ARGNAME@_ino:
+        inode_unref (@ARGNAME@_ino);
+
+#pragma fragment NEW_FD
+        /*
+         * This pseudo-type is only used for create, and in that case we know
+         * we'll be using loc.inode, so it's not worth generalizing to take an
+         * extra argument.
+         */
+        fd_t    *@ARGNAME@      = fd_anonymous (loc.inode);
+
+        /* Test the variable declared above, not a hard-coded name. */
+        if (!@ARGNAME@) {
+                goto *err_label;
+        }
+        err_label = &&cleanup_@ARGNAME@;
+        /* Skip the 16 bytes the journal holds for this argument. */
+        new_meta += 16;
+
+#pragma fragment NEW_FD_CLEANUP
+cleanup_@ARGNAME@:
+        fd_unref (@ARGNAME@);
+
+#pragma fragment INTEGER
+        @ARGTYPE@       @ARGNAME@       = *((@ARGTYPE@ *)new_meta);
+
+        new_meta += sizeof(@ARGTYPE@);
+
+#pragma fragment LOC
+        /*
+         * Inode-style loc: resolve the inode from the journaled gfid, skip
+         * the parent gfid, and pick up the name if the flag byte says there
+         * is one.  The name points into the metadata buffer.
+         */
+        loc_t           @ARGNAME@       = { NULL, };
+
+        @ARGNAME@.inode = recon_get_inode (fs, *((uuid_t *)new_meta));
+        if (!@ARGNAME@.inode) {
+                goto *err_label;
+        }
+        err_label = &&cleanup_@ARGNAME@;
+        gf_uuid_copy (@ARGNAME@.gfid, @ARGNAME@.inode->gfid);
+        new_meta += 16;
+        new_meta += 16; /* skip over pargfid */
+        if (*(new_meta++)) {
+                @ARGNAME@.name = new_meta;
+                new_meta += strlen(new_meta) + 1;
+        }
+
+#pragma fragment LOC_CLEANUP
+cleanup_@ARGNAME@:
+        loc_wipe (&@ARGNAME@);
+
+#pragma fragment PARENT_LOC
+        /*
+         * Entry-style loc: skip the journaled gfid, resolve the parent from
+         * the parent gfid, require a name, and create a new inode for the
+         * entry itself.  The name points into the metadata buffer.
+         */
+        loc_t           @ARGNAME@       = { NULL, };
+
+        new_meta += 16; /* skip over gfid */
+        @ARGNAME@.parent = recon_get_inode (fs, *((uuid_t *)new_meta));
+        if (!@ARGNAME@.parent) {
+                goto *err_label;
+        }
+        err_label = &&cleanup_@ARGNAME@;
+        gf_uuid_copy (@ARGNAME@.pargfid, @ARGNAME@.parent->gfid);
+        new_meta += 16;
+        /* A zero flag byte means no name was journaled; that's an error. */
+        if (!*(new_meta++)) {
+                goto *err_label;
+        }
+        @ARGNAME@.name = new_meta;
+        new_meta += strlen(new_meta) + 1;
+
+        @ARGNAME@.inode = inode_new (fs->active_subvol->itable);
+        if (!@ARGNAME@.inode) {
+                goto *err_label;
+        }
+
+#pragma fragment PARENT_LOC_CLEANUP
+cleanup_@ARGNAME@:
+        loc_wipe (&@ARGNAME@);
+
+#pragma fragment STRING
+        /*
+         * A string is journaled as a flag byte followed, if the flag is
+         * non-zero, by the NUL-terminated string itself.  The result points
+         * into the metadata buffer; a zero flag is treated as an error.
+         */
+        char    *@ARGNAME@;
+        if (*(new_meta++)) {
+                @ARGNAME@ = new_meta;
+                new_meta += (strlen(new_meta) + 1);
+        }
+        else {
+                goto *err_label;
+        }
+
+#pragma fragment VECTOR
+        /*
+         * The metadata holds only the total length; the payload is taken in
+         * place from the data stream as a single iovec element.
+         */
+        struct iovec    @ARGNAME@;
+
+        @ARGNAME@.iov_len = *((size_t *)new_meta);
+        new_meta += sizeof(@ARGNAME@.iov_len);
+        @ARGNAME@.iov_base = new_data;
+        new_data += @ARGNAME@.iov_len;
+
+#pragma fragment IATT
+        /*
+         * Decode the subset of iatt fields that the journal carries: the
+         * protection bits followed by six 32-bit values.  Everything else is
+         * zeroed, so that no indeterminate fields get passed to the syncop.
+         */
+        struct iatt     @ARGNAME@       = {0,};
+        {
+                @ARGNAME@.ia_prot = *((ia_prot_t *)new_meta);
+                new_meta += sizeof(ia_prot_t);
+                uint32_t *myints = (uint32_t *)new_meta;
+                @ARGNAME@.ia_uid = myints[0];
+                @ARGNAME@.ia_gid = myints[1];
+                @ARGNAME@.ia_atime = myints[2];
+                @ARGNAME@.ia_atime_nsec = myints[3];
+                @ARGNAME@.ia_mtime = myints[4];
+                @ARGNAME@.ia_mtime_nsec = myints[5];
+                new_meta += sizeof(*myints) * 6;
+        }
+
+#pragma fragment IOBREF
+        struct iobref   *@ARGNAME@;
+
+        @ARGNAME@ = iobref_new();
+        if (!@ARGNAME@) {
+                goto *err_label;
+        }
+        err_label = &&cleanup_@ARGNAME@;
+
+#pragma fragment IOBREF_CLEANUP
+cleanup_@ARGNAME@:
+        iobref_unref (@ARGNAME@);
+
+#pragma fragment LINK
+        /* TBD: check error */
+        inode_t *new_inode = inode_link (@INODE_ARG@, NULL, NULL, @IATT_ARG@);
+        if (new_inode) {
+                inode_lookup (new_inode);
+        }
+
+#pragma fragment FOP
+/*
+ * Replay one journaled fop: decode its arguments from the metadata and data
+ * streams, issue the matching syncop, then run the cleanups.  Returns 0 on
+ * success and 0xbad on any failure.  Either way the caller's stream pointers
+ * are moved past whatever was consumed.
+ */
+int
+fdl_replay_@NAME@ (glfs_t *fs, char **old_meta, char **old_data)
+{
+        char    *new_meta	= *old_meta;
+        char	*new_data	= *old_data;
+        int     ret;
+        int     status          = 0xbad;
+        /*
+         * Where to jump on failure.  Each argument that needs cleanup points
+         * this at its own cleanup label once it has been set up.
+         */
+        void    *err_label      = &&done;
+
+@FUNCTION_BODY@
+
+        ret = syncop_@NAME@ (fs->active_subvol, @SYNCOP_ARGS@, NULL);
+        if (ret != @SUCCESS_VALUE@) {
+                /* End the message with a newline so errors don't run on. */
+                fprintf (stderr, "syncop_@NAME@ returned %d\n", ret);
+                goto *err_label;
+        }
+
+@LINKS@
+
+        status = 0;
+
+@CLEANUPS@
+
+done:
+        *old_meta = new_meta;
+        *old_data = new_data;
+        return status;
+}
+
+#pragma fragment CASE
+        /* A failed replay leaves "recognized" at zero. */
+        case GF_FOP_@UPNAME@:
+                printf ("=== GF_FOP_@UPNAME@\n");
+                if (fdl_replay_@NAME@ (fs, &new_meta, &new_data) != 0) {
+                        goto done;
+                }
+                recognized = 1;
+                break;
+
+#pragma fragment EPILOG
+/*
+ * Replay the single journal event at *old_meta / *old_data and advance both
+ * pointers past it.  Returns 1 if the event's fop was recognized and
+ * replayed successfully, 0 otherwise.
+ */
+int
+recon_execute (glfs_t *fs, char **old_meta, char **old_data)
+{
+        char            *new_meta       = *old_meta;
+        char            *new_data       = *old_data;
+        int             recognized      = 0;
+        event_header_t  *eh;
+
+        /* Every event starts with a fixed header; the arguments follow. */
+        eh = (event_header_t *)new_meta;
+        new_meta += sizeof (*eh);
+
+        /* TBD: check event_type instead of assuming NEW_REQUEST */
+
+        switch (eh->fop_type) {
+@SWITCH_BODY@
+
+        default:
+                printf ("unknown fop %u\n", eh->fop_type);
+        }
+
+done:
+        *old_meta = new_meta;
+        *old_data = new_data;
+        return recognized;
+}
diff --git a/xlators/experimental/fdl/src/recon.c b/xlators/experimental/fdl/src/recon.c
new file mode 100644
index 00000000000..14168a011e0
--- /dev/null
+++ b/xlators/experimental/fdl/src/recon.c
@@ -0,0 +1,89 @@
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include "glusterfs.h"
+#include "fd.h"
+#include "syncop.h"
+#include "glfs-internal.h"
+
+#define GFAPI_SUCCESS 0
+
+extern int recon_execute (glfs_t *, char **, char **);
+
+/*
+ * Replay a journal against a volume.  argv[1] is the volfile, argv[2] the
+ * journal metadata file and argv[3] the journal data file.  Events are
+ * replayed with recon_execute until it returns zero.  Exits with
+ * EXIT_FAILURE on a usage, gfapi setup, open or mmap error.
+ */
+int
+main (int argc, char **argv)
+{
+        glfs_t  *fs;
+        int     ret;
+        int     meta_fd         = (-1);
+        char    *meta_buf       = NULL;
+        int     data_fd         = (-1);
+        char    *data_buf       = NULL;
+
+        /* Without this, a missing argument means passing NULL paths on. */
+        if (argc < 4) {
+                fprintf (stderr,
+                         "Usage: %s <volfile> <meta-file> <data-file>\n",
+                         (argc > 0) ? argv[0] : "recon");
+                return EXIT_FAILURE;
+        }
+
+        fs = glfs_new ("whocares");
+        if (!fs) {
+                fprintf (stderr, "glfs_new failed\n");
+                return EXIT_FAILURE;
+        }
+
+        /* Full logging to stderr if RECON_DEBUG is set, otherwise none. */
+        if (getenv("RECON_DEBUG")) {
+                ret = glfs_set_logging (fs, "/dev/stderr", 7);
+        }
+        else {
+                ret = glfs_set_logging (fs, "/dev/null", 0);
+        }
+
+        if (ret != GFAPI_SUCCESS) {
+                fprintf (stderr, "glfs_set_logging failed (%d)\n", errno);
+                return EXIT_FAILURE;
+        }
+
+        ret = glfs_set_volfile (fs, argv[1]);
+        if (ret != GFAPI_SUCCESS) {
+                fprintf (stderr, "glfs_set_volfile failed (%d)\n", errno);
+                return EXIT_FAILURE;
+        }
+
+        ret = glfs_init (fs);
+        if (ret != GFAPI_SUCCESS) {
+                fprintf (stderr, "glfs_init failed (%d)\n", errno);
+                return EXIT_FAILURE;
+        }
+
+        meta_fd = open (argv[2], O_RDONLY);
+        if (meta_fd < 0) {
+                perror ("open");
+                return EXIT_FAILURE;
+        }
+
+        /* TBD: get proper length */
+        meta_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, meta_fd, 0);
+        if (meta_buf == MAP_FAILED) {
+                perror ("mmap");
+                return EXIT_FAILURE;
+        }
+
+        data_fd = open (argv[3], O_RDONLY);
+        if (data_fd < 0) {
+                perror ("open");
+                return EXIT_FAILURE;
+        }
+
+        /* TBD: get proper length */
+        data_buf = mmap (NULL, 1048576, PROT_READ, MAP_PRIVATE, data_fd, 0);
+        if (data_buf == MAP_FAILED) {
+                perror ("mmap");
+                return EXIT_FAILURE;
+        }
+
+        /* recon_execute advances both pointers; zero means stop. */
+        for (;;) {
+                if (!recon_execute(fs,&meta_buf,&data_buf)) {
+                        break;
+                }
+        }
+
+        return EXIT_SUCCESS;
+}
diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am
index 7e5783f4f30..649d9d8e9fa 100644
--- a/xlators/features/Makefile.am
+++ b/xlators/features/Makefile.am
@@ -1,5 +1,6 @@
-SUBDIRS = locks quota read-only mac-compat quiesce marker index barrier arbiter\
-          protect compress changelog changetimerecorder ganesha gfid-access $(GLUPY_SUBDIR) qemu-block \
-          upcall snapview-client snapview-server trash shard bit-rot #path-converter # filter
+SUBDIRS = locks quota read-only mac-compat quiesce marker index barrier \
+	  arbiter protect compress changelog changetimerecorder ganesha \
+	  gfid-access $(GLUPY_SUBDIR) qemu-block upcall snapview-client \
+	  snapview-server trash shard bit-rot
 
 CLEANFILES =
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
index 2c52cf72a3f..3df4b3556cf 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -1783,6 +1783,30 @@ out:
         return ret;
 }
 
+/*
+ * Conditionally insert the "experimental/fdl" (Full Data Logging) translator
+ * into a brick graph.
+ *
+ * Add this before (above) io-threads in the graph table because it (fdl) is
+ * not thread-safe yet.
+ *
+ * The translator is added only when dict_get_str_boolean yields a non-zero
+ * value for "features.fdl" in set_dict (default 0); it is named after
+ * volinfo->volname.
+ *
+ * Returns 0 if nothing needed to be added or the translator was added, and
+ * -1 if graph, volinfo or set_dict is NULL or volgen_graph_add fails.
+ * brickinfo is not used by this function.
+ */
+static int
+brick_graph_add_fdl (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
+                     dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
+{
+        int             fdl_enabled = 0;
+
+        if (graph == NULL || volinfo == NULL || set_dict == NULL)
+                return -1;
+
+        fdl_enabled = dict_get_str_boolean (set_dict, "features.fdl", 0);
+        if (!fdl_enabled)
+                return 0;
+
+        if (volgen_graph_add (graph, "experimental/fdl",
+                              volinfo->volname) == NULL)
+                return -1;
+
+        return 0;
+}
+
 static int
 brick_graph_add_iot (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
                       dict_t *set_dict, glusterd_brickinfo_t *brickinfo)
@@ -2359,6 +2383,7 @@ static volgen_brick_xlator_t server_graph_table[] = {
         {brick_graph_add_index, "index"},
         {brick_graph_add_barrier, NULL},
         {brick_graph_add_marker, "marker"},
+        {brick_graph_add_fdl, "fdl"},
         {brick_graph_add_iot, "io-threads"},
         {brick_graph_add_upcall, "upcall"},
         {brick_graph_add_pump, NULL},
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 1463ef72c71..c0059d83cfe 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -2711,6 +2711,15 @@ struct volopt_map_entry glusterd_volopt_map[] = {
           .op_version  = GD_OP_VERSION_4_0_0,
           .description = "percent of rep_count-1 bricks that must be up"
         },
+        /* Full Data Logging */
+        {
+          .key         = "features.fdl",
+          .voltype     = "features/fdl",
+          .option      = "!fdl",
+          .op_version  = GD_OP_VERSION_4_0_0,
+          .flags       = OPT_FLAG_XLATOR_OPT,
+          .type        = NO_DOC,
+        },
         { .key         = NULL
         }
 };
author	Jeff Darcy <jdarcy@redhat.com>	2016-02-08 13:30:49 -0500
committer	Jeff Darcy <jdarcy@redhat.com>	2016-02-13 05:13:07 -0800
commit	c458433041aafb48ae6d6e5fcf3e1e737dc3fda3 (patch)
tree	33a03ca0c1f5faf58419de2c4ff4532752ddfb07
parent	da33097c3d6492e3b468b4347e47c70828fb4320 (diff)