summaryrefslogtreecommitdiffstats
path: root/cli
diff options
context:
space:
mode:
authorAmar Tumballi <amar@gluster.com>2010-08-09 05:38:46 +0000
committerAnand V. Avati <avati@dev.gluster.com>2010-08-11 22:24:32 -0700
commit0b501ac95edc0ef5945228eb47e6482cfc4efa41 (patch)
tree58fa16eb100868a65e754f704b2a19817250ca91 /cli
parent6ff316dca7c9eebab14e71b6ad0908c2e45a5ee2 (diff)
'gluster volume rebalance' related fixes
Signed-off-by: Amar Tumballi <amar@gluster.com> Signed-off-by: Anand V. Avati <avati@dev.gluster.com> BUG: 1307 (gluster volume defrag <VOLNAME> status) URL: http://bugs.gluster.com/cgi-bin/bugzilla3/show_bug.cgi?id=1307
Diffstat (limited to 'cli')
-rw-r--r--cli/src/cli-cmd-volume.c54
-rw-r--r--cli/src/cli.h1
-rw-r--r--cli/src/cli3_1-cops.c78
3 files changed, 105 insertions, 28 deletions
diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c
index 22844c500ae..63c15473072 100644
--- a/cli/src/cli-cmd-volume.c
+++ b/cli/src/cli-cmd-volume.c
@@ -219,8 +219,7 @@ cli_cmd_volume_rename_cbk (struct cli_state *state, struct cli_cmd_word *word,
goto out;
dict = dict_new ();
-
- if (dict)
+ if (!dict)
goto out;
GF_ASSERT (words[2]);
@@ -254,34 +253,57 @@ out:
return ret;
}
+void
+cli_cmd_volume_defrag_usage ()
+{
+ cli_out ("Usage: volume rebalance <volname> <start|stop|status>");
+}
int
cli_cmd_volume_defrag_cbk (struct cli_state *state, struct cli_cmd_word *word,
const char **words, int wordcount)
{
- int ret = -1;
- rpc_clnt_procedure_t *proc = NULL;
- call_frame_t *frame = NULL;
- char *volname = NULL;
-
+ int ret = -1;
+ rpc_clnt_procedure_t *proc = NULL;
+ call_frame_t *frame = NULL;
+ dict_t *dict = NULL;
frame = create_frame (THIS, THIS->ctx->pool);
if (!frame)
goto out;
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ GF_ASSERT (words[2]);
+
+ if (!(words[3])) {
+ cli_cmd_volume_defrag_usage();
+ goto out;
+ }
//TODO: Build validation here
- volname = (char *)words[2];
- GF_ASSERT (volname);
+ ret = dict_set_str (dict, "volname", (char *)words[2]);
+ if (ret)
+ goto out;
+
+ ret = dict_set_str (dict, "command", (char *)words[3]);
+ if (ret)
+ goto out;
proc = &cli_rpc_prog->proctable[GF1_CLI_DEFRAG_VOLUME];
if (proc->fn) {
- ret = proc->fn (frame, THIS, volname);
+ ret = proc->fn (frame, THIS, dict);
}
out:
- if (!proc && ret)
- cli_out ("Defrag of Volume %s failed", volname);
+ if (!proc && ret) {
+ if (dict)
+ dict_destroy (dict);
+
+ cli_out ("Defrag of Volume %s failed", (char *)words[2]);
+ }
return 0;
}
@@ -438,7 +460,13 @@ struct cli_cmd volume_cmds[] = {
{ "volume remove-brick <VOLNAME> [(replica <COUNT>)|(stripe <COUNT>)] <BRICK> ...",
cli_cmd_volume_remove_brick_cbk },
- { "volume defrag <VOLNAME>",
+ { "volume rebalance <VOLNAME> start",
+ cli_cmd_volume_defrag_cbk },
+
+ { "volume rebalance <VOLNAME> stop",
+ cli_cmd_volume_defrag_cbk },
+
+ { "volume rebalance <VOLNAME> status",
cli_cmd_volume_defrag_cbk },
{ "volume replace-brick <VOLNAME> (<BRICK> <NEW-BRICK>)|pause|abort|start|status",
diff --git a/cli/src/cli.h b/cli/src/cli.h
index 7b2de667358..36ac7214bfd 100644
--- a/cli/src/cli.h
+++ b/cli/src/cli.h
@@ -123,6 +123,7 @@ struct cli_local {
struct {
char *volname;
+ int cmd;
} defrag_vol;
struct {
diff --git a/cli/src/cli3_1-cops.c b/cli/src/cli3_1-cops.c
index 210c881b61b..de3aa6ed2dd 100644
--- a/cli/src/cli3_1-cops.c
+++ b/cli/src/cli3_1-cops.c
@@ -500,11 +500,12 @@ int
gf_cli3_1_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gf1_cli_defrag_vol_rsp rsp = {0,};
- int ret = 0;
- cli_local_t *local = NULL;
- char *volname = NULL;
- call_frame_t *frame = NULL;
+ gf1_cli_defrag_vol_rsp rsp = {0,};
+ cli_local_t *local = NULL;
+ char *volname = NULL;
+ call_frame_t *frame = NULL;
+ int cmd = 0;
+ int ret = 0;
if (-1 == req->rpc_status) {
goto out;
@@ -521,12 +522,34 @@ gf_cli3_1_defrag_volume_cbk (struct rpc_req *req, struct iovec *iov,
if (frame)
local = frame->local;
- if (local)
- volname = local->u.start_vol.volname;
+ if (local) {
+ volname = local->u.defrag_vol.volname;
+ cmd = local->u.defrag_vol.cmd;
+ }
+ if (cmd == GF_DEFRAG_CMD_START) {
+ cli_out ("starting defrag on volume %s has been %s", volname,
+ (rsp.op_ret) ? "unsuccessful": "successful");
+ }
+ if (cmd == GF_DEFRAG_CMD_STOP) {
+ if (rsp.op_ret == -1)
+ cli_out ("'defrag volume %s stop' failed", volname);
+ else
+ cli_out ("stopped defrag process of volume %s \n"
+ "(after rebalancing %"PRId64" files totaling "
+ "%"PRId64" bytes)", volname, rsp.files, rsp.size);
+ }
+ if (cmd == GF_DEFRAG_CMD_STATUS) {
+ if (rsp.op_ret == -1)
+ cli_out ("failed to get the status of defrag process");
+ else {
+ cli_out ("rebalanced %"PRId64" files of size %"PRId64
+ " (total files scanned %"PRId64")",
+ rsp.files, rsp.size, rsp.lookedup_files);
+ }
+ }
- gf_log ("cli", GF_LOG_NORMAL, "Received resp to probe");
- cli_out ("Defrag of volume %s has been %s", volname,
- (rsp.op_ret) ? "unsuccessful": "successful");
+ if (volname)
+ GF_FREE (volname);
ret = rsp.op_ret;
@@ -1074,23 +1097,48 @@ int32_t
gf_cli3_1_defrag_volume (call_frame_t *frame, xlator_t *this,
void *data)
{
- gf1_cli_defrag_vol_req req = {0,};
- int ret = 0;
- cli_local_t *local = NULL;
+ gf1_cli_defrag_vol_req req = {0,};
+ int ret = 0;
+ cli_local_t *local = NULL;
+ char *volname = NULL;
+ char *cmd_str = NULL;
+ dict_t *dict = NULL;
if (!frame || !this || !data) {
ret = -1;
goto out;
}
+ dict = data;
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ gf_log ("", GF_LOG_DEBUG, "error");
+
+ ret = dict_get_str (dict, "command", &cmd_str);
+ if (ret) {
+ gf_log ("", GF_LOG_DEBUG, "error");
+ goto out;
+ }
+
+ if (strncasecmp (cmd_str, "start", 6) == 0) {
+ req.cmd = GF_DEFRAG_CMD_START;
+ } else if (strncasecmp (cmd_str, "stop", 5) == 0) {
+ req.cmd = GF_DEFRAG_CMD_STOP;
+ } else if (strncasecmp (cmd_str, "status", 7) == 0) {
+ req.cmd = GF_DEFRAG_CMD_STATUS;
+ }
+
+
local = cli_local_get ();
if (local) {
- local->u.defrag_vol.volname = data;
+ local->u.defrag_vol.volname = gf_strdup (volname);
+ local->u.defrag_vol.cmd = req.cmd;
frame->local = local;
}
- req.volname = data;
+ req.volname = volname;
ret = cli_cmd_submit (&req, frame, cli_rpc_prog,
GD_MGMT_CLI_DEFRAG_VOLUME, NULL,
-rw-r--r--xlators/cluster/dht/src/dht-diskusage.c179
-rw-r--r--xlators/cluster/dht/src/dht-hashfn.c76
-rw-r--r--xlators/cluster/dht/src/dht-helper.c399
-rw-r--r--xlators/cluster/dht/src/dht-inode-read.c92
-rw-r--r--xlators/cluster/dht/src/dht-inode-write.c455
-rw-r--r--xlators/cluster/dht/src/dht-layout.c168
-rw-r--r--xlators/cluster/dht/src/dht-linkfile.c144
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c494
-rw-r--r--xlators/cluster/dht/src/dht-rename.c209
-rw-r--r--xlators/cluster/dht/src/dht-selfheal.c316
-rw-r--r--xlators/cluster/dht/src/dht-shared.c780
-rw-r--r--xlators/cluster/dht/src/dht.c549
-rw-r--r--xlators/cluster/dht/src/nufa.c336
-rw-r--r--xlators/cluster/dht/src/switch.c216
-rw-r--r--xlators/cluster/dht/src/unittest/dht_layout_mock.c63
-rw-r--r--xlators/cluster/dht/src/unittest/dht_layout_unittest.c124
-rw-r--r--xlators/cluster/ha/src/Makefile.am2
-rw-r--r--xlators/cluster/map/src/Makefile.am2
-rw-r--r--xlators/cluster/nsr-client/Makefile.am3
-rw-r--r--xlators/cluster/nsr-client/src/Makefile.am33
-rw-r--r--xlators/cluster/nsr-client/src/fop-template.c113
-rwxr-xr-xxlators/cluster/nsr-client/src/gen-fops.py57
-rw-r--r--xlators/cluster/nsr-client/src/nsrc.c243
-rw-r--r--xlators/cluster/nsr-recon/Makefile.am3
-rw-r--r--xlators/cluster/nsr-recon/src/Makefile.am23
-rw-r--r--xlators/cluster/nsr-recon/src/recon_driver.c3130
-rw-r--r--xlators/cluster/nsr-recon/src/recon_driver.h325
-rw-r--r--xlators/cluster/nsr-recon/src/recon_xlator.c1010
-rw-r--r--xlators/cluster/nsr-recon/src/recon_xlator.h92
-rw-r--r--xlators/cluster/nsr-server/Makefile.am3
-rw-r--r--xlators/cluster/nsr-server/src/Makefile.am43
-rw-r--r--xlators/cluster/nsr-server/src/all-templates.c345
-rwxr-xr-xxlators/cluster/nsr-server/src/codegen.py174
-rw-r--r--xlators/cluster/nsr-server/src/etcd-api.c831
-rw-r--r--xlators/cluster/nsr-server/src/etcd-api.h214
-rw-r--r--xlators/cluster/nsr-server/src/etcd-sim.c280
-rwxr-xr-xxlators/cluster/nsr-server/src/gen-fops.py120
-rw-r--r--xlators/cluster/nsr-server/src/leader.c138
-rw-r--r--xlators/cluster/nsr-server/src/nsr-internal.h101
-rw-r--r--xlators/cluster/nsr-server/src/nsr.c812
-rw-r--r--xlators/cluster/nsr-server/src/recon_notify.c389
-rw-r--r--xlators/cluster/nsr-server/src/yajl.c175
-rw-r--r--xlators/cluster/nsr-server/src/yajl/yajl_common.h75
-rw-r--r--xlators/cluster/nsr-server/src/yajl/yajl_gen.h157
-rw-r--r--xlators/cluster/nsr-server/src/yajl/yajl_parse.h226
-rw-r--r--xlators/cluster/nsr-server/src/yajl/yajl_tree.h177
-rw-r--r--xlators/cluster/nsr-server/src/yajl/yajl_version.h23
-rw-r--r--xlators/cluster/nsr-server/src/yajl_alloc.c49
-rw-r--r--xlators/cluster/nsr-server/src/yajl_alloc.h34
-rw-r--r--xlators/cluster/nsr-server/src/yajl_buf.c103
-rw-r--r--xlators/cluster/nsr-server/src/yajl_buf.h57
-rw-r--r--xlators/cluster/nsr-server/src/yajl_bytestack.h69
-rw-r--r--xlators/cluster/nsr-server/src/yajl_encode.c220
-rw-r--r--xlators/cluster/nsr-server/src/yajl_encode.h34
-rw-r--r--xlators/cluster/nsr-server/src/yajl_gen.c350
-rw-r--r--xlators/cluster/nsr-server/src/yajl_lex.c763
-rw-r--r--xlators/cluster/nsr-server/src/yajl_lex.h117
-rw-r--r--xlators/cluster/nsr-server/src/yajl_parser.c492
-rw-r--r--xlators/cluster/nsr-server/src/yajl_parser.h78
-rw-r--r--xlators/cluster/nsr-server/src/yajl_tree.c501
-rw-r--r--xlators/cluster/nsr-server/src/yajl_version.c7
-rw-r--r--xlators/cluster/stripe/src/Makefile.am3
-rw-r--r--xlators/cluster/stripe/src/stripe-helpers.c99
-rw-r--r--xlators/cluster/stripe/src/stripe-mem-types.h1
-rw-r--r--xlators/cluster/stripe/src/stripe.c863
-rw-r--r--xlators/cluster/stripe/src/stripe.h11
-rw-r--r--xlators/debug/error-gen/src/Makefile.am2
-rw-r--r--xlators/debug/error-gen/src/error-gen.c236
-rw-r--r--xlators/debug/error-gen/src/error-gen.h12
-rw-r--r--xlators/debug/io-stats/src/Makefile.am6
-rw-r--r--xlators/debug/io-stats/src/io-stats.c272
-rw-r--r--xlators/debug/trace/src/Makefile.am3
-rw-r--r--xlators/debug/trace/src/trace-mem-types.h (renamed from xlators/features/marker/utils/src/procdiggy.h)17
-rw-r--r--xlators/debug/trace/src/trace.c2878
-rw-r--r--xlators/debug/trace/src/trace.h61
-rw-r--r--xlators/encryption/Makefile.am2
-rw-r--r--xlators/encryption/crypt/Makefile.am3
-rw-r--r--xlators/encryption/crypt/src/Makefile.am24
-rw-r--r--xlators/encryption/crypt/src/atom.c962
-rw-r--r--xlators/encryption/crypt/src/crypt-common.h141
-rw-r--r--xlators/encryption/crypt/src/crypt-mem-types.h44
-rw-r--r--xlators/encryption/crypt/src/crypt.c4522
-rw-r--r--xlators/encryption/crypt/src/crypt.h908
-rw-r--r--xlators/encryption/crypt/src/data.c769
-rw-r--r--xlators/encryption/crypt/src/keys.c302
-rw-r--r--xlators/encryption/crypt/src/metadata.c605
-rw-r--r--xlators/encryption/crypt/src/metadata.h74
-rw-r--r--xlators/encryption/rot-13/src/Makefile.am2
-rw-r--r--xlators/encryption/rot-13/src/rot-13.c21
-rw-r--r--xlators/features/Makefile.am3
-rw-r--r--xlators/features/barrier/Makefile.am3
-rw-r--r--xlators/features/barrier/src/Makefile.am16
-rw-r--r--xlators/features/barrier/src/barrier-mem-types.h20
-rw-r--r--xlators/features/barrier/src/barrier.c658
-rw-r--r--xlators/features/barrier/src/barrier.h91
-rw-r--r--xlators/features/changelog/Makefile.am3
-rw-r--r--xlators/features/changelog/lib/Makefile.am3
-rw-r--r--xlators/features/changelog/lib/examples/c/get-changes.c87
-rw-r--r--xlators/features/changelog/lib/examples/python/changes.py32
-rw-r--r--xlators/features/changelog/lib/examples/python/libgfchangelog.py64
-rw-r--r--xlators/features/changelog/lib/src/Makefile.am38
-rw-r--r--xlators/features/changelog/lib/src/changelog.h31
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-helpers.c180
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-helpers.h102
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-process.c571
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog.c571
-rw-r--r--xlators/features/changelog/lib/src/gf-history-changelog.c274
-rw-r--r--xlators/features/changelog/src/Makefile.am21
-rw-r--r--xlators/features/changelog/src/changelog-default-fops.c561
-rw-r--r--xlators/features/changelog/src/changelog-encoders.c182
-rw-r--r--xlators/features/changelog/src/changelog-encoders.h48
-rw-r--r--xlators/features/changelog/src/changelog-fops.h157
-rw-r--r--xlators/features/changelog/src/changelog-helpers.c719
-rw-r--r--xlators/features/changelog/src/changelog-helpers.h578
-rw-r--r--xlators/features/changelog/src/changelog-mem-types.h30
-rw-r--r--xlators/features/changelog/src/changelog-misc.h107
-rw-r--r--xlators/features/changelog/src/changelog-notifier.c314
-rw-r--r--xlators/features/changelog/src/changelog-notifier.h19
-rw-r--r--xlators/features/changelog/src/changelog-rt.c83
-rw-r--r--xlators/features/changelog/src/changelog-rt.h40
-rw-r--r--xlators/features/changelog/src/changelog.c1389
-rw-r--r--xlators/features/changelog/src/policy/changelog-policy-default.c45
-rw-r--r--xlators/features/changelog/src/policy/changelog-policy-replication.c1374
-rw-r--r--xlators/features/changelog/src/policy/changelog-policy.h41
-rw-r--r--xlators/features/compress/Makefile.am3
-rw-r--r--xlators/features/compress/src/Makefile.am17
-rw-r--r--xlators/features/compress/src/cdc-helper.c547
-rw-r--r--xlators/features/compress/src/cdc-mem-types.h23
-rw-r--r--xlators/features/compress/src/cdc.c361
-rw-r--r--xlators/features/compress/src/cdc.h107
-rw-r--r--xlators/features/filter/src/Makefile.am2
-rw-r--r--xlators/features/gfid-access/Makefile.am (renamed from xlators/bindings/python/Makefile.am)0
-rw-r--r--xlators/features/gfid-access/src/Makefile.am15
-rw-r--r--xlators/features/gfid-access/src/gfid-access-mem-types.h23
-rw-r--r--xlators/features/gfid-access/src/gfid-access.c1299
-rw-r--r--xlators/features/gfid-access/src/gfid-access.h134
-rw-r--r--xlators/features/glupy/Makefile.am3
-rw-r--r--xlators/features/glupy/doc/README.md44
-rw-r--r--xlators/features/glupy/doc/TESTING9
-rw-r--r--xlators/features/glupy/doc/test.vol10
-rw-r--r--xlators/features/glupy/examples/Makefile.am5
-rw-r--r--xlators/features/glupy/examples/debug-trace.py775
-rw-r--r--xlators/features/glupy/examples/helloworld.py19
-rw-r--r--xlators/features/glupy/examples/negative.py91
-rw-r--r--xlators/features/glupy/src/Makefile.am21
-rw-r--r--xlators/features/glupy/src/glupy.c2490
-rw-r--r--xlators/features/glupy/src/glupy.h69
-rw-r--r--xlators/features/glupy/src/glupy.py841
-rw-r--r--xlators/features/glupy/src/setup.py.in24
-rw-r--r--xlators/features/index/src/Makefile.am2
-rw-r--r--xlators/features/index/src/index.c142
-rw-r--r--xlators/features/locks/src/Makefile.am3
-rw-r--r--xlators/features/locks/src/clear.c14
-rw-r--r--xlators/features/locks/src/common.c66
-rw-r--r--xlators/features/locks/src/common.h28
-rw-r--r--xlators/features/locks/src/entrylk.c549
-rw-r--r--xlators/features/locks/src/inodelk.c374
-rw-r--r--xlators/features/locks/src/locks.h60
-rw-r--r--xlators/features/locks/src/posix.c624
-rw-r--r--xlators/features/mac-compat/src/Makefile.am7
-rw-r--r--xlators/features/mac-compat/src/mac-compat.c249
-rw-r--r--xlators/features/mac-compat/src/mac-compat.h41
-rw-r--r--xlators/features/marker/Makefile.am2
-rw-r--r--xlators/features/marker/src/Makefile.am2
-rw-r--r--xlators/features/marker/src/marker-quota-helper.c31
-rw-r--r--xlators/features/marker/src/marker-quota-helper.h2
-rw-r--r--xlators/features/marker/src/marker-quota.c291
-rw-r--r--xlators/features/marker/src/marker-quota.h23
-rw-r--r--xlators/features/marker/src/marker.c691
-rw-r--r--xlators/features/marker/src/marker.h7
-rw-r--r--xlators/features/marker/utils/Makefile.am3
-rw-r--r--xlators/features/marker/utils/src/Makefile.am26
-rw-r--r--xlators/features/marker/utils/src/gsyncd.c363
-rw-r--r--xlators/features/marker/utils/src/procdiggy.c121
-rw-r--r--xlators/features/marker/utils/syncdaemon/Makefile.am6
-rw-r--r--xlators/features/marker/utils/syncdaemon/README.md81
-rw-r--r--xlators/features/marker/utils/syncdaemon/__codecheck.py46
-rw-r--r--xlators/features/marker/utils/syncdaemon/__init__.py0
-rw-r--r--xlators/features/marker/utils/syncdaemon/configinterface.py224
-rw-r--r--xlators/features/marker/utils/syncdaemon/gconf.py20
-rw-r--r--xlators/features/marker/utils/syncdaemon/gsyncd.py403
-rw-r--r--xlators/features/marker/utils/syncdaemon/libcxattr.py72
-rw-r--r--xlators/features/marker/utils/syncdaemon/master.py913
-rw-r--r--xlators/features/marker/utils/syncdaemon/monitor.py129
-rw-r--r--xlators/features/marker/utils/syncdaemon/repce.py225
-rw-r--r--xlators/features/marker/utils/syncdaemon/resource.py970
-rw-r--r--xlators/features/marker/utils/syncdaemon/syncdutils.py282
-rw-r--r--xlators/features/path-convertor/src/Makefile.am2
-rw-r--r--xlators/features/protect/Makefile.am3
-rw-r--r--xlators/features/protect/src/Makefile.am21
-rw-r--r--xlators/features/protect/src/prot_client.c217
-rw-r--r--xlators/features/protect/src/prot_dht.c168
-rw-r--r--xlators/features/protect/src/prot_server.c51
-rw-r--r--xlators/features/qemu-block/Makefile.am1
-rw-r--r--xlators/features/qemu-block/src/Makefile.am155
-rw-r--r--xlators/features/qemu-block/src/bdrv-xlator.c389
-rw-r--r--xlators/features/qemu-block/src/bh-syncop.c48
-rw-r--r--xlators/features/qemu-block/src/clock-timer.c60
-rw-r--r--xlators/features/qemu-block/src/coroutine-synctask.c116
-rw-r--r--xlators/features/qemu-block/src/monitor-logging.c50
-rw-r--r--xlators/features/qemu-block/src/qb-coroutines.c667
-rw-r--r--xlators/features/qemu-block/src/qb-coroutines.h30
-rw-r--r--xlators/features/qemu-block/src/qemu-block-memory-types.h25
-rw-r--r--xlators/features/qemu-block/src/qemu-block.c1140
-rw-r--r--xlators/features/qemu-block/src/qemu-block.h109
-rw-r--r--xlators/features/quiesce/src/Makefile.am2
-rw-r--r--xlators/features/quiesce/src/quiesce.c14
-rw-r--r--xlators/features/quota/src/Makefile.am17
-rw-r--r--xlators/features/quota/src/quota-enforcer-client.c403
-rw-r--r--xlators/features/quota/src/quota-mem-types.h3
-rw-r--r--xlators/features/quota/src/quota.c3060
-rw-r--r--xlators/features/quota/src/quota.h144
-rw-r--r--xlators/features/quota/src/quotad-aggregator.c423
-rw-r--r--xlators/features/quota/src/quotad-aggregator.h37
-rw-r--r--xlators/features/quota/src/quotad-helpers.c113
-rw-r--r--xlators/features/quota/src/quotad-helpers.h24
-rw-r--r--xlators/features/quota/src/quotad.c210
-rw-r--r--xlators/features/read-only/src/Makefile.am4
-rw-r--r--xlators/features/read-only/src/worm.c3
-rw-r--r--xlators/features/trash/src/Makefile.am2
-rw-r--r--xlators/lib/src/libxlator.c224
-rw-r--r--xlators/lib/src/libxlator.h89
-rw-r--r--xlators/mgmt/glusterd/src/Makefile.am18
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-brick-ops.c896
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-etcd.c87
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-etcd.h23
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-geo-rep.c3789
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handler.c2212
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handshake.c968
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-hooks.c44
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-locks.c656
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-locks.h51
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-log-ops.c43
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mem-types.h6
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c936
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt.c1899
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt.h45
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mountbroker.c1
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.c3250
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.h58
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-pmap.c100
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quota.c1478
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rebalance.c395
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-replace-brick.c419
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rpc-ops.c712
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-sm.c113
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-sm.h35
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot.c5787
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-store.c3111
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-store.h142
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-syncop.c1642
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-syncop.h73
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.c7152
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.h346
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.c1622
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.h92
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-ops.c1063
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c1558
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.c695
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.h612
-rw-r--r--xlators/mount/fuse/src/Makefile.am6
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.c1644
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.h125
-rw-r--r--xlators/mount/fuse/src/fuse-helpers.c38
-rw-r--r--xlators/mount/fuse/src/fuse-resolve.c37
-rwxr-xr-xxlators/mount/fuse/utils/mount.glusterfs.in711
-rwxr-xr-xxlators/mount/fuse/utils/mount_glusterfs.in600
-rw-r--r--xlators/nfs/server/src/Makefile.am9
-rw-r--r--xlators/nfs/server/src/acl3.c843
-rw-r--r--xlators/nfs/server/src/acl3.h42
-rw-r--r--xlators/nfs/server/src/mount3.c1041
-rw-r--r--xlators/nfs/server/src/mount3.h39
-rw-r--r--xlators/nfs/server/src/mount3udp_svc.c17
-rw-r--r--xlators/nfs/server/src/nfs-common.c59
-rw-r--r--xlators/nfs/server/src/nfs-common.h20
-rw-r--r--xlators/nfs/server/src/nfs-fops.c189
-rw-r--r--xlators/nfs/server/src/nfs-fops.h28
-rw-r--r--xlators/nfs/server/src/nfs-generics.c33
-rw-r--r--xlators/nfs/server/src/nfs-generics.h27
-rw-r--r--xlators/nfs/server/src/nfs-inodes.c27
-rw-r--r--xlators/nfs/server/src/nfs-inodes.h17
-rw-r--r--xlators/nfs/server/src/nfs-mem-types.h20
-rw-r--r--xlators/nfs/server/src/nfs.c898
-rw-r--r--xlators/nfs/server/src/nfs.h30
-rw-r--r--xlators/nfs/server/src/nfs3-fh.c39
-rw-r--r--xlators/nfs/server/src/nfs3-fh.h32
-rw-r--r--xlators/nfs/server/src/nfs3-helpers.c179
-rw-r--r--xlators/nfs/server/src/nfs3-helpers.h26
-rw-r--r--xlators/nfs/server/src/nfs3.c570
-rw-r--r--xlators/nfs/server/src/nfs3.h94
-rw-r--r--xlators/nfs/server/src/nlm4.c431
-rw-r--r--xlators/nfs/server/src/nlm4.h57
-rw-r--r--xlators/nfs/server/src/nlmcbk_svc.c28
-rw-r--r--xlators/performance/Makefile.am2
-rw-r--r--xlators/performance/io-cache/src/Makefile.am2
-rw-r--r--xlators/performance/io-cache/src/io-cache.c118
-rw-r--r--xlators/performance/io-cache/src/io-cache.h2
-rw-r--r--xlators/performance/io-cache/src/page.c38
-rw-r--r--xlators/performance/io-threads/src/Makefile.am2
-rw-r--r--xlators/performance/io-threads/src/io-threads.c1952
-rw-r--r--xlators/performance/io-threads/src/io-threads.h10
-rw-r--r--xlators/performance/md-cache/src/Makefile.am2
-rw-r--r--xlators/performance/md-cache/src/md-cache.c382
-rw-r--r--xlators/performance/open-behind/Makefile.am1
-rw-r--r--xlators/performance/open-behind/src/Makefile.am15
-rw-r--r--xlators/performance/open-behind/src/open-behind-mem-types.h21
-rw-r--r--xlators/performance/open-behind/src/open-behind.c1020
-rw-r--r--xlators/performance/quick-read/src/Makefile.am2
-rw-r--r--xlators/performance/quick-read/src/quick-read-mem-types.h1
-rw-r--r--xlators/performance/quick-read/src/quick-read.c3725
-rw-r--r--xlators/performance/quick-read/src/quick-read.h57
-rw-r--r--xlators/performance/read-ahead/src/Makefile.am2
-rw-r--r--xlators/performance/read-ahead/src/page.c7
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.c116
-rw-r--r--xlators/performance/readdir-ahead/Makefile.am3
-rw-r--r--xlators/performance/readdir-ahead/src/Makefile.am15
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h24
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead.c560
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead.h46
-rw-r--r--xlators/performance/symlink-cache/src/Makefile.am2
-rw-r--r--xlators/performance/write-behind/src/Makefile.am2
-rw-r--r--xlators/performance/write-behind/src/write-behind.c566
-rw-r--r--xlators/playground/Makefile.am2
-rw-r--r--xlators/playground/template/Makefile.am2
-rw-r--r--xlators/playground/template/src/Makefile.am16
-rw-r--r--xlators/playground/template/src/template.c49
-rw-r--r--xlators/playground/template/src/template.h24
-rw-r--r--xlators/protocol/auth/addr/src/Makefile.am2
-rw-r--r--xlators/protocol/auth/addr/src/addr.c18
-rw-r--r--xlators/protocol/auth/login/src/Makefile.am2
-rw-r--r--xlators/protocol/auth/login/src/login.c17
-rw-r--r--xlators/protocol/client/src/Makefile.am2
-rw-r--r--xlators/protocol/client/src/client-handshake.c293
-rw-r--r--xlators/protocol/client/src/client-helpers.c73
-rw-r--r--xlators/protocol/client/src/client-lk.c362
-rw-r--r--xlators/protocol/client/src/client-rpc-fops.c638
-rw-r--r--xlators/protocol/client/src/client.c221
-rw-r--r--xlators/protocol/client/src/client.h56
-rw-r--r--xlators/protocol/server/src/Makefile.am11
-rw-r--r--xlators/protocol/server/src/authenticate.c64
-rw-r--r--xlators/protocol/server/src/authenticate.h19
-rw-r--r--xlators/protocol/server/src/server-handshake.c153
-rw-r--r--xlators/protocol/server/src/server-helpers.c1303
-rw-r--r--xlators/protocol/server/src/server-helpers.h76
-rw-r--r--xlators/protocol/server/src/server-mem-types.h19
-rw-r--r--xlators/protocol/server/src/server-resolve.c58
-rw-r--r--xlators/protocol/server/src/server-rpc-fops.c2337
-rw-r--r--xlators/protocol/server/src/server.c838
-rw-r--r--xlators/protocol/server/src/server.h126
-rw-r--r--xlators/storage/Makefile.am6
-rw-r--r--xlators/storage/bd/Makefile.am3
-rw-r--r--xlators/storage/bd/src/Makefile.am20
-rw-r--r--xlators/storage/bd/src/bd-aio.c528
-rw-r--r--xlators/storage/bd/src/bd-aio.h41
-rw-r--r--xlators/storage/bd/src/bd-helper.c1021
-rw-r--r--xlators/storage/bd/src/bd-mem-types.h27
-rw-r--r--xlators/storage/bd/src/bd.c2450
-rw-r--r--xlators/storage/bd/src/bd.h173
-rw-r--r--xlators/storage/posix/src/Makefile.am2
-rw-r--r--xlators/storage/posix/src/posix-aio.c20
-rw-r--r--xlators/storage/posix/src/posix-handle.c212
-rw-r--r--xlators/storage/posix/src/posix-handle.h95
-rw-r--r--xlators/storage/posix/src/posix-helpers.c647
-rw-r--r--xlators/storage/posix/src/posix.c2217
-rw-r--r--xlators/storage/posix/src/posix.h71
-rw-r--r--xlators/system/posix-acl/src/Makefile.am4
-rw-r--r--xlators/system/posix-acl/src/posix-acl-mem-types.h24
-rw-r--r--xlators/system/posix-acl/src/posix-acl-xattr.c10
-rw-r--r--xlators/system/posix-acl/src/posix-acl-xattr.h16
-rw-r--r--xlators/system/posix-acl/src/posix-acl.c102
-rw-r--r--xlators/system/posix-acl/src/posix-acl.h49
411 files changed, 120114 insertions, 43703 deletions
diff --git a/xlators/Makefile.am b/xlators/Makefile.am
index b1643d26c..f60fa85ce 100644
--- a/xlators/Makefile.am
+++ b/xlators/Makefile.am
@@ -1,3 +1,4 @@
-SUBDIRS = cluster storage protocol performance debug features encryption mount nfs mgmt system
+SUBDIRS = cluster storage protocol performance debug features encryption mount nfs mgmt system \
+ playground
CLEANFILES =
diff --git a/xlators/bindings/Makefile.am b/xlators/bindings/Makefile.am
deleted file mode 100644
index f77665802..000000000
--- a/xlators/bindings/Makefile.am
+++ /dev/null
@@ -1 +0,0 @@
-SUBDIRS = $(BINDINGS_SUBDIRS)
diff --git a/xlators/bindings/python/src/Makefile.am b/xlators/bindings/python/src/Makefile.am
deleted file mode 100644
index 90370d861..000000000
--- a/xlators/bindings/python/src/Makefile.am
+++ /dev/null
@@ -1,19 +0,0 @@
-
-xlator_PROGRAMS = python.so
-
-xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/bindings
-
-python_PYTHON = gluster.py glustertypes.py glusterstack.py
-
-pythondir = $(xlatordir)/python
-
-python_so_SOURCES = python.c
-
-AM_CFLAGS = -fPIC $(GF_CPPFLAGS) -Wall \
- -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \
- $(PYTHON_CPPLAGS) -DGLUSTER_PYTHON_PATH=\"$(pythondir)\"
-
-AM_LDFLAGS = $(PYTHON_LDFLAGS)
-
-CLEANFILES =
-
diff --git a/xlators/bindings/python/src/gluster.py b/xlators/bindings/python/src/gluster.py
deleted file mode 100644
index 337c983ec..000000000
--- a/xlators/bindings/python/src/gluster.py
+++ /dev/null
@@ -1,40 +0,0 @@
-
-# Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com>
-# This file is part of GlusterFS.
-#
-# This file is licensed to you under your choice of the GNU Lesser
-# General Public License, version 3 or any later version (LGPLv3 or
-# later), or the GNU General Public License, version 2 (GPLv2), in all
-# cases as published by the Free Software Foundation.
-
-from ctypes import *
-from glustertypes import *
-from glusterstack import *
-import sys
-import inspect
-
-libglusterfs = CDLL("libglusterfs.so")
-_gf_log = libglusterfs._gf_log
-_gf_log.restype = c_int32
-_gf_log.argtypes = [c_char_p, c_char_p, c_char_p, c_int32, c_int, c_char_p]
-
-gf_log_loglevel = c_int.in_dll(libglusterfs, "gf_log_loglevel")
-
-GF_LOG_NONE = 0
-GF_LOG_CRITICAL = 1
-GF_LOG_ERROR = 2
-GF_LOG_WARNING = 3
-GF_LOG_DEBUG = 4
-
-def gf_log(module, level, fmt, *params):
- if level <= gf_log_loglevel:
- frame = sys._getframe(1)
- _gf_log(module, frame.f_code.co_filename, frame.f_code.co_name,
- frame.f_lineno, level, fmt, *params)
-
-class ComplexTranslator(object):
- def __init__(self, xlator):
- self.xlator = xlator_t.from_address(xlator)
-
- def __getattr__(self, item):
- return getattr(self.xlator, item)
diff --git a/xlators/bindings/python/src/glusterstack.py b/xlators/bindings/python/src/glusterstack.py
deleted file mode 100644
index 0c071ae98..000000000
--- a/xlators/bindings/python/src/glusterstack.py
+++ /dev/null
@@ -1,48 +0,0 @@
-
-# Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com>
-# This file is part of GlusterFS.
-#
-# This file is licensed to you under your choice of the GNU Lesser
-# General Public License, version 3 or any later version (LGPLv3 or
-# later), or the GNU General Public License, version 2 (GPLv2), in all
-# cases as published by the Free Software Foundation.
-
-from ctypes import *
-from glustertypes import *
-
-libc = CDLL("libc.so.6")
-calloc = libc.calloc
-calloc.argtypes = [c_int, c_int]
-calloc.restype = c_void_p
-
-# TODO: Can these be done in C somehow?
-def stack_wind(frame, rfn, obj, fn, *params):
- """Frame is a frame object"""
- _new = cast(calloc(1, sizeof(call_frame_t)), POINTER(call_frame_t))
- _new[0].root = frame.root
- _new[0].next = frame.root[0].frames.next
- _new[0].prev = pointer(frame.root[0].frames)
- if frame.root[0].frames.next:
- frame.root[0].frames.next[0].prev = _new
- frame.root[0].frames.next = _new
- _new[0].this = obj
- # TODO: Type checking like tmp_cbk?
- _new[0].ret = rfn
- _new[0].parent = pointer(frame)
- _new[0].cookie = cast(_new, c_void_p)
- # TODO: Initialize lock
- #_new.lock.init()
- frame.ref_count += 1
- fn(_new, obj, *params)
-
-def stack_unwind(frame, *params):
- """Frame is a frame object"""
- fn = frame[0].ret
- parent = frame[0].parent[0]
- parent.ref_count -= 1
-
- op_ret = params[0]
- op_err = params[1]
- params = params[2:]
- fn(parent, call_frame_t.from_address(frame[0].cookie), parent.this,
- op_ret, op_err, *params)
diff --git a/xlators/bindings/python/src/glustertypes.py b/xlators/bindings/python/src/glustertypes.py
deleted file mode 100644
index 98437d22e..000000000
--- a/xlators/bindings/python/src/glustertypes.py
+++ /dev/null
@@ -1,160 +0,0 @@
-
-# Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com>
-# This file is part of GlusterFS.
-#
-# This file is licensed to you under your choice of the GNU Lesser
-# General Public License, version 3 or any later version (LGPLv3 or
-# later), or the GNU General Public License, version 2 (GPLv2), in all
-# cases as published by the Free Software Foundation.
-
-from ctypes import *
-import collections
-
-#
-# Forward declaration of some gluster types
-#
-class call_frame_t(Structure):
- pass
-
-class call_ctx_t(Structure):
- pass
-
-class call_pool_t(Structure):
- pass
-
-class xlator_t(Structure):
- def _getFirstChild(self):
- return self.children[0].xlator
- firstChild = property(_getFirstChild)
-
-class xlator_list_t(Structure):
- pass
-
-class xlator_fops(Structure):
- pass
-
-class xlator_mops(Structure):
- pass
-
-class glusterfs_ctx_t(Structure):
- pass
-
-class list_head(Structure):
- pass
-
-class dict_t(Structure):
- pass
-
-class inode_table_t(Structure):
- pass
-
-class fd_t(Structure):
- pass
-
-class iovec(Structure):
- _fields_ = [
- ("iov_base", c_void_p),
- ("iov_len", c_size_t),
- ]
-
- def __init__(self, s):
- self.iov_base = cast(c_char_p(s), c_void_p)
- self.iov_len = len(s)
-
- def getBytes(self):
- return string_at(self.iov_base, self.iov_len)
-
-# This is a pthread_spinlock_t
-# TODO: what happens to volatile-ness?
-gf_lock_t = c_int
-
-uid_t = c_uint32
-gid_t = c_uint32
-pid_t = c_int32
-
-off_t = c_int64
-
-#
-# Function pointer types
-#
-ret_fn_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(call_frame_t),
- POINTER(xlator_t), c_int32, c_int32)
-
-fini_fn_t = CFUNCTYPE(None, POINTER(xlator_t))
-init_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t))
-event_notify_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t), c_int32, c_void_p)
-
-list_head._fields_ = [
- ("next", POINTER(list_head)),
- ("prev", POINTER(list_head)),
- ]
-
-call_frame_t._fields_ = [
- ("root", POINTER(call_ctx_t)),
- ("parent", POINTER(call_frame_t)),
- ("next", POINTER(call_frame_t)),
- ("prev", POINTER(call_frame_t)),
- ("local", c_void_p),
- ("this", POINTER(xlator_t)),
- ("ret", ret_fn_t),
- ("ref_count", c_int32),
- ("lock", gf_lock_t),
- ("cookie", c_void_p),
- ("op", c_int32),
- ("type", c_int8),
- ]
-
-call_ctx_t._fields_ = [
- ("all_frames", list_head),
- ("trans", c_void_p),
- ("pool", call_pool_t),
- ("unique", c_uint64),
- ("state", c_void_p),
- ("uid", uid_t),
- ("gid", gid_t),
- ("pid", pid_t),
- ("frames", call_frame_t),
- ("req_refs", POINTER(dict_t)),
- ("rsp_refs", POINTER(dict_t)),
- ]
-
-xlator_t._fields_ = [
- ("name", c_char_p),
- ("type", c_char_p),
- ("next", POINTER(xlator_t)),
- ("prev", POINTER(xlator_t)),
- ("parent", POINTER(xlator_t)),
- ("children", POINTER(xlator_list_t)),
- ("fops", POINTER(xlator_fops)),
- ("mops", POINTER(xlator_mops)),
- ("fini", fini_fn_t),
- ("init", init_fn_t),
- ("notify", event_notify_fn_t),
- ("options", POINTER(dict_t)),
- ("ctx", POINTER(glusterfs_ctx_t)),
- ("itable", POINTER(inode_table_t)),
- ("ready", c_char),
- ("private", c_void_p),
- ]
-
-xlator_list_t._fields_ = [
- ("xlator", POINTER(xlator_t)),
- ("next", POINTER(xlator_list_t)),
- ]
-
-fop_functions = collections.defaultdict(lambda: c_void_p)
-fop_function_names = ['lookup', 'forget', 'stat', 'fstat', 'chmod', 'fchmod',
- 'chown', 'fchown', 'truncate', 'ftruncate', 'utimens', 'access',
- 'readlink', 'mknod', 'mkdir', 'unlink', 'rmdir', 'symlink',
- 'rename', 'link', 'create', 'open', 'readv', 'writev', 'flush',
- 'close', 'fsync', 'opendir', 'readdir', 'closedir', 'fsyncdir',
- 'statfs', 'setxattr', 'getxattr', 'removexattr', 'lk', 'writedir',
- # TODO: Call backs?
- ]
-
-fop_writev_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(xlator_t),
- POINTER(fd_t), POINTER(iovec), c_int32,
- off_t)
-
-fop_functions['writev'] = fop_writev_t
-xlator_fops._fields_ = [(f, fop_functions[f]) for f in fop_function_names]
diff --git a/xlators/bindings/python/src/python.c b/xlators/bindings/python/src/python.c
deleted file mode 100644
index 9b96790de..000000000
--- a/xlators/bindings/python/src/python.c
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-#include <Python.h>
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include "glusterfs.h"
-#include "xlator.h"
-#include "logging.h"
-#include "defaults.h"
-
-typedef struct
-{
- char *scriptname;
- PyObject *pXlator;
- PyObject *pScriptModule;
- PyObject *pGlusterModule;
- PyThreadState *pInterp;
-
- PyObject *pFrameType, *pVectorType, *pFdType;
-} python_private_t;
-
-int32_t
-python_writev (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iovec *vector,
- int32_t count,
- off_t offset)
-{
- python_private_t *priv = (python_private_t *)this->private;
- gf_log("python", GF_LOG_DEBUG, "In writev");
- if (PyObject_HasAttrString(priv->pXlator, "writev"))
- {
-
- PyObject *retval = PyObject_CallMethod(priv->pXlator, "writev",
- "O O O i l",
- PyObject_CallMethod(priv->pFrameType, "from_address", "O&", PyLong_FromVoidPtr, frame),
- PyObject_CallMethod(priv->pFdType, "from_address", "O&", PyLong_FromVoidPtr, fd),
- PyObject_CallMethod(priv->pVectorType, "from_address", "O&", PyLong_FromVoidPtr, vector),
- count,
- offset);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- }
- Py_XDECREF(retval);
- }
- else
- {
- return default_writev(frame, this, fd, vector, count, offset);
- }
- return 0;
-}
-
-struct xlator_fops fops = {
- .writev = python_writev
-};
-
-static PyObject *
-AnonModule_FromFile (const char* fname)
-{
- // Get the builtins
- PyThreadState* pThread = PyThreadState_Get();
- PyObject *pBuiltins = pThread->interp->builtins;
-
- if (PyErr_Occurred())
- {
- PyErr_Print();
- return NULL;
- }
-
- // Create a new dictionary for running code in
- PyObject *pModuleDict = PyDict_New();
- PyDict_SetItemString(pModuleDict, "__builtins__", pBuiltins);
- Py_INCREF(pBuiltins);
-
- // Run the file in the new context
- FILE* fp = fopen(fname, "r");
- PyRun_File(fp, fname, Py_file_input, pModuleDict, pModuleDict);
- fclose(fp);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- return NULL;
- }
-
- // Create an object to hold the new context
- PyRun_String("class ModuleWrapper(object):\n\tpass\n", Py_single_input, pModuleDict, pModuleDict);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- return NULL;
- }
- PyObject *pModule = PyRun_String("ModuleWrapper()", Py_eval_input, pModuleDict, pModuleDict);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- Py_XDECREF(pModule);
- return NULL;
- }
-
- // Set the new context's dictionary to the one we used to run the code
- // inside
- PyObject_SetAttrString(pModule, "__dict__", pModuleDict);
- if (PyErr_Occurred())
- {
- PyErr_Print();
- Py_DECREF(pModuleDict);
- Py_DECREF(pBuiltins);
- Py_DECREF(pModule);
- return NULL;
- }
-
- return pModule;
-}
-
-int32_t
-init (xlator_t *this)
-{
- // This is ok to call more than once per process
- Py_InitializeEx(0);
-
- if (!this->children) {
- gf_log ("python", GF_LOG_ERROR,
- "FATAL: python should have exactly one child");
- return -1;
- }
-
- python_private_t *priv = CALLOC (sizeof (python_private_t), 1);
- ERR_ABORT (priv);
-
- data_t *scriptname = dict_get (this->options, "scriptname");
- if (scriptname) {
- priv->scriptname = data_to_str(scriptname);
- } else {
- gf_log("python", GF_LOG_ERROR,
- "FATAL: python requires the scriptname parameter");
- return -1;
- }
-
- priv->pInterp = Py_NewInterpreter();
-
- // Adjust python's path
- PyObject *syspath = PySys_GetObject("path");
- PyObject *path = PyString_FromString(GLUSTER_PYTHON_PATH);
- PyList_Append(syspath, path);
- Py_DECREF(path);
-
- gf_log("python", GF_LOG_DEBUG,
- "Loading gluster module");
-
- priv->pGlusterModule = PyImport_ImportModule("gluster");
- if (PyErr_Occurred())
- {
- PyErr_Print();
- return -1;
- }
-
- priv->pFrameType = PyObject_GetAttrString(priv->pGlusterModule, "call_frame_t");
- priv->pFdType = PyObject_GetAttrString(priv->pGlusterModule, "fd_t");
- priv->pVectorType = PyObject_GetAttrString(priv->pGlusterModule, "iovec");
-
- gf_log("python", GF_LOG_DEBUG, "Loading script...%s", priv->scriptname);
-
- priv->pScriptModule = AnonModule_FromFile(priv->scriptname);
- if (!priv->pScriptModule || PyErr_Occurred())
- {
- gf_log("python", GF_LOG_ERROR, "Error loading %s", priv->scriptname);
- PyErr_Print();
- return -1;
- }
-
- if (!PyObject_HasAttrString(priv->pScriptModule, "xlator"))
- {
- gf_log("python", GF_LOG_ERROR, "%s does not have a xlator attribute", priv->scriptname);
- return -1;
- }
- gf_log("python", GF_LOG_DEBUG, "Instantiating translator");
- priv->pXlator = PyObject_CallMethod(priv->pScriptModule, "xlator", "O&",
- PyLong_FromVoidPtr, this);
- if (PyErr_Occurred() || !priv->pXlator)
- {
- PyErr_Print();
- return -1;
- }
-
- this->private = priv;
-
- gf_log ("python", GF_LOG_DEBUG, "python xlator loaded");
- return 0;
-}
-
-void
-fini (xlator_t *this)
-{
- python_private_t *priv = (python_private_t*)(this->private);
- Py_DECREF(priv->pXlator);
- Py_DECREF(priv->pScriptModule);
- Py_DECREF(priv->pGlusterModule);
- Py_DECREF(priv->pFrameType);
- Py_DECREF(priv->pFdType);
- Py_DECREF(priv->pVectorType);
- Py_EndInterpreter(priv->pInterp);
- return;
-}
diff --git a/xlators/bindings/python/src/testxlator.py b/xlators/bindings/python/src/testxlator.py
deleted file mode 100644
index 59a991dca..000000000
--- a/xlators/bindings/python/src/testxlator.py
+++ /dev/null
@@ -1,49 +0,0 @@
-"""
- Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-"""
-
-"""
-This is a test translator written in python.
-
-Important things to note:
- This file must be import-able from glusterfsd. This probably means
- setting PYTHONPATH to where this file is located.
-
- This file must have a top-level xlator class object that will be
- used to instantiate individual translators.
-"""
-from gluster import *
-
-class MyXlator(ComplexTranslator):
- name = "MyXlator"
- def writev_cbk(self, frame, cookie, op_ret, op_errno, buf):
- stack_unwind(frame, op_ret, op_errno, buf)
- return 0
-
- def writev(self, frame, fd, vector, count, offset):
- gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len)
- # TODO: Use cookie to pass this to writev_cbk
- old_count = vector.iov_len
-
- data = vector.getBytes().encode("zlib")
-
- vector = iovec(data)
- gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len)
-
- @ret_fn_t
- def rfn(frame, prev, this, op_ret, op_errno, *params):
- if len(params) == 0:
- params = [0]
- return self.writev_cbk(frame, prev, old_count, op_errno, *params)
-
- stack_wind(frame, rfn, self.firstChild,
- self.firstChild[0].fops[0].writev, fd, vector, count, offset)
- return 0
-
-xlator = MyXlator
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am
index 0990822a7..6e883e565 100644
--- a/xlators/cluster/Makefile.am
+++ b/xlators/cluster/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = stripe afr dht
+SUBDIRS = stripe afr dht nsr-server nsr-recon nsr-client
CLEANFILES =
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am
index ef6bf9786..ea5a90abb 100644
--- a/xlators/cluster/afr/src/Makefile.am
+++ b/xlators/cluster/afr/src/Makefile.am
@@ -2,24 +2,26 @@ xlator_LTLIBRARIES = afr.la pump.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \
- afr-inode-write.c afr-open.c afr-transaction.c afr-self-heal-data.c \
- afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c \
- afr-self-heal-algorithm.c afr-lk-common.c afr-self-heald.c \
+ afr-inode-write.c afr-open.c afr-transaction.c afr-lk-common.c \
+ afr-read-txn.c \
$(top_builddir)/xlators/lib/src/libxlator.c
-afr_la_LDFLAGS = -module -avoidversion
-afr_la_SOURCES = $(afr_common_source) afr.c
+AFR_SELFHEAL_SOURCES = afr-self-heal-common.c afr-self-heal-data.c \
+ afr-self-heal-entry.c afr-self-heal-metadata.c afr-self-heald.c \
+ afr-self-heal-name.c
+
+afr_la_LDFLAGS = -module -avoid-version
+afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c
afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-pump_la_LDFLAGS = -module -avoidversion
-pump_la_SOURCES = $(afr_common_source) pump.c
+pump_la_LDFLAGS = -module -avoid-version
+pump_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) pump.c
pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \
- afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h \
- afr-self-heal-algorithm.h pump.h afr-mem-types.h afr-common.c \
- afr-self-heald.h $(top_builddir)/xlators/lib/src/libxlator.h \
- $(top_builddir)/glusterfsd/src/glusterfsd.h
+ afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \
+ afr-common.c afr-self-heald.h pump.h \
+ $(top_builddir)/xlators/lib/src/libxlator.h
AM_CPPFLAGS = $(GF_CPPFLAGS) \
-I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
@@ -31,7 +33,6 @@ CLEANFILES =
uninstall-local:
rm -f $(DESTDIR)$(xlatordir)/replicate.so
- rm -f $(DESTDIR)$(xlatordir)/pump.so
install-data-hook:
ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 21272c0d7..164a651ba 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -45,541 +45,693 @@
#include "afr-dir-write.h"
#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
#include "afr-self-heald.h"
-#include "pump.h"
-#define AFR_ICTX_OPENDIR_DONE_MASK 0x0000000200000000ULL
-#define AFR_ICTX_SPLIT_BRAIN_MASK 0x0000000100000000ULL
-#define AFR_ICTX_READ_CHILD_MASK 0x00000000FFFFFFFFULL
-int
-afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this,
- gf_boolean_t fail_conflict);
-void
-afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count)
+call_frame_t *
+afr_copy_frame (call_frame_t *base)
{
- int i = 0;
+ afr_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ int op_errno = 0;
+
+ frame = copy_frame (base);
+ if (!frame)
+ return NULL;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local) {
+ AFR_STACK_DESTROY (frame);
+ return NULL;
+ }
- for (i = 0; i < child_count; i++)
- dst[i] = src[i];
+ return frame;
}
-void
-afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path)
+/*
+ * INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS:
+ *
+ * |<---------- 64bit ------------>|
+ * 63 32 31 16 15 0
+ * | EVENT_GEN | DATA | METADATA |
+ *
+ *
+ * METADATA (bit-0 .. bit-15): bitmap representing subvolumes from which
+ * metadata can be attempted to be read.
+ *
+ * bit-0 => priv->subvolumes[0]
+ * bit-1 => priv->subvolumes[1]
+ * ... etc. till bit-15
+ *
+ * DATA (bit-16 .. bit-31): bitmap representing subvolumes from which data
+ * can be attempted to be read.
+ *
+ * bit-16 => priv->subvolumes[0]
+ * bit-17 => priv->subvolumes[1]
+ * ... etc. till bit-31
+ *
+ * EVENT_GEN (bit-32 .. bit-63): event generation (i.e priv->event_generation)
+ * when DATA and METADATA was last updated.
+ *
+ * If EVENT_GEN is < priv->event_generation,
+ * or is 0, it means afr_inode_refresh() needs
+ * to be called to recalculate the bitmaps.
+ */
+
+int
+__afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int *event_p)
{
- int i = 0;
- afr_private_t *priv = NULL;
- int ret = 0;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint32_t event = 0;
+ uint64_t val = 0;
+ int i = 0;
- priv = this->private;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_uint64 (xattr_req, priv->pending_key[i],
- 3 * sizeof(int32_t));
- if (ret < 0)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: Unable to set dict value for %s",
- path, priv->pending_key[i]);
- /* 3 = data+metadata+entry */
- }
- ret = dict_set_int32 (xattr_req, GF_GFIDLESS_LOOKUP, 1);
- if (ret) {
- gf_log (this->name, GF_LOG_DEBUG, "%s: failed to set gfidless "
- "lookup", path);
- }
+ ret = __inode_ctx_get (inode, this, &val);
+ if (ret < 0)
+ return ret;
+
+ metadatamap = (val & 0x000000000000ffff);
+ datamap = (val & 0x00000000ffff0000) >> 16;
+ event = (val & 0xffffffff00000000) >> 32;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (metadata)
+ metadata[i] = (metadatamap >> i) & 1;
+ if (data)
+ data[i] = (datamap >> i) & 1;
+ }
+
+ if (event_p)
+ *event_p = event;
+ return ret;
}
+
int
-afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this,
- dict_t *xattr_req, loc_t *loc, void **gfid_req)
+__afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int event)
{
- int ret = -ENOMEM;
+ afr_private_t *priv = NULL;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint64_t val = 0;
+ int i = 0;
- GF_ASSERT (gfid_req);
+ priv = this->private;
- *gfid_req = NULL;
- local->xattr_req = dict_new ();
- if (!local->xattr_req)
- goto out;
- if (xattr_req)
- dict_copy (xattr_req, local->xattr_req);
+ for (i = 0; i < priv->child_count; i++) {
+ if (data[i])
+ datamap |= (1 << i);
+ if (metadata[i])
+ metadatamap |= (1 << i);
+ }
- afr_xattr_req_prepare (this, local->xattr_req, loc->path);
- ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: Unable to set dict value for %s",
- loc->path, GLUSTERFS_INODELK_COUNT);
- }
- ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: Unable to set dict value for %s",
- loc->path, GLUSTERFS_ENTRYLK_COUNT);
- }
+ val = ((uint64_t) metadatamap) |
+ (((uint64_t) datamap) << 16) |
+ (((uint64_t) event) << 32);
- ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: Unable to set dict value for %s",
- loc->path, GLUSTERFS_PARENT_ENTRYLK);
- }
-
- ret = dict_get_ptr (local->xattr_req, "gfid-req", gfid_req);
- if (ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s: failed to get the gfid from dict", loc->path);
- *gfid_req = NULL;
- } else {
- if (loc->parent != NULL)
- dict_del (local->xattr_req, "gfid-req");
- }
- ret = 0;
-out:
- return ret;
+ return __inode_ctx_set (inode, this, &val);
}
-void
-afr_lookup_save_gfid (uuid_t dst, void* new, const loc_t *loc)
+
+int
+__afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this)
{
- inode_t *inode = NULL;
-
- inode = loc->inode;
- if (inode && !uuid_is_null (inode->gfid))
- uuid_copy (dst, inode->gfid);
- else if (!uuid_is_null (loc->gfid))
- uuid_copy (dst, loc->gfid);
- else if (new && !uuid_is_null (new))
- uuid_copy (dst, new);
+ int ret = -1;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint32_t event = 0;
+ uint64_t val = 0;
+
+ ret = __inode_ctx_get (inode, this, &val);
+ (void) ret;
+
+ metadatamap = (val & 0x000000000000ffff) >> 0;
+ datamap = (val & 0x00000000ffff0000) >> 16;
+ event = 0;
+
+ val = ((uint64_t) metadatamap) |
+ (((uint64_t) datamap) << 16) |
+ (((uint64_t) event) << 32);
+
+ return __inode_ctx_set (inode, this, &val);
}
+
int
-afr_errno_count (int32_t *children, int *child_errno,
- unsigned int child_count, int32_t op_errno)
+__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int *event_p)
{
- int i = 0;
- int errno_count = 0;
- int child = 0;
+ afr_private_t *priv = NULL;
+ int ret = -1;
- for (i = 0; i < child_count; i++) {
- if (children) {
- child = children[i];
- if (child == -1)
- break;
- } else {
- child = i;
- }
- if (child_errno[child] == op_errno)
- errno_count++;
- }
- return errno_count;
+ priv = this->private;
+
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_get_small (inode, this, data,
+ metadata, event_p);
+ else
+ /* TBD: allocate structure with array and read from it */
+ ret = -1;
+
+ return ret;
}
-int32_t
-afr_set_dict_gfid (dict_t *dict, uuid_t gfid)
+
+int
+__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int event)
{
- int ret = 0;
- uuid_t *pgfid = NULL;
+ afr_private_t *priv = NULL;
+ int ret = -1;
- GF_ASSERT (gfid);
+ priv = this->private;
- pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char);
- if (!pgfid) {
- ret = -1;
- goto out;
- }
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_set_small (inode, this, data,
+ metadata, event);
+ else
+ ret = -1;
- uuid_copy (*pgfid, gfid);
+ return ret;
+}
- ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t));
- if (ret)
- gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed");
-out:
- if (ret && pgfid)
- GF_FREE (pgfid);
+int
+__afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
- return ret;
+ priv = this->private;
+
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_reset_small (inode, this);
+ else
+ ret = -1;
+
+ return ret;
}
-afr_inode_ctx_t*
-afr_inode_ctx_get_from_addr (uint64_t addr, int32_t child_count)
+
+int
+afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int *event_p)
{
- int ret = -1;
- afr_inode_ctx_t *ctx = NULL;
- size_t size = 0;
+ int ret = -1;
- GF_ASSERT (child_count > 0);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_get (inode, this, data,
+ metadata, event_p);
+ }
+ UNLOCK(&inode->lock);
- if (!addr) {
- ctx = GF_CALLOC (1, sizeof (*ctx),
- gf_afr_mt_inode_ctx_t);
- if (!ctx)
- goto out;
- size = sizeof (*ctx->fresh_children);
- ctx->fresh_children = GF_CALLOC (child_count, size,
- gf_afr_mt_int32_t);
- if (!ctx->fresh_children)
- goto out;
- } else {
- ctx = (afr_inode_ctx_t*) (long) addr;
- }
- ret = 0;
-out:
- if (ret && ctx) {
- GF_FREE (ctx->fresh_children);
- GF_FREE (ctx);
- ctx = NULL;
- }
- return ctx;
+ return ret;
}
-void
-afr_inode_get_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params)
+
+int
+afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int event)
{
- GF_ASSERT (inode);
- GF_ASSERT (params);
+ int ret = -1;
- int ret = 0;
- afr_inode_ctx_t *ctx = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- uint64_t ctx_addr = 0;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_set (inode, this, data, metadata,
+ event);
+ }
+ UNLOCK(&inode->lock);
- priv = this->private;
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx_addr);
- if (ret < 0)
- goto unlock;
- ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count);
- if (!ctx)
- goto unlock;
- switch (params->op) {
- case AFR_INODE_GET_READ_CTX:
- fresh_children = params->u.read_ctx.children;
- read_child = (int32_t)(ctx->masks &
- AFR_ICTX_READ_CHILD_MASK);
- params->u.read_ctx.read_child = read_child;
- if (!fresh_children)
- goto unlock;
- for (i = 0; i < priv->child_count; i++)
- fresh_children[i] = ctx->fresh_children[i];
- break;
- case AFR_INODE_GET_OPENDIR_DONE:
- params->u.value = _gf_false;
- if (ctx->masks & AFR_ICTX_OPENDIR_DONE_MASK)
- params->u.value = _gf_true;
- break;
- case AFR_INODE_GET_SPLIT_BRAIN:
- params->u.value = _gf_false;
- if (ctx->masks & AFR_ICTX_SPLIT_BRAIN_MASK)
- params->u.value = _gf_true;
- ;
- break;
- default:
- GF_ASSERT (0);
- break;
- }
- }
-unlock:
- UNLOCK (&inode->lock);
+ return ret;
}
-gf_boolean_t
-afr_is_split_brain (xlator_t *this, inode_t *inode)
+
+int
+afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this)
{
- afr_inode_params_t params = {0};
+ int ret = -1;
- params.op = AFR_INODE_GET_SPLIT_BRAIN;
- afr_inode_get_ctx (this, inode, &params);
- return params.u.value;
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_reset (inode, this);
+ }
+ UNLOCK(&inode->lock);
+
+ return ret;
}
-gf_boolean_t
-afr_is_opendir_done (xlator_t *this, inode_t *inode)
+
+int
+afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused,
+ afr_transaction_type type)
{
- afr_inode_params_t params = {0};
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int idx = afr_index_for_transaction_type (type);
+ void *pending_raw = NULL;
+ int pending[3];
+ int ret = 0;
+
+ priv = this->private;
- params.op = AFR_INODE_GET_OPENDIR_DONE;
- afr_inode_get_ctx (this, inode, &params);
- return params.u.value;
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_get_ptr (xdata, priv->pending_key[i],
+ &pending_raw);
+ if (ret) /* no pending flags */
+ continue;
+ memcpy (pending, pending_raw, sizeof(pending));
+
+ if (ntoh32 (pending[idx]))
+ accused[i] = 1;
+ }
+
+ return 0;
}
-int32_t
-afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children)
+int
+afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies,
+ unsigned char *data_accused)
{
- afr_inode_params_t params = {0};
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t maxsize = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i])
+ continue;
+ if (replies[i].poststat.ia_size > maxsize)
+ maxsize = replies[i].poststat.ia_size;
+ }
- params.op = AFR_INODE_GET_READ_CTX;
- params.u.read_ctx.children = fresh_children;
- afr_inode_get_ctx (this, inode, &params);
- return params.u.read_ctx.read_child;
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i])
+ continue;
+ if (replies[i].poststat.ia_size < maxsize)
+ data_accused[i] = 1;
+ }
+
+ return 0;
}
-void
-afr_inode_ctx_set_read_child (afr_inode_ctx_t *ctx, int32_t read_child)
-{
- uint64_t remaining_mask = 0;
- uint64_t mask = 0;
- remaining_mask = (~AFR_ICTX_READ_CHILD_MASK & ctx->masks);
- mask = (AFR_ICTX_READ_CHILD_MASK & read_child);
- ctx->masks = remaining_mask | mask;
+int
+afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int event_generation = 0;
+ int i = 0;
+ unsigned char *data_accused = NULL;
+ unsigned char *metadata_accused = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ priv = this->private;
+ replies = local->replies;
+ event_generation = local->event_generation;
+
+ data_accused = alloca0 (priv->child_count);
+ data_readable = alloca0 (priv->child_count);
+ metadata_accused = alloca0 (priv->child_count);
+ metadata_readable = alloca0 (priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ data_readable[i] = 1;
+ metadata_readable[i] = 1;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid) {
+ data_readable[i] = 0;
+ metadata_readable[i] = 0;
+ continue;
+ }
+
+ if (replies[i].op_ret == -1) {
+ data_readable[i] = 0;
+ metadata_readable[i] = 0;
+ continue;
+ }
+
+ afr_accused_fill (this, replies[i].xdata, data_accused,
+ (inode->ia_type == IA_IFDIR) ?
+ AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION);
+
+ afr_accused_fill (this, replies[i].xdata,
+ metadata_accused, AFR_METADATA_TRANSACTION);
+
+ }
+
+ if (inode->ia_type != IA_IFDIR)
+ afr_accuse_smallfiles (this, replies, data_accused);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i]) {
+ data_readable[i] = 0;
+ ret = 1;
+ }
+ if (metadata_accused[i]) {
+ metadata_readable[i] = 0;
+ ret = 1;
+ }
+ }
+
+ afr_inode_read_subvol_set (inode, this, data_readable,
+ metadata_readable, event_generation);
+ return ret;
}
-void
-afr_inode_ctx_set_read_ctx (afr_inode_ctx_t *ctx, int32_t read_child,
- int32_t *fresh_children, int32_t child_count)
+
+
+int
+afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque)
{
- int i = 0;
-
- afr_inode_ctx_set_read_child (ctx, read_child);
- for (i = 0; i < child_count; i++) {
- if (fresh_children)
- ctx->fresh_children[i] = fresh_children[i];
- else
- ctx->fresh_children[i] = -1;
- }
+ if (heal)
+ STACK_DESTROY (heal->root);
+ return 0;
}
-void
-afr_inode_ctx_rm_stale_children (afr_inode_ctx_t *ctx, int32_t *stale_children,
- int32_t child_count)
+int
+afr_inode_refresh_err (call_frame_t *frame, xlator_t *this)
{
- int i = 0;
- int32_t read_child = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int err = 0;
- GF_ASSERT (stale_children);
- for (i = 0; i < child_count; i++) {
- if (stale_children[i] == -1)
- break;
- afr_children_rm_child (ctx->fresh_children,
- stale_children[i], child_count);
- }
- read_child = (int32_t)(ctx->masks & AFR_ICTX_READ_CHILD_MASK);
- if (!afr_is_child_present (ctx->fresh_children, child_count,
- read_child))
- afr_inode_ctx_set_read_child (ctx, ctx->fresh_children[0]);
-}
+ local = frame->local;
+ priv = this->private;
-void
-afr_inode_ctx_set_opendir_done (afr_inode_ctx_t *ctx)
-{
- uint64_t remaining_mask = 0;
- uint64_t mask = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && !local->replies[i].op_ret) {
+ err = 0;
+ goto ret;
+ }
+ }
- remaining_mask = (~AFR_ICTX_OPENDIR_DONE_MASK & ctx->masks);
- mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_OPENDIR_DONE_MASK);
- ctx->masks = remaining_mask | mask;
+ err = afr_final_errno (local, priv);
+ret:
+ return -err;
}
-void
-afr_inode_ctx_set_splitbrain (afr_inode_ctx_t *ctx, gf_boolean_t set)
+
+int
+afr_refresh_selfheal_wrap (void *opaque)
{
- uint64_t remaining_mask = 0;
- uint64_t mask = 0;
+ call_frame_t *frame = opaque;
+ afr_local_t *local = NULL;
+ xlator_t *this = NULL;
+ int err = 0;
- if (set) {
- remaining_mask = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks);
- mask = (0xFFFFFFFFFFFFFFFFULL & AFR_ICTX_SPLIT_BRAIN_MASK);
- ctx->masks = remaining_mask | mask;
- } else {
- ctx->masks = (~AFR_ICTX_SPLIT_BRAIN_MASK & ctx->masks);
- }
+ local = frame->local;
+ this = frame->this;
+
+ afr_selfheal (frame->this, local->refreshinode->gfid);
+
+ afr_selfheal_unlocked_discover (frame, local->refreshinode,
+ local->refreshinode->gfid,
+ local->replies);
+
+ afr_replies_interpret (frame, this, local->refreshinode);
+
+ err = afr_inode_refresh_err (frame, this);
+
+ afr_replies_wipe (local, this->private);
+
+ local->refreshfn (frame, this, err);
+
+ return 0;
}
-void
-afr_inode_set_ctx (xlator_t *this, inode_t *inode, afr_inode_params_t *params)
+
+gf_boolean_t
+afr_selfheal_enabled (xlator_t *this)
{
- GF_ASSERT (inode);
- GF_ASSERT (params);
+ afr_private_t *priv = NULL;
+ gf_boolean_t data = _gf_false;
- int ret = 0;
- afr_inode_ctx_t *ctx = NULL;
- afr_private_t *priv = NULL;
- uint64_t ctx_addr = 0;
- gf_boolean_t set = _gf_false;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
- int32_t *stale_children = NULL;
+ priv = this->private;
- priv = this->private;
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx_addr);
- if (ret < 0)
- ctx_addr = 0;
- ctx = afr_inode_ctx_get_from_addr (ctx_addr, priv->child_count);
- if (!ctx)
- goto unlock;
- switch (params->op) {
- case AFR_INODE_SET_READ_CTX:
- read_child = params->u.read_ctx.read_child;
- fresh_children = params->u.read_ctx.children;
- afr_inode_ctx_set_read_ctx (ctx, read_child,
- fresh_children,
- priv->child_count);
- break;
- case AFR_INODE_RM_STALE_CHILDREN:
- stale_children = params->u.read_ctx.children;
- afr_inode_ctx_rm_stale_children (ctx,
- stale_children,
- priv->child_count);
- break;
- case AFR_INODE_SET_OPENDIR_DONE:
- afr_inode_ctx_set_opendir_done (ctx);
- break;
- case AFR_INODE_SET_SPLIT_BRAIN:
- set = params->u.value;
- afr_inode_ctx_set_splitbrain (ctx, set);
- break;
- default:
- GF_ASSERT (0);
- break;
- }
- ret = __inode_ctx_put (inode, this, (uint64_t)ctx);
- if (ret) {
- gf_log_callingfn (this->name, GF_LOG_ERROR, "failed to "
- "set the inode ctx (%s)",
- uuid_utoa (inode->gfid));
- }
- }
-unlock:
- UNLOCK (&inode->lock);
+ gf_string2boolean (priv->data_self_heal, &data);
+
+ return data || priv->metadata_self_heal || priv->entry_self_heal;
}
-void
-afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set)
+
+
+int
+afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)
{
- afr_inode_params_t params = {0};
+ call_frame_t *heal = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+ int err = 0;
+
+ local = frame->local;
- params.op = AFR_INODE_SET_SPLIT_BRAIN;
- params.u.value = set;
- afr_inode_set_ctx (this, inode, &params);
+ ret = afr_replies_interpret (frame, this, local->refreshinode);
+
+ err = afr_inode_refresh_err (frame, this);
+
+ afr_replies_wipe (local, this->private);
+
+ if (ret && afr_selfheal_enabled (this)) {
+ heal = copy_frame (frame);
+ if (heal)
+ heal->root->pid = -1;
+ ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap,
+ afr_refresh_selfheal_done, heal, frame);
+ if (ret)
+ goto refresh_done;
+ } else {
+ refresh_done:
+ local->refreshfn (frame, this, err);
+ }
+
+ return 0;
}
-void
-afr_set_opendir_done (xlator_t *this, inode_t *inode)
+
+int
+afr_inode_refresh_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *par)
{
- afr_inode_params_t params = {0};
+ afr_local_t *local = NULL;
+ int call_child = (long) cookie;
+ int call_count = 0;
+
+ local = frame->local;
+
+ local->replies[call_child].valid = 1;
+ local->replies[call_child].op_ret = op_ret;
+ local->replies[call_child].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[call_child].poststat = *buf;
+ local->replies[call_child].postparent = *par;
+ local->replies[call_child].xdata = dict_ref (xdata);
+ }
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_inode_refresh_done (frame, this);
- params.op = AFR_INODE_SET_OPENDIR_DONE;
- afr_inode_set_ctx (this, inode, &params);
+ return 0;
}
-void
-afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child,
- int32_t *fresh_children)
+
+int
+afr_inode_refresh_subvol (call_frame_t *frame, xlator_t *this, int i,
+ inode_t *inode, dict_t *xdata)
{
- afr_inode_params_t params = {0};
- afr_private_t *priv = NULL;
+ loc_t loc = {0, };
+ afr_private_t *priv = NULL;
- priv = this->private;
- GF_ASSERT (read_child >= 0);
- GF_ASSERT (fresh_children);
- GF_ASSERT (afr_is_child_present (fresh_children, priv->child_count,
- read_child));
-
- params.op = AFR_INODE_SET_READ_CTX;
- params.u.read_ctx.read_child = read_child;
- params.u.read_ctx.children = fresh_children;
- afr_inode_set_ctx (this, inode, &params);
+ priv = this->private;
+
+ loc.inode = inode;
+ uuid_copy (loc.gfid, inode->gfid);
+
+ STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->lookup, &loc, xdata);
+ return 0;
}
-void
-afr_inode_rm_stale_children (xlator_t *this, inode_t *inode,
- int32_t *stale_children)
+
+int
+afr_inode_refresh_do (call_frame_t *frame, xlator_t *this)
{
- afr_inode_params_t params = {0};
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int i = 0;
+ dict_t *xdata = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ afr_replies_wipe (local, priv);
+
+ xdata = dict_new ();
+ if (!xdata) {
+ afr_inode_refresh_done (frame, this);
+ return 0;
+ }
+
+ if (afr_xattr_req_prepare (this, xdata) != 0) {
+ dict_unref (xdata);
+ afr_inode_refresh_done (frame, this);
+ return 0;
+ }
+
+ local->call_count = AFR_COUNT (local->child_up, priv->child_count);
+
+ call_count = local->call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
- GF_ASSERT (stale_children);
+ afr_inode_refresh_subvol (frame, this, i, local->refreshinode,
+ xdata);
- params.op = AFR_INODE_RM_STALE_CHILDREN;
- params.u.read_ctx.children = stale_children;
- afr_inode_set_ctx (this, inode, &params);
+ if (!--call_count)
+ break;
+ }
+
+ dict_unref (xdata);
+
+ return 0;
}
-gf_boolean_t
-afr_is_source_child (int32_t *sources, int32_t child_count, int32_t child)
+
+int
+afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_inode_refresh_cbk_t refreshfn)
{
- gf_boolean_t source_xattrs = _gf_false;
+ afr_local_t *local = NULL;
- GF_ASSERT (child < child_count);
+ local = frame->local;
- if ((child >= 0) && (child < child_count) &&
- sources[child]) {
- source_xattrs = _gf_true;
- }
- return source_xattrs;
+ local->refreshfn = refreshfn;
+
+ if (local->refreshinode) {
+ inode_unref (local->refreshinode);
+ local->refreshinode = NULL;
+ }
+
+ local->refreshinode = inode_ref (inode);
+
+ afr_inode_refresh_do (frame, this);
+
+ return 0;
}
-gf_boolean_t
-afr_is_child_present (int32_t *success_children, int32_t child_count,
- int32_t child)
+
+int
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req)
{
- gf_boolean_t success_child = _gf_false;
- int i = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
- GF_ASSERT (child < child_count);
+ priv = this->private;
- for (i = 0; i < child_count; i++) {
- if (success_children[i] == -1)
- break;
- if (child == success_children[i]) {
- success_child = _gf_true;
- break;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_uint64 (xattr_req, priv->pending_key[i],
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unable to set dict value for %s",
+ priv->pending_key[i]);
+ /* 3 = data+metadata+entry */
+ }
+ ret = dict_set_uint64 (xattr_req, AFR_DIRTY,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "failed to set dirty "
+ "query flag");
}
- return success_child;
+
+ return ret;
}
-gf_boolean_t
-afr_is_read_child (int32_t *success_children, int32_t *sources,
- int32_t child_count, int32_t child)
+int
+afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this,
+ dict_t *xattr_req, loc_t *loc)
{
- gf_boolean_t success_child = _gf_false;
- gf_boolean_t source = _gf_false;
+ int ret = -ENOMEM;
- if (child < 0) {
- return _gf_false;
- }
+ local->xattr_req = dict_new ();
+ if (!local->xattr_req)
+ goto out;
+ if (xattr_req)
+ dict_copy (xattr_req, local->xattr_req);
- GF_ASSERT (success_children);
- GF_ASSERT (child_count > 0);
+ ret = afr_xattr_req_prepare (this, local->xattr_req);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: Unable to prepare xattr_req", loc->path);
+ }
- success_child = afr_is_child_present (success_children, child_count,
- child);
- if (!success_child)
- goto out;
- if (NULL == sources) {
- source = _gf_true;
- goto out;
+ ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: Unable to set dict value for %s",
+ loc->path, GLUSTERFS_INODELK_COUNT);
}
- source = afr_is_source_child (sources, child_count, child);
+ ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: Unable to set dict value for %s",
+ loc->path, GLUSTERFS_ENTRYLK_COUNT);
+ }
+
+ ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: Unable to set dict value for %s",
+ loc->path, GLUSTERFS_PARENT_ENTRYLK);
+ }
+
+ ret = 0;
out:
- return (success_child && source);
+ return ret;
}
-int32_t
-afr_hash_child (int32_t *success_children, int32_t child_count,
- unsigned int hmode, uuid_t gfid)
+
+int
+afr_hash_child (inode_t *inode, int32_t child_count, int hashmode)
{
- uuid_t gfid_copy = {0,};
+ uuid_t gfid_copy = {0,};
pid_t pid;
- if (!hmode) {
+ if (!hashmode) {
return -1;
}
- if (gfid) {
- uuid_copy(gfid_copy,gfid);
+ if (inode) {
+ uuid_copy (gfid_copy, inode->gfid);
}
- if (hmode > 1) {
+
+ if (hashmode > 1) {
/*
* Why getpid? Because it's one of the cheapest calls
* available - faster than gethostname etc. - and returns a
@@ -597,261 +749,157 @@ afr_hash_child (int32_t *success_children, int32_t child_count,
sizeof(gfid_copy)) % child_count;
}
-/* If sources is NULL the xattrs are assumed to be of source for all
- * success_children.
- */
+
int
-afr_select_read_child_from_policy (int32_t *success_children,
- int32_t child_count, int32_t prev_read_child,
- int32_t config_read_child, int32_t *sources,
- unsigned int hmode, uuid_t gfid)
+afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
+ unsigned char *readable)
{
- int32_t read_child = -1;
- int i = 0;
+ afr_private_t *priv = NULL;
+ int read_subvol = -1;
+ int i = 0;
- GF_ASSERT (success_children);
+ priv = this->private;
- read_child = config_read_child;
- if (afr_is_read_child (success_children, sources, child_count,
- read_child))
- goto out;
-
- read_child = prev_read_child;
- if (afr_is_read_child (success_children, sources, child_count,
- read_child))
- goto out;
-
- read_child = afr_hash_child (success_children, child_count,
- hmode, gfid);
- if (afr_is_read_child (success_children, sources, child_count,
- read_child)) {
- goto out;
- }
+ /* first preference - explicitly specified or local subvolume */
+ if (priv->read_child >= 0 && readable[priv->read_child])
+ return priv->read_child;
- for (i = 0; i < child_count; i++) {
- read_child = success_children[i];
- if (read_child < 0)
- break;
- if (afr_is_read_child (success_children, sources, child_count,
- read_child))
- goto out;
- }
- read_child = -1;
+ /* second preference - use hashed mode */
+ read_subvol = afr_hash_child (inode, priv->child_count,
+ priv->hash_mode);
+ if (read_subvol >= 0 && readable[read_subvol])
+ return read_subvol;
-out:
- return read_child;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (readable[i])
+ return i;
+ }
-/* This function should be used when all the success_children are sources
- */
-void
-afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode,
- int32_t *fresh_children, int32_t prev_read_child,
- int32_t config_read_child, uuid_t gfid)
-{
- int read_child = -1;
- afr_private_t *priv = NULL;
+ /* no readable subvolumes, either split brain or all subvols down */
- priv = this->private;
- read_child = afr_select_read_child_from_policy (fresh_children,
- priv->child_count,
- prev_read_child,
- config_read_child,
- NULL,
- priv->hash_mode, gfid);
- if (read_child >= 0)
- afr_inode_set_read_ctx (this, inode, read_child,
- fresh_children);
+ return -1;
}
-/* afr_next_call_child ()
- * This is a common function used by all the read-type fops
- * This function should not be called with the inode's read_children array.
- * The fop's handler should make a copy of the inode's read_children,
- * preferred read_child into the local vars, because while this function is
- * in execution there is a chance for inode's read_ctx to change.
- */
-int32_t
-afr_next_call_child (int32_t *fresh_children, unsigned char *child_up,
- size_t child_count, int32_t *last_index,
- int32_t read_child)
-{
- int next_index = 0;
- int32_t next_call_child = -1;
- GF_ASSERT (last_index);
-
- next_index = *last_index;
-retry:
- next_index++;
- if ((next_index >= child_count) ||
- (fresh_children[next_index] == -1))
- goto out;
- if ((fresh_children[next_index] == read_child) ||
- (!child_up[fresh_children[next_index]]))
- goto retry;
- *last_index = next_index;
- next_call_child = fresh_children[next_index];
-out:
- return next_call_child;
-}
-
- /* This function should not be called with the inode's read_children array.
- * The fop's handler should make a copy of the inode's read_children,
- * preferred read_child into the local vars, because while this function is
- * in execution there is a chance for inode's read_ctx to change.
- */
-int32_t
-afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child,
- int32_t *fresh_children,
- int32_t *call_child, int32_t *last_index)
+int
+afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p,
+ int type)
{
- int ret = 0;
- afr_private_t *priv = NULL;
- int i = 0;
-
- GF_ASSERT (child_up);
- GF_ASSERT (call_child);
- GF_ASSERT (last_index);
- GF_ASSERT (fresh_children);
-
- if (read_child < 0) {
- ret = -EIO;
- goto out;
- }
- priv = this->private;
- *call_child = -1;
- *last_index = -1;
+ int ret = -1;
- if (child_up[read_child]) {
- *call_child = read_child;
- } else {
- for (i = 0; i < priv->child_count; i++) {
- if (fresh_children[i] == -1)
- break;
- if (child_up[fresh_children[i]]) {
- *call_child = fresh_children[i];
- ret = 0;
- break;
- }
- }
-
- if (*call_child == -1) {
- ret = -ENOTCONN;
- goto out;
- }
-
- *last_index = i;
- }
-out:
- gf_log (this->name, GF_LOG_DEBUG, "Returning %d, call_child: %d, "
- "last_index: %d", ret, *call_child, *last_index);
- return ret;
+ if (type == AFR_METADATA_TRANSACTION)
+ ret = afr_inode_read_subvol_get (inode, this, 0, readable,
+ event_p);
+ else
+ ret = afr_inode_read_subvol_get (inode, this, readable, 0,
+ event_p);
+ return ret;
}
-void
-afr_reset_xattr (dict_t **xattr, unsigned int child_count)
-{
- unsigned int i = 0;
-
- if (!xattr)
- goto out;
- for (i = 0; i < child_count; i++) {
- if (xattr[i]) {
- dict_unref (xattr[i]);
- xattr[i] = NULL;
- }
- }
-out:
- return;
-}
-void
-afr_local_sh_cleanup (afr_local_t *local, xlator_t *this)
+int
+afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
+ int *event_p, afr_transaction_type type)
{
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- sh = &local->self_heal;
- priv = this->private;
-
- GF_FREE (sh->buf);
-
- GF_FREE (sh->parentbufs);
-
- if (sh->inode)
- inode_unref (sh->inode);
-
- if (sh->xattr) {
- afr_reset_xattr (sh->xattr, priv->child_count);
- GF_FREE (sh->xattr);
- }
+ afr_private_t *priv = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ unsigned char *readable = NULL;
+ unsigned char *intersection = NULL;
+ int subvol = -1;
+ int event = 0;
- GF_FREE (sh->child_errno);
+ priv = this->private;
- afr_matrix_cleanup (sh->pending_matrix, priv->child_count);
- afr_matrix_cleanup (sh->delta_matrix, priv->child_count);
-
- GF_FREE (sh->sources);
-
- GF_FREE (sh->success);
-
- GF_FREE (sh->locked_nodes);
-
- if (sh->healing_fd) {
- fd_unref (sh->healing_fd);
- sh->healing_fd = NULL;
- }
+ readable = alloca0 (priv->child_count);
+ data_readable = alloca0 (priv->child_count);
+ metadata_readable = alloca0 (priv->child_count);
+ intersection = alloca0 (priv->child_count);
- GF_FREE ((char *)sh->linkname);
+ afr_inode_read_subvol_type_get (inode, this, readable, &event, type);
- GF_FREE (sh->success_children);
+ afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable,
+ &event);
- GF_FREE (sh->fresh_children);
+ AFR_INTERSECT (intersection, data_readable, metadata_readable,
+ priv->child_count);
- GF_FREE (sh->fresh_parent_dirs);
-
- loc_wipe (&sh->parent_loc);
- loc_wipe (&sh->lookup_loc);
-
- GF_FREE (sh->checksum);
-
- GF_FREE (sh->write_needed);
- if (sh->healing_fd)
- fd_unref (sh->healing_fd);
+ if (AFR_COUNT (intersection, priv->child_count) > 0)
+ subvol = afr_read_subvol_select_by_policy (inode, this,
+ intersection);
+ else
+ subvol = afr_read_subvol_select_by_policy (inode, this,
+ readable);
+ if (subvol_p)
+ *subvol_p = subvol;
+ if (event_p)
+ *event_p = event;
+ return subvol;
}
void
afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
{
- afr_private_t * priv = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
priv = this->private;
afr_matrix_cleanup (local->pending, priv->child_count);
- afr_matrix_cleanup (local->transaction.txn_changelog,
- priv->child_count);
GF_FREE (local->internal_lock.locked_nodes);
- GF_FREE (local->internal_lock.inode_locked_nodes);
-
- GF_FREE (local->internal_lock.entry_locked_nodes);
+ for (i = 0; local->internal_lock.inodelk[i].domain; i++) {
+ GF_FREE (local->internal_lock.inodelk[i].locked_nodes);
+ }
GF_FREE (local->internal_lock.lower_locked_nodes);
+ afr_entry_lockee_cleanup (&local->internal_lock);
GF_FREE (local->transaction.pre_op);
GF_FREE (local->transaction.eager_lock);
+ GF_FREE (local->transaction.fop_subvols);
+ GF_FREE (local->transaction.failed_subvols);
GF_FREE (local->transaction.basename);
GF_FREE (local->transaction.new_basename);
loc_wipe (&local->transaction.parent_loc);
loc_wipe (&local->transaction.new_parent_loc);
+
+}
+
+
+void
+afr_replies_wipe (afr_local_t *local, afr_private_t *priv)
+{
+ int i;
+
+ if (!local->replies)
+ return;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].xdata) {
+ dict_unref (local->replies[i].xdata);
+ local->replies[i].xdata = NULL;
+ }
+ }
+
+ memset (local->replies, 0, sizeof(*local->replies) * priv->child_count);
}
+void
+afr_remove_eager_lock_stub (afr_local_t *local)
+{
+ LOCK (&local->fd->lock);
+ {
+ list_del_init (&local->transaction.eager_locked);
+ }
+ UNLOCK (&local->fd->lock);
+}
void
afr_local_cleanup (afr_local_t *local, xlator_t *this)
@@ -861,7 +909,11 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
if (!local)
return;
- afr_local_sh_cleanup (local, this);
+ syncbarrier_destroy (&local->barrier);
+
+ if (local->transaction.eager_lock_on &&
+ !list_empty (&local->transaction.eager_locked))
+ afr_remove_eager_lock_stub (local);
afr_local_transaction_cleanup (local, this);
@@ -879,40 +931,26 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
if (local->dict)
dict_unref (local->dict);
- GF_FREE (local->child_up);
-
- GF_FREE (local->child_errno);
-
- GF_FREE (local->fresh_children);
+ afr_replies_wipe (local, priv);
+ GF_FREE(local->replies);
- GF_FREE (local->fd_open_on);
-
- { /* lookup */
- if (local->cont.lookup.xattrs) {
- afr_reset_xattr (local->cont.lookup.xattrs,
- priv->child_count);
- GF_FREE (local->cont.lookup.xattrs);
- local->cont.lookup.xattrs = NULL;
- }
+ GF_FREE (local->child_up);
- if (local->cont.lookup.xattr) {
- dict_unref (local->cont.lookup.xattr);
- }
+ GF_FREE (local->read_attempted);
- if (local->cont.lookup.inode) {
- inode_unref (local->cont.lookup.inode);
- }
+ GF_FREE (local->readable);
- GF_FREE (local->cont.lookup.postparents);
+ if (local->inode)
+ inode_unref (local->inode);
- GF_FREE (local->cont.lookup.bufs);
+ if (local->parent)
+ inode_unref (local->parent);
- GF_FREE (local->cont.lookup.success_children);
+ if (local->parent2)
+ inode_unref (local->parent2);
- GF_FREE (local->cont.lookup.sources);
- afr_matrix_cleanup (local->cont.lookup.pending_matrix,
- priv->child_count);
- }
+ if (local->refreshinode)
+ inode_unref (local->refreshinode);
{ /* getxattr */
GF_FREE (local->cont.getxattr.name);
@@ -946,6 +984,8 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this)
{ /* writev */
GF_FREE (local->cont.writev.vector);
+ if (local->cont.writev.iobref)
+ iobref_unref (local->cont.writev.iobref);
}
{ /* setxattr */
@@ -1007,1051 +1047,325 @@ afr_frame_return (call_frame_t *frame)
return call_count;
}
-int
-afr_set_elem_count_get (unsigned char *elems, int child_count)
-{
- int i = 0;
- int ret = 0;
-
- for (i = 0; i < child_count; i++)
- if (elems[i])
- ret++;
- return ret;
-}
-
-/**
- * up_children_count - return the number of children that are up
- */
-
-unsigned int
-afr_up_children_count (unsigned char *child_up, unsigned int child_count)
-{
- return afr_set_elem_count_get (child_up, child_count);
-}
-
-unsigned int
-afr_locked_children_count (unsigned char *children, unsigned int child_count)
-{
- return afr_set_elem_count_get (children, child_count);
-}
-
-unsigned int
-afr_pre_op_done_children_count (unsigned char *pre_op,
- unsigned int child_count)
-{
- return afr_set_elem_count_get (pre_op, child_count);
-}
gf_boolean_t
-afr_is_fresh_lookup (loc_t *loc, xlator_t *this)
-{
- uint64_t ctx = 0;
- int32_t ret = 0;
-
- GF_ASSERT (loc);
- GF_ASSERT (this);
- GF_ASSERT (loc->inode);
-
- ret = inode_ctx_get (loc->inode, this, &ctx);
- if (0 == ret)
- return _gf_false;
- return _gf_true;
-}
-
-void
-afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent)
-{
- GF_ASSERT (loc);
- GF_ASSERT (buf);
-
- uuid_copy (loc->gfid, buf->ia_gfid);
- if (postparent)
- uuid_copy (loc->pargfid, postparent->ia_gfid);
-}
-
-int
-afr_lookup_build_response_params (afr_local_t *local, xlator_t *this)
-{
- struct iatt *buf = NULL;
- struct iatt *postparent = NULL;
- dict_t **xattr = NULL;
- int32_t *success_children = NULL;
- int32_t *sources = NULL;
- afr_private_t *priv = NULL;
- int32_t read_child = -1;
- int ret = 0;
- int i = 0;
-
- GF_ASSERT (local);
-
- buf = &local->cont.lookup.buf;
- postparent = &local->cont.lookup.postparent;
- xattr = &local->cont.lookup.xattr;
- priv = this->private;
-
- read_child = afr_inode_get_read_ctx (this, local->cont.lookup.inode,
- local->fresh_children);
- if (read_child < 0) {
- ret = -1;
- goto out;
- }
- success_children = local->cont.lookup.success_children;
- sources = local->cont.lookup.sources;
- memset (sources, 0, sizeof (*sources) * priv->child_count);
- afr_children_intersection_get (local->fresh_children, success_children,
- sources, priv->child_count);
- if (!sources[read_child]) {
- read_child = -1;
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i]) {
- read_child = i;
- break;
- }
- }
- }
- if (read_child < 0) {
- ret = -1;
- goto out;
- }
- gf_log (this->name, GF_LOG_DEBUG, "Building lookup response from %d",
- read_child);
- if (!*xattr)
- *xattr = dict_ref (local->cont.lookup.xattrs[read_child]);
- *buf = local->cont.lookup.bufs[read_child];
- *postparent = local->cont.lookup.postparents[read_child];
-
- if (IA_INVAL == local->cont.lookup.inode->ia_type) {
- /* fix for RT #602 */
- local->cont.lookup.inode->ia_type = buf->ia_type;
- }
-out:
- return ret;
-}
-
-static void
-afr_lookup_update_lk_counts (afr_local_t *local, xlator_t *this,
- int child_index, dict_t *xattr)
+afr_is_entry_possibly_under_txn (afr_local_t *local, xlator_t *this)
{
- uint32_t inodelk_count = 0;
- uint32_t entrylk_count = 0;
- int ret = -1;
- uint32_t parent_entrylk = 0;
+ int i = 0;
+ int tmp = 0;
+ afr_private_t *priv = NULL;
- GF_ASSERT (local);
- GF_ASSERT (this);
- GF_ASSERT (xattr);
- GF_ASSERT (child_index >= 0);
+ priv = this->private;
- ret = dict_get_uint32 (xattr, GLUSTERFS_INODELK_COUNT,
- &inodelk_count);
- if (ret == 0)
- local->inodelk_count += inodelk_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].xdata)
+ continue;
+ if (dict_get_int32 (local->replies[i].xdata,
+ GLUSTERFS_PARENT_ENTRYLK,
+ &tmp) == 0)
+ if (tmp)
+ return _gf_true;
+ }
- ret = dict_get_uint32 (xattr, GLUSTERFS_ENTRYLK_COUNT,
- &entrylk_count);
- if (ret == 0)
- local->entrylk_count += entrylk_count;
- ret = dict_get_uint32 (xattr, GLUSTERFS_PARENT_ENTRYLK,
- &parent_entrylk);
- if (!ret)
- local->cont.lookup.parent_entrylk += parent_entrylk;
-}
-
-static void
-afr_lookup_set_self_heal_params_by_xattr (afr_local_t *local, xlator_t *this,
- dict_t *xattr)
-{
- GF_ASSERT (local);
- GF_ASSERT (this);
- GF_ASSERT (xattr);
-
- if (afr_sh_has_metadata_pending (xattr, this)) {
- local->self_heal.do_metadata_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_DEBUG,
- "metadata self-heal is pending for %s.",
- local->loc.path);
- }
-
- if (afr_sh_has_entry_pending (xattr, this)) {
- local->self_heal.do_entry_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_DEBUG,
- "entry self-heal is pending for %s.", local->loc.path);
- }
-
- if (afr_sh_has_data_pending (xattr, this)) {
- local->self_heal.do_data_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_DEBUG,
- "data self-heal is pending for %s.", local->loc.path);
- }
+ return _gf_false;
}
-void
-afr_lookup_check_set_metadata_split_brain (afr_local_t *local, xlator_t *this)
-{
- int32_t *sources = NULL;
- afr_private_t *priv = NULL;
- int32_t subvol_status = 0;
- int32_t *success_children = NULL;
- dict_t **xattrs = NULL;
- struct iatt *bufs = NULL;
- int32_t **pending_matrix = NULL;
-
- priv = this->private;
- sources = GF_CALLOC (priv->child_count, sizeof (*sources),
- gf_afr_mt_int32_t);
- if (NULL == sources)
- goto out;
- success_children = local->cont.lookup.success_children;
- xattrs = local->cont.lookup.xattrs;
- bufs = local->cont.lookup.bufs;
- pending_matrix = local->cont.lookup.pending_matrix;
- afr_build_sources (this, xattrs, bufs, pending_matrix,
- sources, success_children, AFR_METADATA_TRANSACTION,
- &subvol_status, _gf_false);
- if (subvol_status & SPLIT_BRAIN)
- local->cont.lookup.possible_spb = _gf_true;
-out:
- GF_FREE (sources);
-}
+/*
+ * Quota size xattrs are not maintained by afr. There is a
+ * possibility that they differ even when both the directory changelog xattrs
+ * suggest everything is fine. So if there is at least one 'source' check among
+ * the sources which has the maximum quota size. Otherwise check among all the
+ * available ones for maximum quota size. This way if there is a source and
+ * stale copies it always votes for the 'source'.
+ * */
static void
-afr_detect_self_heal_by_iatt (afr_local_t *local, xlator_t *this,
- struct iatt *buf, struct iatt *lookup_buf)
-{
- if (PERMISSION_DIFFERS (buf, lookup_buf)) {
- /* mismatching permissions */
- gf_log (this->name, GF_LOG_DEBUG,
- "permissions differ for %s ", local->loc.path);
- local->self_heal.do_metadata_self_heal = _gf_true;
- }
-
- if (OWNERSHIP_DIFFERS (buf, lookup_buf)) {
- /* mismatching permissions */
- local->self_heal.do_metadata_self_heal = _gf_true;
- gf_log (this->name, GF_LOG_DEBUG,
- "ownership differs for %s ", local->loc.path);
- }
-
- if (SIZE_DIFFERS (buf, lookup_buf)
- && IA_ISREG (buf->ia_type)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "size differs for %s ", local->loc.path);
- local->self_heal.do_data_self_heal = _gf_true;
- }
-
- if (uuid_compare (buf->ia_gfid, lookup_buf->ia_gfid)) {
- /* mismatching gfid */
- gf_log (this->name, GF_LOG_WARNING,
- "%s: gfid different on subvolume", local->loc.path);
- }
+afr_handle_quota_size (call_frame_t *frame, xlator_t *this)
+{
+ unsigned char *readable = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int i = 0;
+ uint64_t size = 0;
+ uint64_t max_size = 0;
+ int readable_cnt = 0;
+
+ local = frame->local;
+ priv = this->private;
+ replies = local->replies;
+
+ readable = alloca0 (priv->child_count);
+
+ afr_inode_read_subvol_get (local->inode, this, readable, 0, 0);
+
+ readable_cnt = AFR_COUNT (readable, priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (readable_cnt && !readable[i])
+ continue;
+ if (!replies[i].xdata)
+ continue;
+ if (dict_get_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, &size))
+ continue;
+ if (size > max_size)
+ max_size = size;
+ }
+
+ if (!max_size)
+ return;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (readable_cnt && !readable[i])
+ continue;
+ if (!replies[i].xdata)
+ continue;
+ if (dict_set_uint64 (replies[i].xdata, QUOTA_SIZE_KEY, max_size))
+ continue;
+ }
}
-static void
-afr_detect_self_heal_by_split_brain_status (afr_local_t *local, xlator_t *this)
-{
- gf_boolean_t split_brain = _gf_false;
- afr_self_heal_t *sh = NULL;
-
- sh = &local->self_heal;
-
- split_brain = afr_is_split_brain (this, local->cont.lookup.inode);
- split_brain = split_brain || local->cont.lookup.possible_spb;
- if ((local->success_count > 0) && split_brain &&
- IA_ISREG (local->cont.lookup.inode->ia_type)) {
- sh->force_confirm_spb = _gf_true;
- gf_log (this->name, GF_LOG_DEBUG,
- "split brain detected during lookup of %s.",
- local->loc.path);
- }
-}
static void
-afr_detect_self_heal_by_lookup_status (afr_local_t *local, xlator_t *this)
-{
- GF_ASSERT (local);
- GF_ASSERT (this);
-
- if ((local->success_count > 0) && (local->enoent_count > 0)) {
- local->self_heal.do_metadata_self_heal = _gf_true;
- local->self_heal.do_data_self_heal = _gf_true;
- local->self_heal.do_entry_self_heal = _gf_true;
- local->self_heal.do_gfid_self_heal = _gf_true;
- local->self_heal.do_missing_entry_self_heal = _gf_true;
- gf_log(this->name, GF_LOG_DEBUG,
- "entries are missing in lookup of %s.",
- local->loc.path);
- }
-
- return;
-}
-
-gf_boolean_t
-afr_can_self_heal_proceed (afr_self_heal_t *sh, afr_private_t *priv)
-{
- GF_ASSERT (sh);
- GF_ASSERT (priv);
-
- if (sh->force_confirm_spb)
- return _gf_true;
- return (sh->do_gfid_self_heal
- || sh->do_missing_entry_self_heal
- || (afr_data_self_heal_enabled (priv->data_self_heal) &&
- sh->do_data_self_heal)
- || (priv->metadata_self_heal && sh->do_metadata_self_heal)
- || (priv->entry_self_heal && sh->do_entry_self_heal));
-}
-
-afr_transaction_type
-afr_transaction_type_get (ia_type_t ia_type)
-{
- afr_transaction_type type = AFR_METADATA_TRANSACTION;
-
- GF_ASSERT (ia_type != IA_INVAL);
-
- if (IA_ISDIR (ia_type)) {
- type = AFR_ENTRY_TRANSACTION;
- } else if (IA_ISREG (ia_type)) {
- type = AFR_DATA_TRANSACTION;
- }
- return type;
-}
-
-int
-afr_lookup_select_read_child (afr_local_t *local, xlator_t *this,
- int32_t *read_child)
-{
- ia_type_t ia_type = IA_INVAL;
- int32_t source = -1;
- int ret = -1;
- dict_t **xattrs = NULL;
- int32_t *success_children = NULL;
- afr_transaction_type type = AFR_METADATA_TRANSACTION;
- uuid_t *gfid = NULL;
-
- GF_ASSERT (local);
- GF_ASSERT (this);
- GF_ASSERT (local->success_count > 0);
-
- success_children = local->cont.lookup.success_children;
- /*We can take the success_children[0] only because we already
- *handle the conflicting children other wise, we could select the
- *read_child based on wrong file type
- */
- ia_type = local->cont.lookup.bufs[success_children[0]].ia_type;
- type = afr_transaction_type_get (ia_type);
- xattrs = local->cont.lookup.xattrs;
- gfid = &local->cont.lookup.buf.ia_gfid;
- source = afr_lookup_select_read_child_by_txn_type (this, local, xattrs,
- type, *gfid);
- if (source < 0) {
- gf_log (this->name, GF_LOG_DEBUG, "failed to select source "
- "for %s", local->loc.path);
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG, "Source selected as %d for %s",
- source, local->loc.path);
- *read_child = source;
- ret = 0;
-out:
- return ret;
-}
-
-static inline gf_boolean_t
-afr_is_transaction_running (afr_local_t *local)
-{
- GF_ASSERT (local->fop == GF_FOP_LOOKUP);
- return ((local->inodelk_count > 0) || (local->entrylk_count > 0));
-}
-
-void
-afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode,
- gf_boolean_t background, ia_type_t ia_type, char *reason,
- void (*gfid_sh_success_cbk) (call_frame_t *sh_frame,
- xlator_t *this),
- int (*unwind) (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- int32_t sh_failed))
+afr_lookup_done (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- char sh_type_str[256] = {0,};
- char *bg = "";
-
- GF_ASSERT (frame);
- GF_ASSERT (this);
- GF_ASSERT (inode);
- GF_ASSERT (ia_type != IA_INVAL);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = -1;
+ int op_errno = 0;
+ int read_subvol = 0;
+ unsigned char *readable = NULL;
+ int event = 0;
+ struct afr_reply *replies = NULL;
+ uuid_t read_gfid = {0, };
+ gf_boolean_t locked_entry = _gf_false;
+ gf_boolean_t can_interpret = _gf_true;
+ priv = this->private;
local = frame->local;
- local->self_heal.background = background;
- local->self_heal.type = ia_type;
- local->self_heal.unwind = unwind;
- local->self_heal.gfid_sh_success_cbk = gfid_sh_success_cbk;
-
- afr_self_heal_type_str_get (&local->self_heal,
- sh_type_str,
- sizeof (sh_type_str));
-
- if (background)
- bg = "background";
- gf_log (this->name, GF_LOG_DEBUG,
- "%s %s self-heal triggered. path: %s, reason: %s", bg,
- sh_type_str, local->loc.path, reason);
-
- afr_self_heal (frame, this, inode);
-}
+ replies = local->replies;
-unsigned int
-afr_gfid_missing_count (const char *xlator_name, int32_t *success_children,
- struct iatt *bufs, unsigned int child_count,
- const char *path)
-{
- unsigned int gfid_miss_count = 0;
- int i = 0;
- struct iatt *child1 = NULL;
+ locked_entry = afr_is_entry_possibly_under_txn (local, this);
- for (i = 0; i < child_count; i++) {
- if (success_children[i] == -1)
- break;
- child1 = &bufs[success_children[i]];
- if (uuid_is_null (child1->ia_gfid)) {
- gf_log (xlator_name, GF_LOG_DEBUG, "%s: gfid is null"
- " on subvolume %d", path, success_children[i]);
- gfid_miss_count++;
- }
- }
-
- return gfid_miss_count;
-}
-
-static int
-afr_lookup_gfid_missing_count (afr_local_t *local, xlator_t *this)
-{
- int32_t *success_children = NULL;
- afr_private_t *priv = NULL;
- struct iatt *bufs = NULL;
- int miss_count = 0;
-
- priv = this->private;
- bufs = local->cont.lookup.bufs;
- success_children = local->cont.lookup.success_children;
+ readable = alloca0 (priv->child_count);
- miss_count = afr_gfid_missing_count (this->name, success_children,
- bufs, priv->child_count,
- local->loc.path);
- return miss_count;
-}
+ afr_inode_read_subvol_get (local->loc.parent, this, readable,
+ NULL, &event);
-gf_boolean_t
-afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children,
- unsigned int child_count, const char *path,
- const char *xlator_name)
-{
- gf_boolean_t conflicting = _gf_false;
- int i = 0;
- struct iatt *child1 = NULL;
- struct iatt *child2 = NULL;
- uuid_t *gfid = NULL;
-
- for (i = 0; i < child_count; i++) {
- if (success_children[i] == -1)
- break;
- child1 = &bufs[success_children[i]];
- if ((!gfid) && (!uuid_is_null (child1->ia_gfid)))
- gfid = &child1->ia_gfid;
-
- if (i == 0)
- continue;
-
- child2 = &bufs[success_children[i-1]];
- if (FILETYPE_DIFFERS (child1, child2)) {
- gf_log (xlator_name, GF_LOG_WARNING, "%s: filetype "
- "differs on subvolumes (%d, %d)", path,
- success_children[i-1], success_children[i]);
- conflicting = _gf_true;
- goto out;
- }
- if (!gfid || uuid_is_null (child1->ia_gfid))
- continue;
- if (uuid_compare (*gfid, child1->ia_gfid)) {
- gf_log (xlator_name, GF_LOG_WARNING, "%s: gfid differs"
- " on subvolume %d", path, success_children[i]);
- conflicting = _gf_true;
- goto out;
- }
- }
-out:
- return conflicting;
+ /* First, check if we have an ESTALE from somewhere,
+ If so, propagate that so that a revalidate can be
+ issued
+ */
+ op_errno = afr_final_errno (frame->local, this->private);
+ local->op_errno = op_errno;
+ if (op_errno == ESTALE) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ goto unwind;
+ }
+
+ read_subvol = -1;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (locked_entry && replies[i].op_ret == -1 &&
+ replies[i].op_errno == ENOENT) {
+ /* Second, check entry is still
+ "underway" in creation */
+ local->op_ret = -1;
+ local->op_errno = ENOENT;
+ read_subvol = i;
+ goto unwind;
+ }
+
+ if (replies[i].op_ret == -1)
+ continue;
+
+ if (read_subvol == -1 || !readable[read_subvol]) {
+ read_subvol = i;
+ uuid_copy (read_gfid, replies[i].poststat.ia_gfid);
+ local->op_ret = 0;
+ }
+ }
+
+ if (read_subvol == -1)
+ goto unwind;
+ /* We now have a read_subvol, which is readable[] (if there
+ were any). Next we look for GFID mismatches. We don't
+ consider a GFID mismatch as an error if read_subvol is
+ readable[] but the mismatching GFID subvol is not.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1) {
+ if (priv->child_up[i])
+ can_interpret = _gf_false;
+ continue;
+ }
+
+ if (!uuid_compare (replies[i].poststat.ia_gfid,
+ read_gfid))
+ continue;
+
+ can_interpret = _gf_false;
+
+ if (locked_entry)
+ continue;
+
+ /* Now GFIDs mismatch. It's OK as long as this subvol
+ is not readable[] but read_subvol is */
+ if (readable[read_subvol] && !readable[i])
+ continue;
+
+ /* LOG ERROR */
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto unwind;
+ }
+
+ /* Forth, for the finalized GFID, pick the best subvolume
+ to return stats from.
+ */
+ if (can_interpret) {
+ /* It is safe to call afr_replies_interpret() because we have
+ a response from all the UP subvolumes and all of them resolved
+ to the same GFID
+ */
+ if (afr_replies_interpret (frame, this, local->inode)) {
+ read_subvol = afr_data_subvol_get (local->inode, this,
+ 0, 0);
+ afr_inode_read_subvol_reset (local->inode, this);
+ goto cant_interpret;
+ } else {
+ read_subvol = afr_data_subvol_get (local->inode, this,
+ 0, 0);
+ }
+ } else {
+ cant_interpret:
+ if (read_subvol == -1)
+ dict_del (replies[0].xdata, GF_CONTENT_KEY);
+ else
+ dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY);
+ }
+
+ afr_handle_quota_size (frame, this);
+
+unwind:
+ if (read_subvol == -1)
+ read_subvol = 0;
+
+ AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->replies[read_subvol].poststat,
+ local->replies[read_subvol].xdata,
+ &local->replies[read_subvol].postparent);
}
-/* afr_update_gfid_from_iatts: This function should be called only if the
- * iatts are not conflicting.
+/*
+ * During a lookup, some errors are more "important" than
+ * others in that they must be given higher priority while
+ * returning to the user.
+ *
+ * The hierarchy is ESTALE > ENOENT > others
*/
-void
-afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs,
- int32_t *success_children, unsigned int child_count)
-{
- uuid_t *gfid = NULL;
- int i = 0;
- int child = 0;
-
- for (i = 0; i < child_count; i++) {
- child = success_children[i];
- if (child == -1)
- break;
- if ((!gfid) && (!uuid_is_null (bufs[child].ia_gfid))) {
- gfid = &bufs[child].ia_gfid;
- } else if (gfid && (!uuid_is_null (bufs[child].ia_gfid))) {
- if (uuid_compare (*gfid, bufs[child].ia_gfid)) {
- GF_ASSERT (0);
- goto out;
- }
- }
- }
- if (gfid && (!uuid_is_null (*gfid)))
- uuid_copy (uuid, *gfid);
-out:
- return;
-}
-
-static gf_boolean_t
-afr_lookup_conflicting_entries (afr_local_t *local, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- gf_boolean_t conflict = _gf_false;
-
- priv = this->private;
- conflict = afr_conflicting_iattrs (local->cont.lookup.bufs,
- local->cont.lookup.success_children,
- priv->child_count, local->loc.path,
- this->name);
- return conflict;
-}
-
-gf_boolean_t
-afr_open_only_data_self_heal (char *data_self_heal)
-{
- return !strcmp (data_self_heal, "open");
-}
-
-gf_boolean_t
-afr_data_self_heal_enabled (char *data_self_heal)
-{
- gf_boolean_t enabled = _gf_false;
-
- if (gf_string2boolean (data_self_heal, &enabled) == -1) {
- enabled = !strcmp (data_self_heal, "open");
- GF_ASSERT (enabled);
- }
-
- return enabled;
-}
-
-static void
-afr_lookup_set_self_heal_params (afr_local_t *local, xlator_t *this)
-{
- int i = 0;
- struct iatt *bufs = NULL;
- dict_t **xattr = NULL;
- afr_private_t *priv = NULL;
- int32_t child1 = -1;
- int32_t child2 = -1;
- afr_self_heal_t *sh = NULL;
-
- priv = this->private;
- sh = &local->self_heal;
-
- afr_detect_self_heal_by_lookup_status (local, this);
-
- if (afr_lookup_gfid_missing_count (local, this))
- local->self_heal.do_gfid_self_heal = _gf_true;
-
- if (_gf_true == afr_lookup_conflicting_entries (local, this))
- local->self_heal.do_missing_entry_self_heal = _gf_true;
- else
- afr_update_gfid_from_iatts (local->self_heal.sh_gfid_req,
- local->cont.lookup.bufs,
- local->cont.lookup.success_children,
- priv->child_count);
-
- bufs = local->cont.lookup.bufs;
- for (i = 1; i < local->success_count; i++) {
- child1 = local->cont.lookup.success_children[i-1];
- child2 = local->cont.lookup.success_children[i];
- afr_detect_self_heal_by_iatt (local, this,
- &bufs[child1], &bufs[child2]);
- }
-
- xattr = local->cont.lookup.xattrs;
- for (i = 0; i < local->success_count; i++) {
- child1 = local->cont.lookup.success_children[i];
- afr_lookup_set_self_heal_params_by_xattr (local, this,
- xattr[child1]);
- }
- if (afr_open_only_data_self_heal (priv->data_self_heal))
- sh->do_data_self_heal = _gf_false;
- if (sh->do_metadata_self_heal)
- afr_lookup_check_set_metadata_split_brain (local, this);
- afr_detect_self_heal_by_split_brain_status (local, this);
-}
int
-afr_self_heal_lookup_unwind (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- int32_t sh_failed)
+afr_higher_errno (int32_t old_errno, int32_t new_errno)
{
- afr_local_t *local = NULL;
- int ret = -1;
- dict_t *xattr = NULL;
+ if (old_errno == ENODATA || new_errno == ENODATA)
+ return ENODATA;
+ if (old_errno == ESTALE || new_errno == ESTALE)
+ return ESTALE;
+ if (old_errno == ENOENT || new_errno == ENOENT)
+ return ENOENT;
- local = frame->local;
-
- if (op_ret == -1) {
- local->op_ret = -1;
- if (afr_error_more_important (local->op_errno, op_errno))
- local->op_errno = op_errno;
-
- goto out;
- } else {
- local->op_ret = 0;
- }
-
- afr_lookup_done_success_action (frame, this, _gf_true);
- xattr = local->cont.lookup.xattr;
- if (xattr) {
- ret = dict_set_int32 (xattr, "sh-failed", sh_failed);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set "
- "sh-failed to %d", local->loc.path, sh_failed);
- }
-out:
- AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->cont.lookup.inode, &local->cont.lookup.buf,
- local->cont.lookup.xattr,
- &local->cont.lookup.postparent);
-
- return 0;
+ return new_errno;
}
-//TODO: At the moment only lookup needs this, so not doing any checks, in the
-// future we will have to do fop specific operations
-void
-afr_post_gfid_sh_success (call_frame_t *sh_frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_local_t *sh_local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- struct iatt *lookup_bufs = NULL;
- struct iatt *lookup_parentbufs = NULL;
-
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
- local = sh->orig_frame->local;
- lookup_bufs = local->cont.lookup.bufs;
- lookup_parentbufs = local->cont.lookup.postparents;
- priv = this->private;
-
- memcpy (lookup_bufs, sh->buf, priv->child_count * sizeof (*sh->buf));
- memcpy (lookup_parentbufs, sh->parentbufs,
- priv->child_count * sizeof (*sh->parentbufs));
-
- afr_reset_xattr (local->cont.lookup.xattrs, priv->child_count);
- if (local->cont.lookup.xattr) {
- dict_unref (local->cont.lookup.xattr);
- local->cont.lookup.xattr = NULL;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i])
- local->cont.lookup.xattrs[i] = dict_ref (sh->xattr[i]);
- }
-
- afr_reset_children (local->cont.lookup.success_children,
- priv->child_count);
- afr_children_copy (local->cont.lookup.success_children,
- sh->fresh_children, priv->child_count);
-}
-
-static void
-afr_lookup_perform_self_heal (call_frame_t *frame, xlator_t *this,
- gf_boolean_t *sh_launched)
-{
- unsigned int up_count = 0;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- char *reason = NULL;
-
- GF_ASSERT (sh_launched);
- *sh_launched = _gf_false;
- priv = this->private;
- local = frame->local;
-
- up_count = afr_up_children_count (local->child_up, priv->child_count);
- if (up_count == 1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Only 1 child up - do not attempt to detect self heal");
- goto out;
- }
-
- afr_lookup_set_self_heal_params (local, this);
- if (afr_can_self_heal_proceed (&local->self_heal, priv)) {
- if (afr_is_transaction_running (local))
- goto out;
- reason = "lookup detected pending operations";
- afr_launch_self_heal (frame, this, local->cont.lookup.inode,
- _gf_true, local->cont.lookup.buf.ia_type,
- reason, afr_post_gfid_sh_success,
- afr_self_heal_lookup_unwind);
- *sh_launched = _gf_true;
- }
-out:
- return;
-}
-
-void
-afr_get_fresh_children (int32_t *success_children, int32_t *sources,
- int32_t *fresh_children, unsigned int child_count)
+int
+afr_final_errno (afr_local_t *local, afr_private_t *priv)
{
- unsigned int i = 0;
- unsigned int j = 0;
+ int i = 0;
+ int op_errno = 0;
+ int tmp_errno = 0;
- GF_ASSERT (success_children);
- GF_ASSERT (sources);
- GF_ASSERT (fresh_children);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret == 0)
+ continue;
+ tmp_errno = local->replies[i].op_errno;
+ op_errno = afr_higher_errno (op_errno, tmp_errno);
+ }
- afr_reset_children (fresh_children, child_count);
- for (i = 0; i < child_count; i++) {
- if (success_children[i] == -1)
- break;
- if (afr_is_read_child (success_children, sources, child_count,
- success_children[i])) {
- fresh_children[j] = success_children[i];
- j++;
- }
- }
+ return op_errno;
}
static int
-afr_lookup_set_read_ctx (afr_local_t *local, xlator_t *this, int32_t read_child)
+get_pathinfo_host (char *pathinfo, char *hostname, size_t size)
{
- afr_private_t *priv = NULL;
-
- GF_ASSERT (read_child >= 0);
-
- priv = this->private;
- afr_get_fresh_children (local->cont.lookup.success_children,
- local->cont.lookup.sources,
- local->fresh_children, priv->child_count);
- afr_inode_set_read_ctx (this, local->cont.lookup.inode, read_child,
- local->fresh_children);
-
- return 0;
-}
-
-int
-afr_lookup_done_success_action (call_frame_t *frame, xlator_t *this,
- gf_boolean_t fail_conflict)
-{
- int32_t read_child = -1;
- int32_t ret = -1;
- afr_local_t *local = NULL;
- gf_boolean_t fresh_lookup = _gf_false;
-
- local = frame->local;
- fresh_lookup = local->cont.lookup.fresh_lookup;
+ char *start = NULL;
+ char *end = NULL;
+ int ret = -1;
+ int i = 0;
- if (local->loc.parent == NULL)
- fail_conflict = _gf_true;
-
- if (afr_lookup_conflicting_entries (local, this)) {
- if (fail_conflict == _gf_false)
- ret = 0;
+ if (!pathinfo)
goto out;
- }
-
- ret = afr_lookup_select_read_child (local, this, &read_child);
- if (!afr_is_transaction_running (local) || fresh_lookup) {
- if (read_child < 0)
- goto out;
- ret = afr_lookup_set_read_ctx (local, this, read_child);
- if (ret)
- goto out;
- }
-
- ret = afr_lookup_build_response_params (local, this);
- if (ret)
+ start = strchr (pathinfo, ':');
+ if (!start)
+ goto out;
+ end = strrchr (pathinfo, ':');
+ if (start == end)
goto out;
- afr_update_loc_gfids (&local->loc,
- &local->cont.lookup.buf,
- &local->cont.lookup.postparent);
+ memset (hostname, 0, size);
+ i = 0;
+ while (++start != end)
+ hostname[i++] = *start;
ret = 0;
out:
- if (ret) {
- local->op_ret = -1;
- local->op_errno = EIO;
- }
return ret;
}
int
-afr_lookup_get_latest_subvol (afr_local_t *local, xlator_t *this)
+afr_local_pathinfo (char *pathinfo, gf_boolean_t *local)
{
- afr_private_t *priv = NULL;
- int32_t *success_children = NULL;
- struct iatt *bufs = NULL;
- int i = 0;
- int child = 0;
- int lsubvol = -1;
-
- priv = this->private;
- success_children = local->cont.lookup.success_children;
- bufs = local->cont.lookup.bufs;
- for (i = 0; i < priv->child_count; i++) {
- child = success_children[i];
- if (child == -1)
- break;
- if (uuid_is_null (bufs[child].ia_gfid))
- continue;
- if (lsubvol < 0) {
- lsubvol = child;
- } else if (bufs[lsubvol].ia_ctime < bufs[child].ia_ctime) {
- lsubvol = child;
- } else if ((bufs[lsubvol].ia_ctime == bufs[child].ia_ctime) &&
- (bufs[lsubvol].ia_ctime_nsec < bufs[child].ia_ctime_nsec)) {
- lsubvol = child;
- }
- }
- return lsubvol;
-}
-
-void
-afr_lookup_mark_other_entries_stale (afr_local_t *local, xlator_t *this,
- int subvol)
-{
- afr_private_t *priv = NULL;
- int32_t *success_children = NULL;
- struct iatt *bufs = NULL;
- int i = 0;
- int child = 0;
-
- priv = this->private;
- success_children = local->cont.lookup.success_children;
- bufs = local->cont.lookup.bufs;
- memcpy (local->fresh_children, success_children,
- sizeof (*success_children) * priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- child = local->fresh_children[i];
- if (child == -1)
- break;
- if (child == subvol)
- continue;
- if (uuid_is_null (bufs[child].ia_gfid) &&
- (bufs[child].ia_type == bufs[subvol].ia_type))
- continue;
- afr_children_rm_child (success_children, child,
- priv->child_count);
- local->success_count--;
- }
- afr_reset_children (local->fresh_children, priv->child_count);
-}
-
-void
-afr_succeed_lookup_on_latest_iatt (afr_local_t *local, xlator_t *this)
-{
- int lsubvol = 0;
-
- if (!afr_lookup_conflicting_entries (local, this))
- goto out;
+ int ret = 0;
+ char pathinfohost[1024] = {0};
+ char localhost[1024] = {0};
+ xlator_t *this = THIS;
- lsubvol = afr_lookup_get_latest_subvol (local, this);
- if (lsubvol < 0)
+ *local = _gf_false;
+ ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s",
+ pathinfo);
goto out;
- afr_lookup_mark_other_entries_stale (local, this, lsubvol);
-out:
- return;
-}
-
-static void
-afr_lookup_done (call_frame_t *frame, xlator_t *this)
-{
- int unwind = 1;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- gf_boolean_t sh_launched = _gf_false;
- gf_boolean_t fail_conflict = _gf_false;
- int gfid_miss_count = 0;
- int enotconn_count = 0;
- int up_children_count = 0;
-
- priv = this->private;
- local = frame->local;
-
- if (local->op_ret < 0)
- goto unwind;
-
- if (local->cont.lookup.parent_entrylk && local->success_count > 1)
- afr_succeed_lookup_on_latest_iatt (local, this);
-
- gfid_miss_count = afr_lookup_gfid_missing_count (local, this);
- up_children_count = afr_up_children_count (local->child_up,
- priv->child_count);
- enotconn_count = priv->child_count - up_children_count;
- if ((gfid_miss_count == local->success_count) &&
- (enotconn_count > 0)) {
- local->op_ret = -1;
- local->op_errno = EIO;
- gf_log (this->name, GF_LOG_ERROR, "Failing lookup for %s, "
- "LOOKUP on a file without gfid is not allowed when "
- "some of the children are down", local->loc.path);
- goto unwind;
- }
-
- if ((gfid_miss_count == local->success_count) &&
- uuid_is_null (local->cont.lookup.gfid_req)) {
- local->op_ret = -1;
- local->op_errno = ENODATA;
- gf_log (this->name, GF_LOG_ERROR, "%s: No gfid present",
- local->loc.path);
- goto unwind;
- }
-
- if (gfid_miss_count && uuid_is_null (local->cont.lookup.gfid_req))
- fail_conflict = _gf_true;
- ret = afr_lookup_done_success_action (frame, this, fail_conflict);
- if (ret)
- goto unwind;
- uuid_copy (local->self_heal.sh_gfid_req, local->cont.lookup.gfid_req);
-
- afr_lookup_perform_self_heal (frame, this, &sh_launched);
- if (sh_launched) {
- unwind = 0;
- goto unwind;
- }
-
- unwind:
- if (unwind) {
- AFR_STACK_UNWIND (lookup, frame, local->op_ret,
- local->op_errno, local->cont.lookup.inode,
- &local->cont.lookup.buf,
- local->cont.lookup.xattr,
- &local->cont.lookup.postparent);
}
-}
-
-/*
- * During a lookup, some errors are more "important" than
- * others in that they must be given higher priority while
- * returning to the user.
- *
- * The hierarchy is ESTALE > ENOENT > others
- *
- */
-gf_boolean_t
-afr_error_more_important (int32_t old_errno, int32_t new_errno)
-{
- gf_boolean_t ret = _gf_true;
-
- /* Nothing should ever overwrite ESTALE */
- if (old_errno == ESTALE)
- ret = _gf_false;
-
- /* Nothing should overwrite ENOENT, except ESTALE/EIO*/
- else if ((old_errno == ENOENT) && (new_errno != ESTALE)
- && (new_errno != EIO))
- ret = _gf_false;
-
- return ret;
-}
-
-int32_t
-afr_resultant_errno_get (int32_t *children,
- int *child_errno, unsigned int child_count)
-{
- int i = 0;
- int32_t op_errno = 0;
- int child = 0;
-
- for (i = 0; i < child_count; i++) {
- if (children) {
- child = children[i];
- if (child == -1)
- break;
- } else {
- child = i;
- }
- if (afr_error_more_important (op_errno, child_errno[child]))
- op_errno = child_errno[child];
- }
- return op_errno;
-}
-
-static void
-afr_lookup_handle_error (afr_local_t *local, int32_t op_ret, int32_t op_errno)
-{
- GF_ASSERT (local);
- if (op_errno == ENOENT)
- local->enoent_count++;
-
- if (afr_error_more_important (local->op_errno, op_errno))
- local->op_errno = op_errno;
-
- if (local->op_errno == ESTALE) {
- local->op_ret = -1;
- }
-}
-
-static void
-afr_set_root_inode_on_first_lookup (afr_local_t *local, xlator_t *this,
- inode_t *inode)
-{
- afr_private_t *priv = NULL;
- GF_ASSERT (inode);
-
- if (!__is_root_gfid (inode->gfid))
- goto out;
- if (!afr_is_fresh_lookup (&local->loc, this))
+ ret = gethostname (localhost, sizeof (localhost));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, "
+ "reason: %s", strerror (errno));
goto out;
- priv = this->private;
- if ((priv->first_lookup)) {
- gf_log (this->name, GF_LOG_INFO, "added root inode");
- priv->root_inode = inode_ref (inode);
- priv->first_lookup = 0;
}
-out:
- return;
-}
-
-static void
-afr_lookup_cache_args (afr_local_t *local, int child_index, dict_t *xattr,
- struct iatt *buf, struct iatt *postparent)
-{
- GF_ASSERT (child_index >= 0);
- local->cont.lookup.xattrs[child_index] = dict_ref (xattr);
- local->cont.lookup.postparents[child_index] = *postparent;
- local->cont.lookup.bufs[child_index] = *buf;
-}
-static void
-afr_lookup_handle_first_success (afr_local_t *local, xlator_t *this,
- inode_t *inode, struct iatt *buf)
-{
- local->cont.lookup.inode = inode_ref (inode);
- local->cont.lookup.buf = *buf;
- afr_set_root_inode_on_first_lookup (local, this, inode);
+ if (!strcmp (localhost, pathinfohost))
+ *local = _gf_true;
+out:
+ return ret;
}
static int32_t
-afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict,
- dict_t *xdata)
+afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
int ret = 0;
char *pathinfo = NULL;
@@ -2063,6 +1377,9 @@ afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
+ priv = this->private;
+ child_index = (int32_t)(long)cookie;
+
ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo);
if (ret != 0) {
goto out;
@@ -2073,7 +1390,6 @@ afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- priv = this->private;
/*
* Note that one local subvolume will override another here. The only
* way to avoid that would be to retain extra information about whether
@@ -2081,13 +1397,11 @@ afr_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
* the slowest local subvolume is far preferable to a remote one.
*/
if (is_local) {
- child_index = (int32_t)(long)cookie;
gf_log (this->name, GF_LOG_INFO,
"selecting local read_child %s",
priv->children[child_index]->name);
priv->read_child = child_index;
}
-
out:
STACK_DESTROY(frame->root);
return 0;
@@ -2106,220 +1420,357 @@ afr_attempt_local_discovery (xlator_t *this, int32_t child_index)
}
tmploc.gfid[sizeof(tmploc.gfid)-1] = 1;
- STACK_WIND_COOKIE (newframe, afr_discovery_cbk,
+ STACK_WIND_COOKIE (newframe, afr_local_discovery_cbk,
(void *)(long)child_index,
priv->children[child_index],
priv->children[child_index]->fops->getxattr,
&tmploc, GF_XATTR_PATHINFO_KEY, NULL);
}
-static void
-afr_lookup_handle_success (afr_local_t *local, xlator_t *this, int32_t child_index,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
+
+int
+afr_lookup_selfheal_wrap (void *opaque)
{
- afr_private_t *priv = this->private;
+ call_frame_t *frame = opaque;
+ afr_local_t *local = NULL;
+ xlator_t *this = NULL;
+ inode_t *inode = NULL;
- if (local->success_count == 0) {
- if (local->op_errno != ESTALE) {
- local->op_ret = op_ret;
- local->op_errno = 0;
- }
- afr_lookup_handle_first_success (local, this, inode, buf);
- }
- afr_lookup_update_lk_counts (local, this,
- child_index, xattr);
+ local = frame->local;
+ this = frame->this;
- afr_lookup_cache_args (local, child_index, xattr,
- buf, postparent);
+ afr_selfheal_name (frame->this, local->loc.pargfid, local->loc.name);
- if (local->do_discovery && (priv->read_child == (-1))) {
- afr_attempt_local_discovery(this,child_index);
- }
+ afr_replies_wipe (local, this->private);
+
+ inode = afr_selfheal_unlocked_lookup_on (frame, local->loc.parent,
+ local->loc.name, local->replies,
+ local->child_up);
+ if (inode)
+ inode_unref (inode);
+ afr_lookup_done (frame, this);
- local->cont.lookup.success_children[local->success_count] = child_index;
- local->success_count++;
+ return 0;
}
+
int
-afr_lookup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
+afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ call_frame_t *heal = NULL;
+ int i = 0, first = -1;
+ gf_boolean_t need_heal = _gf_false;
+ struct afr_reply *replies = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ replies = local->replies;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (first == -1) {
+ first = i;
+ continue;
+ }
+
+ if (replies[i].op_ret != replies[first].op_ret) {
+ need_heal = _gf_true;
+ break;
+ }
+
+ if (uuid_compare (replies[i].poststat.ia_gfid,
+ replies[first].poststat.ia_gfid)) {
+ need_heal = _gf_true;
+ break;
+ }
+ }
+
+ if (need_heal) {
+ heal = copy_frame (frame);
+ if (heal)
+ heal->root->pid = -1;
+ ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap,
+ afr_refresh_selfheal_done, heal, frame);
+ if (ret)
+ goto lookup_done;
+ } else {
+ lookup_done:
+ afr_lookup_done (frame, this);
+ }
+
+ return ret;
+}
+
+
+int
+afr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *buf,
+ dict_t *xdata, struct iatt *postparent)
{
afr_local_t * local = NULL;
int call_count = -1;
int child_index = -1;
- child_index = (long) cookie;
+ child_index = (long) cookie;
- LOCK (&frame->lock);
- {
- local = frame->local;
+ local = frame->local;
- if (op_ret == -1) {
- afr_lookup_handle_error (local, op_ret, op_errno);
- goto unlock;
- }
- afr_lookup_handle_success (local, this, child_index, op_ret,
- op_errno, inode, buf, xattr,
- postparent);
-
- }
-unlock:
- UNLOCK (&frame->lock);
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[child_index].poststat = *buf;
+ local->replies[child_index].postparent = *postparent;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref (xdata);
+ }
call_count = afr_frame_return (frame);
if (call_count == 0) {
- afr_lookup_done (frame, this);
+ afr_lookup_entry_heal (frame, this);
}
- return 0;
+ return 0;
}
-int
-afr_lookup_cont_init (afr_local_t *local, unsigned int child_count)
+
+
+static void
+afr_discover_done (call_frame_t *frame, xlator_t *this)
{
- int ret = -ENOMEM;
- struct iatt *iatts = NULL;
- int32_t *success_children = NULL;
- int32_t *sources = NULL;
- int32_t **pending_matrix = NULL;
-
- GF_ASSERT (local);
- local->cont.lookup.xattrs = GF_CALLOC (child_count,
- sizeof (*local->cont.lookup.xattr),
- gf_afr_mt_dict_t);
- if (NULL == local->cont.lookup.xattrs)
- goto out;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = -1;
+ int op_errno = 0;
+ int read_subvol = 0;
- iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt);
- if (NULL == iatts)
- goto out;
- local->cont.lookup.postparents = iatts;
+ priv = this->private;
+ local = frame->local;
- iatts = GF_CALLOC (child_count, sizeof (*iatts), gf_afr_mt_iatt);
- if (NULL == iatts)
- goto out;
- local->cont.lookup.bufs = iatts;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret == 0)
+ local->op_ret = 0;
+ }
- success_children = afr_children_create (child_count);
- if (NULL == success_children)
- goto out;
- local->cont.lookup.success_children = success_children;
+ op_errno = afr_final_errno (frame->local, this->private);
- local->fresh_children = afr_children_create (child_count);
- if (NULL == local->fresh_children)
- goto out;
+ if (local->op_ret < 0) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ goto unwind;
+ }
- sources = GF_CALLOC (sizeof (*sources), child_count, gf_afr_mt_int32_t);
- if (NULL == sources)
- goto out;
- local->cont.lookup.sources = sources;
+ afr_replies_interpret (frame, this, local->inode);
- pending_matrix = afr_matrix_create (child_count, child_count);
- if (NULL == pending_matrix)
- goto out;
- local->cont.lookup.pending_matrix = pending_matrix;
+ read_subvol = afr_data_subvol_get (local->inode, this, 0, 0);
+ if (read_subvol == -1) {
+ gf_log (this->name, GF_LOG_WARNING, "no read subvols for %s",
+ local->loc.path);
- ret = 0;
-out:
- return ret;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid ||
+ local->replies[i].op_ret == -1)
+ continue;
+ read_subvol = i;
+ break;
+ }
+ }
+
+unwind:
+ if (read_subvol == -1)
+ read_subvol = 0;
+
+ AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->replies[read_subvol].poststat,
+ local->replies[read_subvol].xdata,
+ &local->replies[read_subvol].postparent);
}
+
int
-afr_lookup (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr_req)
+afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *buf,
+ dict_t *xdata, struct iatt *postparent)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- void *gfid_req = NULL;
- int ret = -1;
- int i = 0;
- int call_count = 0;
- uint64_t ctx = 0;
- int32_t op_errno = 0;
+ afr_local_t * local = NULL;
+ int call_count = -1;
+ int child_index = -1;
- priv = this->private;
+ child_index = (long) cookie;
- AFR_LOCAL_ALLOC_OR_GOTO (local, out);
+ local = frame->local;
- local->op_ret = -1;
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[child_index].poststat = *buf;
+ local->replies[child_index].postparent = *postparent;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref (xdata);
+ }
- frame->local = local;
- local->fop = GF_FOP_LOOKUP;
+ if (local->do_discovery && (op_ret == 0))
+ afr_attempt_local_discovery (this, child_index);
- loc_copy (&local->loc, loc);
- ret = loc_path (&local->loc, NULL);
- if (ret < 0) {
- op_errno = EINVAL;
- goto out;
+ call_count = afr_frame_return (frame);
+ if (call_count == 0) {
+ afr_discover_done (frame, this);
}
- ret = inode_ctx_get (local->loc.inode, this, &ctx);
- if (ret == 0) {
- /* lookup is a revalidate */
+ return 0;
+}
- local->read_child_index = afr_inode_get_read_ctx (this,
- local->loc.inode,
- NULL);
- } else {
- LOCK (&priv->read_child_lock);
- {
- if (priv->hash_mode) {
- local->read_child_index = -1;
- }
- else {
- local->read_child_index =
- (++priv->read_child_rr) %
- (priv->child_count);
- }
- }
- UNLOCK (&priv->read_child_lock);
- local->cont.lookup.fresh_lookup = _gf_true;
- }
- local->child_up = memdup (priv->child_up,
- sizeof (*local->child_up) * priv->child_count);
- if (NULL == local->child_up) {
- op_errno = ENOMEM;
+int
+afr_discover_do (call_frame_t *frame, xlator_t *this, int err)
+{
+ int ret = 0;
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (err) {
+ local->op_errno = -err;
+ ret = -1;
+ goto out;
+ }
+
+ call_count = local->call_count = AFR_COUNT (local->child_up,
+ priv->child_count);
+
+ ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req,
+ &local->loc);
+ if (ret) {
+ local->op_errno = -ret;
+ ret = -1;
goto out;
}
- ret = afr_lookup_cont_init (local, priv->child_count);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, afr_discover_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->lookup,
+ &local->loc, local->xattr_req);
+ if (!--call_count)
+ break;
+ }
}
- local->call_count = afr_up_children_count (local->child_up,
- priv->child_count);
- call_count = local->call_count;
- if (local->call_count == 0) {
- ret = -1;
+ return 0;
+out:
+ AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
+ return 0;
+}
+
+
+int
+afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ int op_errno = ENOMEM;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int event = 0;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ if (!local->call_count) {
op_errno = ENOTCONN;
goto out;
}
- /* By default assume ENOTCONN. On success it will be set to 0. */
- local->op_errno = ENOTCONN;
+ if (__is_root_gfid (loc->inode->gfid)) {
+ if (!this->itable)
+ this->itable = loc->inode->table;
+ if (!priv->root_inode)
+ priv->root_inode = inode_ref (loc->inode);
+
+ if (priv->choose_local && !priv->did_discovery) {
+ /* Logic to detect which subvolumes of AFR are
+ local, in order to prefer them for reads
+ */
+ local->do_discovery = _gf_true;
+ priv->did_discovery = _gf_true;
+ }
+ }
+
+ local->op = GF_FOP_LOOKUP;
+
+ loc_copy (&local->loc, loc);
+
+ local->inode = inode_ref (loc->inode);
+
+ if (xattr_req)
+ /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
+ allocate one for us */
+ local->xattr_req = dict_ref (xattr_req);
+
+ if (uuid_is_null (loc->inode->gfid)) {
+ afr_discover_do (frame, this, 0);
+ return 0;
+ }
+
+ afr_read_subvol_get (loc->inode, this, NULL, &event,
+ AFR_DATA_TRANSACTION);
+
+ if (event != local->event_generation)
+ afr_inode_refresh (frame, this, loc->inode, afr_discover_do);
+ else
+ afr_discover_do (frame, this, 0);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
+}
+
+
+int
+afr_lookup_do (call_frame_t *frame, xlator_t *this, int err)
+{
+ int ret = 0;
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
- ret = afr_lookup_xattr_req_prepare (local, this, xattr_req, &local->loc,
- &gfid_req);
+ if (err < 0) {
+ local->op_errno = -err;
+ ret = -1;
+ goto out;
+ }
+
+ call_count = local->call_count = AFR_COUNT (local->child_up,
+ priv->child_count);
+
+ ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req,
+ &local->loc);
if (ret) {
local->op_errno = -ret;
+ ret = -1;
goto out;
}
- afr_lookup_save_gfid (local->cont.lookup.gfid_req, gfid_req,
- &local->loc);
- local->fop = GF_FOP_LOOKUP;
- if (priv->choose_local && !priv->did_discovery) {
- if (gfid_req && __is_root_gfid(gfid_req)) {
- local->do_discovery = _gf_true;
- priv->did_discovery = _gf_true;
- }
- }
+
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND_COOKIE (frame, afr_lookup_cbk,
@@ -2331,12 +1782,98 @@ afr_lookup (call_frame_t *frame, xlator_t *this,
break;
}
}
+ return 0;
+out:
+ AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
+ return 0;
+}
- ret = 0;
+/*
+ * afr_lookup()
+ *
+ * The goal here is to figure out what the element getting looked up is.
+ * i.e what is the GFID, inode type and a conservative estimate of the
+ * inode attributes are.
+ *
+ * As we lookup, operations may be underway on the entry name and the
+ * inode. In lookup() we are primarily concerned only with the entry
+ * operations. If the entry is getting unlinked or renamed, we detect
+ * what operation is underway by querying for on-going transactions and
+ * pending self-healing on the entry through xdata.
+ *
+ * If the entry is a file/dir, it may need self-heal and/or in a
+ * split-brain condition. Lookup is not the place to worry about these
+ * conditions. Outcast marking will naturally handle them in the read
+ * paths.
+ *
+ * Here is a brief goal of what we are trying to achieve:
+ *
+ * - LOOKUP on all subvolumes concurrently, querying on-going transaction
+ * and pending self-heal info from the servers.
+ *
+ * - If all servers reply the same inode type and GFID, the overall call
+ * MUST be a success.
+ *
+ * - If inode types or GFIDs mismatch, and there IS either an on-going
+ * transaction or pending self-heal, inspect what the nature of the
+ * transaction or pending heal is, and select the appropriate subvolume's
+ * reply as the winner.
+ *
+ * - If inode types or GFIDs mismatch, and there are no on-going transactions
+ * or pending self-heal on the entry name on any of the servers, fail the
+ * lookup with EIO. Something has gone wrong beyond reasonable action.
+ */
+
+int
+afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ int event = 0;
+
+ if (!loc->parent) {
+ afr_discover (frame, this, loc, xattr_req);
+ return 0;
+ }
+
+ if (__is_root_gfid (loc->parent->gfid)) {
+ if (!strcmp (loc->name, GF_REPLICATE_TRASH_DIR)) {
+ op_errno = EPERM;
+ goto out;
+ }
+ }
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ if (!local->call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ local->op = GF_FOP_LOOKUP;
+
+ loc_copy (&local->loc, loc);
+
+ local->inode = inode_ref (loc->inode);
+
+ if (xattr_req)
+ /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
+ allocate one for us */
+ local->xattr_req = dict_ref (xattr_req);
+
+ afr_read_subvol_get (loc->parent, this, NULL, &event,
+ AFR_DATA_TRANSACTION);
+
+ if (event != local->event_generation)
+ afr_inode_refresh (frame, this, loc->parent, afr_lookup_do);
+ else
+ afr_lookup_do (frame, this, 0);
+
+ return 0;
out:
- if (ret)
- AFR_STACK_UNWIND (lookup, frame, -1, op_errno,
- NULL, NULL, NULL, NULL);
+ AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
return 0;
}
@@ -2344,6 +1881,46 @@ out:
/* {{{ open */
+afr_fd_ctx_t *
+__afr_fd_ctx_get (fd_t *fd, xlator_t *this)
+{
+ uint64_t ctx = 0;
+ int ret = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ ret = __fd_ctx_get (fd, this, &ctx);
+
+ if (ret < 0) {
+ ret = __afr_fd_ctx_set (this, fd);
+ if (ret < 0)
+ goto out;
+
+ ret = __fd_ctx_get (fd, this, &ctx);
+ if (ret < 0)
+ goto out;
+ }
+
+ fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+out:
+ return fd_ctx;
+}
+
+
+afr_fd_ctx_t *
+afr_fd_ctx_get (fd_t *fd, xlator_t *this)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get (fd, this);
+ }
+ UNLOCK(&fd->lock);
+
+ return fd_ctx;
+}
+
+
int
__afr_fd_ctx_set (xlator_t *this, fd_t *fd)
{
@@ -2351,6 +1928,7 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
int ret = -1;
uint64_t ctx = 0;
afr_fd_ctx_t * fd_ctx = NULL;
+ int i = 0;
VALIDATE_OR_GOTO (this->private, out);
VALIDATE_OR_GOTO (fd, out);
@@ -2369,21 +1947,15 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
goto out;
}
- fd_ctx->pre_op_done = GF_CALLOC (sizeof (*fd_ctx->pre_op_done),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->pre_op_done) {
- ret = -ENOMEM;
- goto out;
- }
-
- fd_ctx->pre_op_piggyback = GF_CALLOC (sizeof (*fd_ctx->pre_op_piggyback),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->pre_op_piggyback) {
- ret = -ENOMEM;
- goto out;
- }
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+ fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]),
+ priv->child_count,
+ gf_afr_mt_int32_t);
+ if (!fd_ctx->pre_op_done[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),
priv->child_count,
@@ -2393,6 +1965,13 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
goto out;
}
+ for (i = 0; i < priv->child_count; i++) {
+ if (fd_is_anonymous (fd))
+ fd_ctx->opened_on[i] = AFR_FD_OPENED;
+ else
+ fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED;
+ }
+
fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback),
priv->child_count,
gf_afr_mt_char);
@@ -2409,20 +1988,9 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd)
goto out;
}
- fd_ctx->up_count = priv->up_count;
- fd_ctx->down_count = priv->down_count;
-
- fd_ctx->locked_on = GF_CALLOC (sizeof (*fd_ctx->locked_on),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->locked_on) {
- ret = -ENOMEM;
- goto out;
- }
-
pthread_mutex_init (&fd_ctx->delay_lock, NULL);
- INIT_LIST_HEAD (&fd_ctx->paused_calls);
- INIT_LIST_HEAD (&fd_ctx->entries);
+
+ INIT_LIST_HEAD (&fd_ctx->eager_locked);
ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);
if (ret)
@@ -2450,189 +2018,90 @@ afr_fd_ctx_set (xlator_t *this, fd_t *fd)
/* {{{ flush */
int
-afr_flush_unwind (call_frame_t *frame, xlator_t *this)
+afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (flush, main_frame,
- local->op_ret, local->op_errno,
- NULL);
- }
-
- return 0;
-}
-
-
-int
-afr_flush_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
- int need_unwind = 0;
+ afr_local_t *local = NULL;
+ int call_count = -1;
local = frame->local;
- priv = this->private;
LOCK (&frame->lock);
{
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
-
- local->op_errno = op_errno;
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ } else {
+ local->op_errno = op_errno;
+ }
}
UNLOCK (&frame->lock);
- if (need_unwind)
- afr_flush_unwind (frame, this);
+ call_count = afr_frame_return (frame);
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ if (call_count == 0)
+ AFR_STACK_UNWIND (flush, frame, local->op_ret,
+ local->op_errno, local->xdata_rsp);
return 0;
}
-
-int
-afr_flush_wind (call_frame_t *frame, xlator_t *this)
+static int
+afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int call_count = -1;
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
- local = frame->local;
priv = this->private;
-
- call_count = afr_up_children_count (local->child_up, priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
+ local = frame->local;
+ call_count = local->call_count;
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_flush_wind_cbk,
+ STACK_WIND_COOKIE (frame, afr_flush_cbk,
(void *) (long) i,
priv->children[i],
priv->children[i]->fops->flush,
- local->fd, NULL);
-
+ local->fd, xdata);
if (!--call_count)
break;
+
}
}
return 0;
}
-
-int
-afr_flush_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-
int
afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
+ call_stub_t *stub = NULL;
+ int op_errno = ENOMEM;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- priv = this->private;
+ if (!local->call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
- goto out;
- }
+ local->fd = fd_ref(fd);
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
+ stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata);
+ if (!stub)
goto out;
- local->op = GF_FOP_FLUSH;
-
- local->transaction.fop = afr_flush_wind;
- local->transaction.done = afr_flush_done;
- local->transaction.unwind = afr_flush_unwind;
-
- local->fd = fd_ref (fd);
-
- local->transaction.main_frame = frame;
- local->transaction.start = 0;
- local->transaction.len = 0;
+ afr_delayed_changelog_wake_resume (this, fd, stub);
- ret = afr_open_fd_fix (transaction_frame, this, _gf_false);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
- ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
-
- AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL);
- }
-
+ AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL);
return 0;
}
@@ -2645,8 +2114,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
uint64_t ctx = 0;
afr_fd_ctx_t *fd_ctx = NULL;
int ret = 0;
- afr_fd_paused_call_t *paused_call = NULL;
- afr_fd_paused_call_t *tmp = NULL;
+ int i = 0;
ret = fd_ctx_get (fd, this, &ctx);
if (ret < 0)
@@ -2655,18 +2123,16 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
fd_ctx = (afr_fd_ctx_t *)(long) ctx;
if (fd_ctx) {
- GF_FREE (fd_ctx->pre_op_done);
-
- GF_FREE (fd_ctx->opened_on);
+ //no need to take any locks
+ if (!list_empty (&fd_ctx->eager_locked))
+ gf_log (this->name, GF_LOG_WARNING, "%s: Stale "
+ "Eager-lock stubs found",
+ uuid_utoa (fd->inode->gfid));
- GF_FREE (fd_ctx->locked_on);
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++)
+ GF_FREE (fd_ctx->pre_op_done[i]);
- GF_FREE (fd_ctx->pre_op_piggyback);
- list_for_each_entry_safe (paused_call, tmp, &fd_ctx->paused_calls,
- call_list) {
- list_del_init (&paused_call->call_list);
- GF_FREE (paused_call);
- }
+ GF_FREE (fd_ctx->opened_on);
GF_FREE (fd_ctx->lock_piggyback);
@@ -2685,24 +2151,8 @@ out:
int
afr_release (xlator_t *this, fd_t *fd)
{
- afr_locked_fd_t *locked_fd = NULL;
- afr_locked_fd_t *tmp = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
afr_cleanup_fd_ctx (this, fd);
- list_for_each_entry_safe (locked_fd, tmp, &priv->saved_fds,
- list) {
-
- if (locked_fd->fd == fd) {
- list_del_init (&locked_fd->list);
- GF_FREE (locked_fd);
- }
-
- }
-
return 0;
}
@@ -2710,6 +2160,16 @@ afr_release (xlator_t *this, fd_t *fd)
/* {{{ fsync */
int
+afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+int
afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
@@ -2717,45 +2177,70 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_local_t *local = NULL;
int call_count = -1;
int child_index = (long) cookie;
- int read_child = 0;
+ int read_subvol = 0;
+ call_stub_t *stub = NULL;
local = frame->local;
- read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
+ read_subvol = afr_data_subvol_get (local->inode, this, 0, 0);
LOCK (&frame->lock);
{
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
-
if (op_ret == 0) {
- local->op_ret = 0;
+ if (local->op_ret == -1) {
+ local->op_ret = 0;
- if (local->success_count == 0) {
- local->cont.fsync.prebuf = *prebuf;
- local->cont.fsync.postbuf = *postbuf;
- }
+ local->cont.inode_wfop.prebuf = *prebuf;
+ local->cont.inode_wfop.postbuf = *postbuf;
- if (child_index == read_child) {
- local->cont.fsync.prebuf = *prebuf;
- local->cont.fsync.postbuf = *postbuf;
+ if (xdata)
+ local->xdata_rsp = dict_ref (xdata);
}
- local->success_count++;
- }
-
- local->op_errno = op_errno;
+ if (child_index == read_subvol) {
+ local->cont.inode_wfop.prebuf = *prebuf;
+ local->cont.inode_wfop.postbuf = *postbuf;
+ if (xdata) {
+ if (local->xdata_rsp)
+ dict_unref (local->xdata_rsp);
+ local->xdata_rsp = dict_ref (xdata);
+ }
+ }
+ } else {
+ local->op_errno = op_errno;
+ }
}
UNLOCK (&frame->lock);
call_count = afr_frame_return (frame);
if (call_count == 0) {
- AFR_STACK_UNWIND (fsync, frame, local->op_ret, local->op_errno,
- &local->cont.fsync.prebuf,
- &local->cont.fsync.postbuf,
- NULL);
+ /* Make a stub out of the frame, and register it
+ with the waking up post-op. When the call-stub resumes,
+ we are guaranteed that there was no post-op pending
+ (i.e changelogs were unset in the server). This is an
+ essential "guarantee", that fsync() returns only after
+ completely finishing EVERYTHING, including the delayed
+ post-op. This guarantee is expected by FUSE graph switching
+ for example.
+ */
+ stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk,
+ local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ local->xdata_rsp);
+ if (!stub) {
+ AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0);
+ return 0;
+ }
+
+ /* If no new unstable writes happened between the
+ time we cleared the unstable write witness flag in afr_fsync
+ and now, calling afr_delayed_changelog_wake_up() should
+ wake up and skip over the fsync phase and go straight to
+ afr_changelog_post_op_now()
+ */
+ afr_delayed_changelog_wake_resume (this, local->fd, stub);
}
return 0;
@@ -2763,32 +2248,34 @@ afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int
-afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t datasync, dict_t *xdata)
+afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
+ afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
- int32_t op_errno = 0;
+ int32_t op_errno = ENOMEM;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ priv = this->private;
- priv = this->private;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
+ call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local->fd = fd_ref (fd);
- call_count = local->call_count;
+ if (afr_fd_has_witnessed_unstable_write (this, fd)) {
+ /* don't care. we only wanted to CLEAR the bit */
+ }
- local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -2802,10 +2289,10 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL);
+ AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL);
+
return 0;
}
@@ -2813,10 +2300,9 @@ out:
/* {{{ fsync */
-int32_t
-afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xdata)
+int
+afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_local_t *local = NULL;
int call_count = -1;
@@ -2825,10 +2311,13 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,
LOCK (&frame->lock);
{
- if (op_ret == 0)
+ if (op_ret == 0) {
local->op_ret = 0;
-
- local->op_errno = op_errno;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ } else {
+ local->op_errno = op_errno;
+ }
}
UNLOCK (&frame->lock);
@@ -2836,37 +2325,33 @@ afr_fsyncdir_cbk (call_frame_t *frame, void *cookie,
if (call_count == 0)
AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret,
- local->op_errno, xdata);
+ local->op_errno, local->xdata_rsp);
return 0;
}
-int32_t
-afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t datasync, dict_t *xdata)
+int
+afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
+ afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
+ int32_t op_errno = ENOMEM;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
+ priv = this->private;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -2879,10 +2364,10 @@ afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL);
+
return 0;
}
@@ -2905,6 +2390,10 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie,
if (op_ret == 0) {
if (!local->cont.xattrop.xattr)
local->cont.xattrop.xattr = dict_ref (xattr);
+
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+
local->op_ret = 0;
}
@@ -2916,7 +2405,7 @@ afr_xattrop_cbk (call_frame_t *frame, void *cookie,
if (call_count == 0)
AFR_STACK_UNWIND (xattrop, frame, local->op_ret, local->op_errno,
- local->cont.xattrop.xattr, xdata);
+ local->cont.xattrop.xattr, local->xdata_rsp);
return 0;
}
@@ -2928,25 +2417,21 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int32_t op_errno = ENOMEM;
priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -2959,10 +2444,10 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL);
+
return 0;
}
@@ -2987,6 +2472,8 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie,
if (!local->cont.fxattrop.xattr)
local->cont.fxattrop.xattr = dict_ref (xattr);
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
local->op_ret = 0;
}
@@ -2998,7 +2485,7 @@ afr_fxattrop_cbk (call_frame_t *frame, void *cookie,
if (call_count == 0)
AFR_STACK_UNWIND (fxattrop, frame, local->op_ret, local->op_errno,
- local->cont.fxattrop.xattr, xdata);
+ local->cont.fxattrop.xattr, local->xdata_rsp);
return 0;
}
@@ -3010,25 +2497,21 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
int32_t op_errno = 0;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -3041,10 +2524,10 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL);
+
return 0;
}
@@ -3052,8 +2535,8 @@ out:
int32_t
-afr_inodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata)
+afr_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_local_t *local = NULL;
@@ -3087,25 +2570,21 @@ afr_inodelk (call_frame_t *frame, xlator_t *this,
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int32_t op_errno = ENOMEM;
priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOMEM;
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -3119,18 +2598,17 @@ afr_inodelk (call_frame_t *frame, xlator_t *this,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL);
+
return 0;
}
int32_t
-afr_finodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xdata)
+afr_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_local_t *local = NULL;
@@ -3158,31 +2636,26 @@ afr_finodelk_cbk (call_frame_t *frame, void *cookie,
int32_t
-afr_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock,
- dict_t *xdata)
+afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int32_t op_errno = ENOMEM;
priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -3196,10 +2669,10 @@ afr_finodelk (call_frame_t *frame, xlator_t *this,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL);
+
return 0;
}
@@ -3232,33 +2705,28 @@ afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
-int32_t
-afr_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc,
- const char *basename, entrylk_cmd cmd, entrylk_type type,
- dict_t *xdata)
+int
+afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume,
+ loc_t *loc, const char *basename, entrylk_cmd cmd,
+ entrylk_type type, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
int32_t op_errno = 0;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -3272,18 +2740,18 @@ afr_entrylk (call_frame_t *frame, xlator_t *this,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL);
+
return 0;
}
-int32_t
-afr_fentrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata)
+int
+afr_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_local_t *local = NULL;
@@ -3310,33 +2778,28 @@ afr_fentrylk_cbk (call_frame_t *frame, void *cookie,
}
-int32_t
-afr_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd,
- const char *basename, entrylk_cmd cmd,
- entrylk_type type, dict_t *xdata)
+int
+afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int ret = -1;
int i = 0;
int32_t call_count = 0;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int32_t op_errno = ENOMEM;
priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
@@ -3350,82 +2813,85 @@ afr_fentrylk (call_frame_t *frame, xlator_t *this,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL);
+
return 0;
}
-int32_t
-afr_statfs_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct statvfs *statvfs, dict_t *xdata)
+
+int
+afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct statvfs *statvfs, dict_t *xdata)
{
afr_local_t *local = NULL;
int call_count = 0;
+ struct statvfs *buf = NULL;
LOCK (&frame->lock);
{
local = frame->local;
- if (op_ret == 0) {
- local->op_ret = op_ret;
-
- if (local->cont.statfs.buf_set) {
- if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail)
- local->cont.statfs.buf = *statvfs;
- } else {
- local->cont.statfs.buf = *statvfs;
- local->cont.statfs.buf_set = 1;
- }
- }
-
- if (op_ret == -1)
+ if (op_ret != 0) {
local->op_errno = op_errno;
-
+ goto unlock;
+ }
+
+ local->op_ret = op_ret;
+
+ buf = &local->cont.statfs.buf;
+ if (local->cont.statfs.buf_set) {
+ if (statvfs->f_bavail < buf->f_bavail) {
+ *buf = *statvfs;
+ if (xdata) {
+ if (local->xdata_rsp)
+ dict_unref (local->xdata_rsp);
+ local->xdata_rsp = dict_ref (xdata);
+ }
+ }
+ } else {
+ *buf = *statvfs;
+ local->cont.statfs.buf_set = 1;
+ if (xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ }
}
+unlock:
UNLOCK (&frame->lock);
call_count = afr_frame_return (frame);
if (call_count == 0)
AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno,
- &local->cont.statfs.buf, xdata);
+ &local->cont.statfs.buf, local->xdata_rsp);
return 0;
}
-int32_t
-afr_statfs (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xdata)
+int
+afr_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- int child_count = 0;
afr_local_t * local = NULL;
+ afr_private_t *priv = NULL;
int i = 0;
- int ret = -1;
int call_count = 0;
- int32_t op_errno = 0;
+ int32_t op_errno = ENOMEM;
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
+ priv = this->private;
- priv = this->private;
- child_count = priv->child_count;
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
- for (i = 0; i < child_count; i++) {
+ for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND (frame, afr_statfs_cbk,
priv->children[i],
@@ -3436,10 +2902,10 @@ afr_statfs (call_frame_t *frame, xlator_t *this,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL);
+
return 0;
}
@@ -3548,21 +3014,6 @@ afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN,
&local->cont.lk.ret_flock, NULL);
} else {
- /* locking has succeeded on all nodes that are up */
-
- /* temporarily
- ret = afr_mark_locked_nodes (this, local->fd,
- local->cont.lk.locked_nodes);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG,
- "Could not save locked nodes info in fdctx");
-
- ret = afr_save_locked_fd (this, local->fd);
- if (ret)
- gf_log (this->name, GF_LOG_DEBUG,
- "Could not save locked fd");
-
- */
AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
&local->cont.lk.ret_flock, NULL);
}
@@ -3578,20 +3029,12 @@ afr_lk (call_frame_t *frame, xlator_t *this,
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
int i = 0;
- int32_t op_errno = 0;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int32_t op_errno = ENOMEM;
priv = this->private;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
goto out;
local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count,
@@ -3613,28 +3056,16 @@ afr_lk (call_frame_t *frame, xlator_t *this,
priv->children[i]->fops->lk,
fd, cmd, flock, xdata);
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL);
+
return 0;
}
int
afr_forget (xlator_t *this, inode_t *inode)
{
- uint64_t ctx_addr = 0;
- afr_inode_ctx_t *ctx = NULL;
-
- inode_ctx_get (inode, this, &ctx_addr);
-
- if (!ctx_addr)
- goto out;
-
- ctx = (afr_inode_ctx_t *)(long)ctx_addr;
- GF_FREE (ctx->fresh_children);
- GF_FREE (ctx);
-out:
return 0;
}
@@ -3654,7 +3085,6 @@ afr_priv_dump (xlator_t *this)
snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
gf_proc_dump_add_section(key_prefix);
gf_proc_dump_write("child_count", "%u", priv->child_count);
- gf_proc_dump_write("read_child_rr", "%u", priv->read_child_rr);
for (i = 0; i < priv->child_count; i++) {
sprintf (key, "child_up[%d]", i);
gf_proc_dump_write(key, "%d", priv->child_up[i]);
@@ -3711,7 +3141,7 @@ afr_notify (xlator_t *this, int32_t event,
int idx = -1;
int ret = -1;
int call_psh = 0;
- int up_child = AFR_ALL_CHILDREN;
+ int up_child = -1;
dict_t *input = NULL;
dict_t *output = NULL;
@@ -3763,6 +3193,7 @@ afr_notify (xlator_t *this, int32_t event,
*/
if (priv->child_up[idx] != 1) {
priv->up_count++;
+ priv->event_generation++;
}
priv->child_up[idx] = 1;
@@ -3802,6 +3233,7 @@ afr_notify (xlator_t *this, int32_t event,
*/
if (priv->child_up[idx] == 1) {
priv->down_count++;
+ priv->event_generation++;
}
priv->child_up[idx] = 0;
@@ -3834,6 +3266,10 @@ afr_notify (xlator_t *this, int32_t event,
case GF_EVENT_TRANSLATOR_OP:
input = data;
output = data2;
+ if (!had_heard_from_all) {
+ ret = -1;
+ goto out;
+ }
ret = afr_xl_op (this, input, output);
goto out;
break;
@@ -3864,8 +3300,7 @@ afr_notify (xlator_t *this, int32_t event,
LOCK (&priv->lock);
{
- up_children = afr_up_children_count (priv->child_up,
- priv->child_count);
+ up_children = AFR_COUNT (priv->child_up, priv->child_count);
for (i = 0; i < priv->child_count; i++) {
if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
event = GF_EVENT_CHILD_UP;
@@ -3885,39 +3320,23 @@ afr_notify (xlator_t *this, int32_t event,
ret = 0;
if (propagate)
ret = default_notify (this, event, data);
- if (call_psh && priv->shd.iamshd)
- afr_proactive_self_heal ((void*) (long) up_child);
+ if (call_psh && priv->shd.iamshd) {
+ afr_selfheal_childup (this, up_child);
+ }
out:
return ret;
}
-int
-afr_first_up_child (unsigned char *child_up, size_t child_count)
-{
- int ret = -1;
- int i = 0;
-
- GF_ASSERT (child_up);
-
- for (i = 0; i < child_count; i++) {
- if (child_up[i]) {
- ret = i;
- break;
- }
- }
-
- return ret;
-}
int
afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
{
- int ret = -1;
-
local->op_ret = -1;
local->op_errno = EUCLEAN;
+ syncbarrier_init (&local->barrier);
+
local->child_up = GF_CALLOC (priv->child_count,
sizeof (*local->child_up),
gf_afr_mt_char);
@@ -3929,27 +3348,42 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
memcpy (local->child_up, priv->child_up,
sizeof (*local->child_up) * priv->child_count);
- local->call_count = afr_up_children_count (local->child_up,
- priv->child_count);
+ local->call_count = AFR_COUNT (local->child_up, priv->child_count);
if (local->call_count == 0) {
gf_log (THIS->name, GF_LOG_INFO, "no subvolumes up");
if (op_errno)
*op_errno = ENOTCONN;
goto out;
}
-
- local->child_errno = GF_CALLOC (priv->child_count,
- sizeof (*local->child_errno),
- gf_afr_mt_int32_t);
- if (!local->child_errno) {
- if (op_errno)
- *op_errno = ENOMEM;
- goto out;
- }
-
- ret = 0;
+ local->event_generation = priv->event_generation;
+
+ local->read_attempted = GF_CALLOC (priv->child_count, sizeof (char),
+ gf_afr_mt_char);
+ if (!local->read_attempted) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->readable = GF_CALLOC (priv->child_count, sizeof (char),
+ gf_afr_mt_char);
+ if (!local->readable) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
+ gf_afr_mt_reply_t);
+ if (!local->replies) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ return 0;
out:
- return ret;
+ return -1;
}
int
@@ -3958,16 +3392,6 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count,
{
int ret = -ENOMEM;
- lk->inode_locked_nodes = GF_CALLOC (sizeof (*lk->inode_locked_nodes),
- child_count, gf_afr_mt_char);
- if (NULL == lk->inode_locked_nodes)
- goto out;
-
- lk->entry_locked_nodes = GF_CALLOC (sizeof (*lk->entry_locked_nodes),
- child_count, gf_afr_mt_char);
- if (NULL == lk->entry_locked_nodes)
- goto out;
-
lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
child_count, gf_afr_mt_char);
if (NULL == lk->locked_nodes)
@@ -4026,6 +3450,21 @@ out:
}
int
+afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count)
+{
+ int ret = -ENOMEM;
+
+ lk->domain = dom;
+ lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
+ child_count, gf_afr_mt_char);
+ if (NULL == lk->locked_nodes)
+ goto out;
+ ret = 0;
+out:
+ return ret;
+}
+
+int
afr_transaction_local_init (afr_local_t *local, xlator_t *this)
{
int child_up_count = 0;
@@ -4038,14 +3477,20 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
if (ret < 0)
goto out;
+ if ((local->transaction.type == AFR_DATA_TRANSACTION) ||
+ (local->transaction.type == AFR_METADATA_TRANSACTION)) {
+ ret = afr_inodelk_init (&local->internal_lock.inodelk[0],
+ this->name, priv->child_count);
+ if (ret < 0)
+ goto out;
+ }
+
ret = -ENOMEM;
- child_up_count = afr_up_children_count (local->child_up,
- priv->child_count);
+ child_up_count = AFR_COUNT (local->child_up, priv->child_count);
if (priv->optimistic_change_log && child_up_count == priv->child_count)
local->optimistic_change_log = 1;
- local->first_up_child = afr_first_up_child (local->child_up,
- priv->child_count);
+ local->pre_op_compat = priv->pre_op_compat;
local->transaction.eager_lock =
GF_CALLOC (sizeof (*local->transaction.eager_lock),
@@ -4055,118 +3500,36 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this)
if (!local->transaction.eager_lock)
goto out;
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children)
- goto out;
-
- if (local->fd) {
- local->fd_open_on = GF_CALLOC (sizeof (*local->fd_open_on),
- priv->child_count,
- gf_afr_mt_char);
- if (!local->fd_open_on)
- goto out;
- }
-
local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op),
priv->child_count,
gf_afr_mt_char);
if (!local->transaction.pre_op)
goto out;
+ local->transaction.fop_subvols = GF_CALLOC (sizeof (*local->transaction.fop_subvols),
+ priv->child_count,
+ gf_afr_mt_char);
+ if (!local->transaction.fop_subvols)
+ goto out;
+
+ local->transaction.failed_subvols = GF_CALLOC (sizeof (*local->transaction.failed_subvols),
+ priv->child_count,
+ gf_afr_mt_char);
+ if (!local->transaction.failed_subvols)
+ goto out;
+
local->pending = afr_matrix_create (priv->child_count,
AFR_NUM_CHANGE_LOGS);
if (!local->pending)
goto out;
- local->transaction.txn_changelog = afr_matrix_create (priv->child_count,
- AFR_NUM_CHANGE_LOGS);
- if (!local->transaction.txn_changelog)
- goto out;
+ INIT_LIST_HEAD (&local->transaction.eager_locked);
+
ret = 0;
out:
return ret;
}
-void
-afr_reset_children (int32_t *fresh_children, int32_t child_count)
-{
- unsigned int i = 0;
- for (i = 0; i < child_count; i++)
- fresh_children[i] = -1;
-}
-
-int32_t*
-afr_children_create (int32_t child_count)
-{
- int32_t *children = NULL;
- int i = 0;
-
- GF_ASSERT (child_count > 0);
-
- children = GF_CALLOC (child_count, sizeof (*children),
- gf_afr_mt_int32_t);
- if (NULL == children)
- goto out;
- for (i = 0; i < child_count; i++)
- children[i] = -1;
-out:
- return children;
-}
-
-void
-afr_children_add_child (int32_t *children, int32_t child,
- int32_t child_count)
-{
- gf_boolean_t child_found = _gf_false;
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (children[i] == -1)
- break;
- if (children[i] == child) {
- child_found = _gf_true;
- break;
- }
- }
-
- if (!child_found) {
- GF_ASSERT (i < child_count);
- children[i] = child;
- }
-}
-
-void
-afr_children_rm_child (int32_t *children, int32_t child, int32_t child_count)
-{
- int i = 0;
-
- GF_ASSERT ((child >= 0) && (child < child_count));
- for (i = 0; i < child_count; i++) {
- if (children[i] == -1)
- break;
- if (children[i] == child) {
- if (i != (child_count - 1))
- memmove (children + i, children + i + 1,
- sizeof (*children)*(child_count - i - 1));
- children[child_count - 1] = -1;
- break;
- }
- }
-}
-
-int
-afr_get_children_count (int32_t *children, unsigned int child_count)
-{
- int count = 0;
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (children[i] == -1)
- break;
- count++;
- }
- return count;
-}
void
afr_set_low_priority (call_frame_t *frame)
@@ -4174,38 +3537,6 @@ afr_set_low_priority (call_frame_t *frame)
frame->root->pid = LOW_PRIO_PROC_PID;
}
-int
-afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child,
- int flags)
-{
- int ret = 0;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- GF_ASSERT (fd && fd->inode);
- ret = afr_fd_ctx_set (this, fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not set fd ctx for fd=%p", fd);
- goto out;
- }
-
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not get fd ctx for fd=%p", fd);
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
- fd_ctx->opened_on[child] = AFR_FD_OPENED;
- if (!IA_ISDIR (fd->inode->ia_type)) {
- fd_ctx->flags = flags;
- }
- ret = 0;
-out:
- return ret;
-}
gf_boolean_t
afr_have_quorum (char *logname, afr_private_t *priv)
@@ -4252,23 +3583,6 @@ afr_priv_destroy (afr_private_t *priv)
if (!priv)
goto out;
inode_unref (priv->root_inode);
- GF_FREE (priv->shd.pos);
- GF_FREE (priv->shd.pending);
- GF_FREE (priv->shd.inprogress);
-// for (i = 0; i < priv->child_count; i++)
-// if (priv->shd.timer && priv->shd.timer[i])
-// gf_timer_call_cancel (this->ctx, priv->shd.timer[i]);
- GF_FREE (priv->shd.timer);
-
- if (priv->shd.healed)
- eh_destroy (priv->shd.healed);
-
- if (priv->shd.heal_failed)
- eh_destroy (priv->shd.heal_failed);
-
- if (priv->shd.split_brain)
- eh_destroy (priv->shd.split_brain);
-
GF_FREE (priv->last_event);
if (priv->pending_key) {
for (i = 0; i < priv->child_count; i++)
@@ -4278,8 +3592,7 @@ afr_priv_destroy (afr_private_t *priv)
GF_FREE (priv->children);
GF_FREE (priv->child_up);
LOCK_DESTROY (&priv->lock);
- LOCK_DESTROY (&priv->read_child_lock);
- pthread_mutex_destroy (&priv->mutex);
+
GF_FREE (priv);
out:
return;
@@ -4295,3 +3608,22 @@ xlator_subvolume_count (xlator_t *this)
i++;
return i;
}
+
+
+void
+afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ local = frame->local;
+
+ if (!local->fd)
+ return;
+
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
+ if (!fd_ctx)
+ return;
+
+ fd_ctx->open_fd_count = local->open_fd_count;
+}
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index ce91ffba7..fa1da3958 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -37,177 +37,7 @@
#include "checksum.h"
#include "afr.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-int
-afr_examine_dir_sh_unwind (call_frame_t *frame, xlator_t *this, int32_t op_ret,
- int32_t op_errno, int32_t sh_failed)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- afr_set_opendir_done (this, local->fd->inode);
-
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd, NULL);
-
- return 0;
-}
-
-
-gf_boolean_t
-__checksums_differ (uint32_t *checksum, int child_count,
- unsigned char *child_up)
-{
- int ret = _gf_false;
- int i = 0;
- uint32_t cksum = 0;
- gf_boolean_t activate_check = _gf_false;
-
- for (i = 0; i < child_count; i++) {
- if (!child_up[i])
- continue;
- if (_gf_false == activate_check) {
- cksum = checksum[i];
- activate_check = _gf_true;
- continue;
- }
-
- if (cksum != checksum[i]) {
- ret = _gf_true;
- break;
- }
-
- cksum = checksum[i];
- }
-
- return ret;
-}
-
-
-int32_t
-afr_examine_dir_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries, dict_t *xdata)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- gf_dirent_t * entry = NULL;
- gf_dirent_t * tmp = NULL;
- char *reason = NULL;
- int child_index = 0;
- uint32_t entry_cksum = 0;
- int call_count = 0;
- off_t last_offset = 0;
- inode_t *inode = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- inode = local->fd->inode;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "%s: failed to do opendir on %s",
- local->loc.path, priv->children[child_index]->name);
- local->op_ret = -1;
- local->op_ret = op_errno;
- goto out;
- }
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "%s: no entries found in %s",
- local->loc.path, priv->children[child_index]->name);
- goto out;
- }
-
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- entry_cksum = gf_rsync_weak_checksum ((unsigned char *)entry->d_name,
- strlen (entry->d_name));
- local->cont.opendir.checksum[child_index] ^= entry_cksum;
- }
-
- list_for_each_entry (entry, &entries->list, list) {
- last_offset = entry->d_off;
- }
-
- /* read more entries */
-
- STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->readdir,
- local->fd, 131072, last_offset, NULL);
-
- return 0;
-
-out:
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (__checksums_differ (local->cont.opendir.checksum,
- priv->child_count,
- local->child_up)) {
-
- sh->do_entry_self_heal = _gf_true;
- sh->forced_merge = _gf_true;
-
- reason = "checksums of directory differ";
- afr_launch_self_heal (frame, this, inode, _gf_false,
- inode->ia_type, reason, NULL,
- afr_examine_dir_sh_unwind);
- } else {
- afr_set_opendir_done (this, inode);
-
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd, NULL);
- }
- }
-
- return 0;
-}
-
-
-int
-afr_examine_dir (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- int i = 0;
- int call_count = 0;
-
- local = frame->local;
- priv = this->private;
-
- local->cont.opendir.checksum = GF_CALLOC (priv->child_count,
- sizeof (*local->cont.opendir.checksum),
- gf_afr_mt_int32_t);
-
- call_count = afr_up_children_count (local->child_up, priv->child_count);
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_examine_dir_readdir_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->readdir,
- local->fd, 131072, 0, NULL);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
+#include "afr-transaction.h"
int32_t
@@ -215,112 +45,66 @@ afr_opendir_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
fd_t *fd, dict_t *xdata)
{
- afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- int32_t up_children_count = 0;
- int ret = -1;
int call_count = -1;
int32_t child_index = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
- priv = this->private;
local = frame->local;
+ fd_ctx = local->fd_ctx;
child_index = (long) cookie;
- up_children_count = afr_up_children_count (local->child_up,
- priv->child_count);
-
LOCK (&frame->lock);
{
- if (op_ret >= 0) {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ } else {
local->op_ret = op_ret;
- ret = afr_child_fd_ctx_set (this, fd, child_index, 0);
- if (ret) {
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
}
-
- local->op_errno = op_errno;
}
-unlock:
UNLOCK (&frame->lock);
call_count = afr_frame_return (frame);
- if (call_count == 0) {
- if (local->op_ret != 0)
- goto out;
-
- if (!afr_is_opendir_done (this, local->fd->inode) &&
- up_children_count > 1) {
-
- /*
- * This is the first opendir on this inode. We need
- * to check if the directory's entries are the same
- * on all subvolumes. This is needed in addition
- * to regular entry self-heal because the readdir
- * call is sent only to the first subvolume, and
- * thus files that exist only there will never be healed
- * otherwise (assuming changelog shows no anomalies).
- */
-
- gf_log (this->name, GF_LOG_TRACE,
- "reading contents of directory %s looking for mismatch",
- local->loc.path);
-
- afr_examine_dir (frame, this);
-
- } else {
- /* do the unwind */
- goto out;
- }
- }
-
- return 0;
-
-out:
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd, NULL);
-
+ if (call_count == 0)
+ AFR_STACK_UNWIND (opendir, frame, local->op_ret,
+ local->op_errno, local->fd, NULL);
return 0;
}
-int32_t
-afr_opendir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, fd_t *fd)
+int
+afr_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
{
afr_private_t * priv = NULL;
afr_local_t * local = NULL;
- int child_count = 0;
int i = 0;
- int ret = -1;
int call_count = -1;
- int32_t op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int32_t op_errno = ENOMEM;
+ afr_fd_ctx_t *fd_ctx = NULL;
priv = this->private;
- child_count = priv->child_count;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ goto out;
loc_copy (&local->loc, loc);
local->fd = fd_ref (fd);
+ local->fd_ctx = fd_ctx;
call_count = local->call_count;
- for (i = 0; i < child_count; i++) {
+ for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND_COOKIE (frame, afr_opendir_cbk,
(void*) (long) i,
@@ -333,352 +117,284 @@ afr_opendir (call_frame_t *frame, xlator_t *this,
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL);
-
+ AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL);
return 0;
}
-/**
- * Common algorithm for directory read calls:
- *
- * - Try the fop on the first child that is up
- * - if we have failed due to ENOTCONN:
- * try the next child
- *
- * Applicable to: readdir
- */
-
+#define BACKEND_D_OFF_BITS 63
+#define PRESENT_D_OFF_BITS 63
-struct entry_name {
- char *name;
- struct list_head list;
-};
+#define ONE 1ULL
+#define MASK (~0ULL)
+#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS))
+#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS))
+#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1))
+#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1)))
-static gf_boolean_t
-remembered_name (const char *name, struct list_head *entries)
+static uint64_t
+afr_bits_for (uint64_t num)
{
- struct entry_name *e = NULL;
- gf_boolean_t ret = _gf_false;
+ uint64_t bits = 0, ctrl = 1;
- list_for_each_entry (e, entries, list) {
- if (!strcmp (name, e->name)) {
- ret = _gf_true;
- goto out;
- }
- }
+ while (ctrl < num) {
+ ctrl *= 2;
+ bits ++;
+ }
-out:
- return ret;
+ return bits;
}
-
-static void
-afr_remember_entries (gf_dirent_t *entries, fd_t *fd)
+int
+afr_itransform (xlator_t *this, int subvol, uint64_t x, uint64_t *y_p)
{
- struct entry_name *n = NULL;
- gf_dirent_t *entry = NULL;
- int ret = 0;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- ret = fd_ctx_get (fd, THIS, &ctx);
- if (ret < 0) {
- gf_log (THIS->name, GF_LOG_INFO,
- "could not get fd ctx for fd=%p", fd);
- return;
+ afr_private_t *conf = NULL;
+ int cnt = 0;
+ int max = 0;
+ uint64_t y = 0;
+ uint64_t hi_mask = 0;
+ uint64_t off_mask = 0;
+ int max_bits = 0;
+
+ if (x == ((uint64_t) -1)) {
+ y = (uint64_t) -1;
+ goto out;
}
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ conf = this->private;
+ if (!conf)
+ goto out;
- list_for_each_entry (entry, &entries->list, list) {
- n = GF_CALLOC (1, sizeof (*n), gf_afr_mt_entry_name);
- n->name = gf_strdup (entry->d_name);
- INIT_LIST_HEAD (&n->list);
+ max = conf->child_count;
+ cnt = subvol;
- list_add (&n->list, &fd_ctx->entries);
+ if (max == 1) {
+ y = x;
+ goto out;
+ }
+
+ max_bits = afr_bits_for (max);
+
+ hi_mask = ~(PRESENT_MASK >> (max_bits + 1));
+
+ if (x & hi_mask) {
+ /* HUGE d_off */
+ off_mask = MASK << max_bits;
+ y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt;
+ } else {
+ /* small d_off */
+ y = ((x * max) + cnt);
}
+
+out:
+ if (y_p)
+ *y_p = y;
+
+ return 0;
}
-static off_t
-afr_filter_entries (gf_dirent_t *entries, fd_t *fd)
+int
+afr_deitransform (xlator_t *this, uint64_t y, int *subvol_p,
+ uint64_t *x_p)
{
- gf_dirent_t *entry = NULL;
- gf_dirent_t *tmp = NULL;
- int ret = 0;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- off_t offset = 0;
-
- ret = fd_ctx_get (fd, THIS, &ctx);
- if (ret < 0) {
- gf_log (THIS->name, GF_LOG_INFO,
- "could not get fd ctx for fd=%p", fd);
+ afr_private_t *conf = NULL;
+ int cnt = 0;
+ int max = 0;
+ uint64_t x = 0;
+ int subvol = 0;
+ int max_bits = 0;
+ uint64_t off_mask = 0;
+ uint64_t host_mask = 0;
+
+ if (!this->private)
return -1;
- }
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ conf = this->private;
+ max = conf->child_count;
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- offset = entry->d_off;
+ if (max == 1) {
+ x = y;
+ cnt = 0;
+ goto out;
+ }
- if (remembered_name (entry->d_name, &fd_ctx->entries)) {
- list_del (&entry->list);
- GF_FREE (entry);
- }
- }
-
- return offset;
-}
+ if (y & TOP_BIT) {
+ /* HUGE d_off */
+ max_bits = afr_bits_for (max);
+ off_mask = (MASK << max_bits);
+ host_mask = ~(off_mask);
+ x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS;
-static void
-afr_forget_entries (fd_t *fd)
-{
- struct entry_name *entry = NULL;
- struct entry_name *tmp = NULL;
- int ret = 0;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- ret = fd_ctx_get (fd, THIS, &ctx);
- if (ret < 0) {
- gf_log (THIS->name, GF_LOG_INFO,
- "could not get fd ctx for fd=%p", fd);
- return;
+ cnt = y & host_mask;
+ } else {
+ /* small d_off */
+ cnt = y % max;
+ x = y / max;
}
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- list_for_each_entry_safe (entry, tmp, &fd_ctx->entries, list) {
- GF_FREE (entry->name);
- list_del (&entry->list);
- GF_FREE (entry);
- }
-}
+out:
+ subvol = cnt;
+ if (subvol_p)
+ *subvol_p = subvol;
-int32_t
-afr_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries, dict_t *xdata)
-{
- AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, entries, NULL);
+ if (x_p)
+ *x_p = x;
return 0;
}
-int32_t
-afr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
- dict_t *xdata)
+static void
+afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol,
+ gf_dirent_t *entries, fd_t *fd)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int32_t next_call_child = -1;
- int ret = 0;
- int32_t *last_index = NULL;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- off_t offset = 0;
- int32_t call_child = -1;
-
- priv = this->private;
- children = priv->children;
-
- local = frame->local;
-
- read_child = (long) cookie;
- last_index = &local->cont.readdir.last_index;
- fresh_children = local->fresh_children;
-
- /* the value of the last_index changes if afr_next_call_child is
- * called. So to find the call_child of this callback use last_index
- * before the next_call_child call.
- */
- if (*last_index == -1)
- call_child = read_child;
- else
- call_child = fresh_children[*last_index];
-
- if (priv->strict_readdir) {
- ret = fd_ctx_get (local->fd, this, &ctx);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "could not get fd ctx for fd=%p", local->fd);
- op_ret = -1;
- op_errno = -ret;
- goto out;
+ afr_private_t *priv = NULL;
+ gf_dirent_t *entry = NULL;
+ gf_dirent_t *tmp = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ int gen = 0;
+
+ priv = THIS->private;
+
+ data_readable = alloca0 (priv->child_count);
+ metadata_readable = alloca0 (priv->child_count);
+
+ list_for_each_entry_safe (entry, tmp, &subvol_entries->list, list) {
+ if (__is_root_gfid (fd->inode->gfid) &&
+ !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) {
+ continue;
}
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- if (op_ret == -1) {
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index,
- read_child);
- if (next_call_child < 0)
- goto out;
- gf_log (this->name, GF_LOG_TRACE,
- "starting readdir afresh on child %d, offset %"PRId64,
- next_call_child, (uint64_t) 0);
-
- fd_ctx->failed_over = _gf_true;
-
- STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->readdirp,
- local->fd,
- local->cont.readdir.size, 0,
- local->cont.readdir.dict);
- return 0;
- }
- }
+ list_del_init (&entry->list);
+ afr_itransform (THIS, subvol, entry->d_off, &entry->d_off);
+ list_add_tail (&entry->list, &entries->list);
- if (priv->strict_readdir) {
- if (fd_ctx->failed_over) {
- if (list_empty (&entries->list)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no entries found");
- goto out;
- }
-
- offset = afr_filter_entries (entries, local->fd);
-
- afr_remember_entries (entries, local->fd);
-
- if (list_empty (&entries->list)) {
- /* All the entries we got were duplicate. We
- shouldn't send an empty list now, because
- that will make the application stop reading. So
- try to get more entries */
-
- gf_log (this->name, GF_LOG_TRACE,
- "trying to fetch non-duplicate entries "
- "from offset %"PRId64", child %s",
- offset, children[call_child]->name);
-
- STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
- (void *) (long) read_child,
- children[call_child],
- children[call_child]->fops->readdirp,
- local->fd, local->cont.readdir.size, offset,
- local->cont.readdir.dict);
- return 0;
- }
- } else {
- afr_remember_entries (entries, local->fd);
- }
- }
+ if (entry->inode) {
+ gen = 0;
+ afr_inode_read_subvol_get (entry->inode, THIS,
+ data_readable,
+ metadata_readable, &gen);
-out:
- AFR_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, NULL);
+ if (gen != priv->event_generation ||
+ !data_readable[subvol] ||
+ !metadata_readable[subvol]) {
- return 0;
+ inode_unref (entry->inode);
+ entry->inode = NULL;
+ }
+ }
+ }
}
+
int32_t
-afr_do_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset, int whichop, dict_t *dict)
+afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int ret = -1;
- int32_t op_errno = 0;
- uint64_t read_child = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- children = priv->children;
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
+ afr_local_t *local = NULL;
+ gf_dirent_t entries;
+
+ INIT_LIST_HEAD (&entries.list);
+
local = frame->local;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ if (op_ret < 0 && !local->cont.readdir.offset) {
+ /* failover only if this was first readdir, detected
+ by offset == 0 */
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
- read_child = afr_inode_get_read_ctx (this, fd->inode,
- local->fresh_children);
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.readdir.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ if (op_ret >= 0)
+ afr_readdir_transform_entries (subvol_entries, (long) cookie,
+ &entries, local->fd);
- local->fd = fd_ref (fd);
- local->cont.readdir.size = size;
- local->cont.readdir.dict = (dict)? dict_ref (dict) : NULL;
-
- if (priv->strict_readdir) {
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "could not get fd ctx for fd=%p", fd);
- op_errno = -ret;
- goto out;
- }
+ AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, xdata);
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ return 0;
+}
- if (fd_ctx->last_tried != call_child) {
- gf_log (this->name, GF_LOG_TRACE,
- "first up child has changed from %d to %d, "
- "restarting readdir from offset 0",
- fd_ctx->last_tried, call_child);
- fd_ctx->failed_over = _gf_true;
- offset = 0;
- }
+int
+afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- fd_ctx->last_tried = call_child;
- }
+ priv = this->private;
+ local = frame->local;
- if (whichop == GF_FOP_READDIR)
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (readdir, frame, local->op_ret,
+ local->op_errno, 0, 0);
+ return 0;
+ }
+
+ if (local->op == GF_FOP_READDIR)
STACK_WIND_COOKIE (frame, afr_readdir_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readdir, fd,
- size, offset, dict);
+ (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readdir,
+ local->fd, local->cont.readdir.size,
+ local->cont.readdir.offset,
+ local->xdata_req);
else
- STACK_WIND_COOKIE (frame, afr_readdirp_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readdirp, fd,
- size, offset, dict);
+ STACK_WIND_COOKIE (frame, afr_readdir_cbk,
+ (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readdirp,
+ local->fd, local->cont.readdir.size,
+ local->cont.readdir.offset,
+ local->xdata_req);
+ return 0;
+}
+
+
+int
+afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, int whichop, dict_t *dict)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ int subvol = -1;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = whichop;
+ local->fd = fd_ref (fd);
+ local->cont.readdir.size = size;
+ local->cont.readdir.offset = offset;
+ local->xdata_req = (dict)? dict_ref (dict) : NULL;
+
+ if (offset == 0) {
+ /* First readdir has option of failing over and selecting
+ an appropriate read subvolume */
+ afr_read_txn (frame, this, fd->inode, afr_readdir_wind,
+ AFR_DATA_TRANSACTION);
+ } else {
+ /* But continued readdirs MUST stick to the same subvolume
+ without an option to failover */
+ afr_deitransform (this, offset, &subvol,
+ (uint64_t *)&local->cont.readdir.offset);
+ afr_readdir_wind (frame, this, subvol);
+ }
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL);
return 0;
}
@@ -687,7 +403,8 @@ int32_t
afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, dict_t *xdata)
{
- afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata);
+ afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata);
+
return 0;
}
@@ -697,6 +414,7 @@ afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, dict_t *dict)
{
afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict);
+
return 0;
}
@@ -704,7 +422,6 @@ afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
int32_t
afr_releasedir (xlator_t *this, fd_t *fd)
{
- afr_forget_entries (fd);
afr_cleanup_fd_ctx (this, fd);
return 0;
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index 49cf8d453..465dde54f 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -34,10 +34,14 @@
#include "common-utils.h"
#include "compat-errno.h"
#include "compat.h"
+#include "byte-order.h"
#include "afr.h"
#include "afr-transaction.h"
+void
+afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this);
+
int
afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno)
{
@@ -56,290 +60,429 @@ afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno)
*op_errno = ENOMEM;
goto out;
}
- parent->path = gf_strdup( dirname (child_path) );
- if (!parent->path) {
+
+ parent->path = gf_strdup (dirname (child_path));
+ if (!parent->path) {
if (op_errno)
*op_errno = ENOMEM;
goto out;
}
- parent->inode = inode_ref (child->parent);
- uuid_copy (parent->gfid, child->pargfid);
+
+ parent->inode = inode_ref (child->parent);
+ uuid_copy (parent->gfid, child->pargfid);
ret = 0;
out:
- GF_FREE(child_path);
+ GF_FREE (child_path);
return ret;
}
-void
-__dir_entry_fop_common_cbk (call_frame_t *frame, int child_index,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, struct iatt *prenewparent,
- struct iatt *postnewparent)
+
+static void
+__afr_dir_write_finalize (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int inode_read_subvol = -1;
+ int parent_read_subvol = -1;
+ int parent2_read_subvol = -1;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (local->inode) {
+ afr_replies_interpret (frame, this, local->inode);
+ inode_read_subvol = afr_data_subvol_get (local->inode, this,
+ NULL, NULL);
+ }
+ if (local->parent)
+ parent_read_subvol = afr_data_subvol_get (local->parent, this,
+ NULL, NULL);
+ if (local->parent2)
+ parent2_read_subvol = afr_data_subvol_get (local->parent2, this,
+ NULL, NULL);
+
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno (local, priv);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0) {
+ if (local->inode)
+ afr_inode_read_subvol_reset (local->inode,
+ this);
+ if (local->parent)
+ afr_inode_read_subvol_reset (local->parent,
+ this);
+ if (local->parent2)
+ afr_inode_read_subvol_reset (local->parent2,
+ this);
+ continue;
+ }
+
+ if (local->op_ret == -1) {
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+
+ local->cont.dir_fop.buf =
+ local->replies[i].poststat;
+ local->cont.dir_fop.preparent =
+ local->replies[i].preparent;
+ local->cont.dir_fop.postparent =
+ local->replies[i].postparent;
+ local->cont.dir_fop.prenewparent =
+ local->replies[i].preparent2;
+ local->cont.dir_fop.postnewparent =
+ local->replies[i].postparent2;
+ if (local->replies[i].xdata)
+ local->xdata_rsp =
+ dict_ref (local->replies[i].xdata);
+ continue;
+ }
+
+ if (i == inode_read_subvol) {
+ local->cont.dir_fop.buf =
+ local->replies[i].poststat;
+ if (local->replies[i].xdata) {
+ if (local->xdata_rsp)
+ dict_unref (local->xdata_rsp);
+ local->xdata_rsp =
+ dict_ref (local->replies[i].xdata);
+ }
+ }
+
+ if (i == parent_read_subvol) {
+ local->cont.dir_fop.preparent =
+ local->replies[i].preparent;
+ local->cont.dir_fop.postparent =
+ local->replies[i].postparent;
+ }
+
+ if (i == parent2_read_subvol) {
+ local->cont.dir_fop.prenewparent =
+ local->replies[i].preparent2;
+ local->cont.dir_fop.postnewparent =
+ local->replies[i].postparent2;
+ }
+ }
+}
+
+
+static void
+__afr_dir_write_fill (call_frame_t *frame, xlator_t *this, int child_index,
+ int op_ret, int op_errno, struct iatt *poststat,
+ struct iatt *preparent, struct iatt *postparent,
+ struct iatt *preparent2, struct iatt *postparent2,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
local = frame->local;
+ fd_ctx = local->fd_ctx;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+
+ if (op_ret >= 0) {
+ if (poststat)
+ local->replies[child_index].poststat = *poststat;
+ if (preparent)
+ local->replies[child_index].preparent = *preparent;
+ if (postparent)
+ local->replies[child_index].postparent = *postparent;
+ if (preparent2)
+ local->replies[child_index].preparent2 = *preparent2;
+ if (postparent2)
+ local->replies[child_index].postparent2 = *postparent2;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref (xdata);
+
+ if (fd_ctx)
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ } else {
+ if (op_errno != ENOTEMPTY)
+ afr_transaction_fop_failed (frame, this, child_index);
+ if (fd_ctx)
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ }
+
+ return;
+}
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
- if (op_ret > -1) {
- local->op_ret = op_ret;
+static int
+__afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent,
+ struct iatt *preparent2, struct iatt *postparent2,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
- if ((local->success_count == 0) ||
- (child_index == local->read_child_index)) {
- local->cont.dir_fop.preparent = *preparent;
- local->cont.dir_fop.postparent = *postparent;
- if (buf)
- local->cont.dir_fop.buf = *buf;
- if (prenewparent)
- local->cont.dir_fop.prenewparent = *prenewparent;
- if (postnewparent)
- local->cont.dir_fop.postnewparent = *postnewparent;
- }
+ local = frame->local;
- local->cont.dir_fop.inode = inode;
+ LOCK (&frame->lock);
+ {
+ __afr_dir_write_fill (frame, this, child_index, op_ret,
+ op_errno, buf, preparent, postparent,
+ preparent2, postparent2, xdata);
+ }
+ UNLOCK (&frame->lock);
+ call_count = afr_frame_return (frame);
- local->fresh_children[local->success_count] = child_index;
- local->success_count++;
- }
+ if (call_count == 0) {
+ __afr_dir_write_finalize (frame, this);
- local->op_errno = op_errno;
-}
+ if (afr_txn_nothing_failed (frame, this))
+ local->transaction.unwind (frame, this);
-void
-afr_dir_fop_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_mark_entry_pending_changelog (frame, this);
- local = frame->local;
- priv = this->private;
+ local->transaction.resume (frame, this);
+ }
- if (local->cont.dir_fop.inode == NULL)
- goto done;
- afr_set_read_ctx_from_policy (this, local->cont.dir_fop.inode,
- local->fresh_children,
- local->read_child_index,
- priv->read_child,
- local->cont.dir_fop.buf.ia_gfid);
-done:
- local->transaction.unwind (frame, this);
- local->transaction.resume (frame, this);
+ return 0;
}
-/* {{{ create */
int
-afr_create_unwind (call_frame_t *frame, xlator_t *this)
+afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
+ int call_count = 0;
- local = frame->local;
+ call_count = afr_frame_return (frame);
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (create, main_frame,
- local->op_ret, local->op_errno,
- local->cont.create.fd,
- local->cont.dir_fop.inode,
- &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- local->xdata_rsp);
- }
+ if (call_count == 0)
+ AFR_STACK_DESTROY (frame);
return 0;
}
-int
-afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- fd_t *fd, inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
+void
+afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int ret = 0;
- int call_count = -1;
- int child_index = -1;
+ call_frame_t *new_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_local_t *new_local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int32_t **changelog = NULL;
+ int i = 0;
+ int idx = 0;
+ int op_errno = ENOMEM;
+ unsigned char *pending = NULL;
+ int call_count = 0;
local = frame->local;
+ priv = this->private;
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret > -1) {
- ret = afr_fd_ctx_set (this, fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not set ctx on fd=%p", fd);
-
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
-
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "could not get fd ctx for fd=%p", fd);
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
- fd_ctx->flags = local->cont.create.flags;
-
- if (local->success_count == 0) {
- if (xdata)
- local->xdata_rsp = dict_ref(xdata);
- }
- }
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, inode, buf,
- preparent, postparent, NULL, NULL);
- }
+ new_frame = copy_frame (frame);
+ if (!new_frame)
+ goto out;
-unlock:
- UNLOCK (&frame->lock);
+ new_local = AFR_FRAME_INIT (new_frame, op_errno);
+ if (!new_local)
+ goto out;
- call_count = afr_frame_return (frame);
+ changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!changelog)
+ goto out;
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
+ xattr = dict_new ();
+ if (!xattr)
+ goto out;
- return 0;
+ idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
+
+ pending = alloca0 (priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] &&
+ !local->transaction.failed_subvols[i]) {
+ call_count ++;
+ continue;
+ }
+
+ changelog[i][idx] = hton32(1);
+ pending[i] = 1;
+ }
+
+ new_local->pending = changelog;
+ uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid);
+ new_local->loc.inode = inode_ref (local->inode);
+
+
+ afr_set_pending_dict (priv, xattr, changelog);
+
+ new_local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (pending[i])
+ continue;
+
+ STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &new_local->loc, GF_XATTROP_ADD_ARRAY,
+ xattr, NULL);
+ if (!--call_count)
+ break;
+ }
+
+ new_frame = NULL;
+out:
+ if (new_frame)
+ AFR_STACK_DESTROY (new_frame);
+ if (xattr)
+ dict_unref (xattr);
+ return;
}
-int
-afr_create_wind (call_frame_t *frame, xlator_t *this)
+void
+afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
+ int pre_op_count = 0;
+ int failed_count = 0;
local = frame->local;
- priv = this->private;
+ priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
+ if (local->op_ret < 0)
+ return;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD)
+ return;
- local->call_count = call_count;
+ pre_op_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);
+ failed_count = AFR_COUNT (local->transaction.failed_subvols,
+ priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_create_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->create,
- &local->loc,
- local->cont.create.flags,
- local->cont.create.mode,
- local->umask,
- local->cont.create.fd,
- local->xdata_req);
- if (!--call_count)
- break;
- }
- }
+ if (pre_op_count == priv->child_count && !failed_count)
+ return;
- return 0;
+ afr_mark_new_entry_changelog (frame, this);
+
+ return;
}
+/* {{{ create */
+
int
-afr_create_done (call_frame_t *frame, xlator_t *this)
+afr_create_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
local = frame->local;
- local->transaction.unwind (frame, this);
+ main_frame = afr_transaction_detach_fop_frame (frame);
- AFR_STACK_DESTROY (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (create, main_frame, local->op_ret, local->op_errno,
+ local->cont.create.fd, local->inode,
+ &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
return 0;
}
int
-afr_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- mode_t umask, fd_t *fd, dict_t *params)
+afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ fd_t *fd, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
+}
+
+
+int
+afr_create_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_create_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->create,
+ &local->loc, local->cont.create.flags,
+ local->cont.create.mode, local->umask,
+ local->cont.create.fd, local->xdata_req);
+ return 0;
+}
+
+
+int
+afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(create,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
loc_copy (&local->loc, loc);
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ local->fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!local->fd_ctx)
+ goto out;
+
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
+ local->op = GF_FOP_CREATE;
local->cont.create.flags = flags;
local->cont.create.mode = mode;
local->cont.create.fd = fd_ref (fd);
local->umask = umask;
- if (params)
- local->xdata_req = dict_ref (params);
- local->transaction.fop = afr_create_wind;
- local->transaction.done = afr_create_done;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_create_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_create_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
@@ -349,22 +492,30 @@ afr_create (call_frame_t *frame, xlator_t *this,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+ int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
op_errno = -ret;
goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (create, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL, NULL);
return 0;
}
@@ -380,25 +531,14 @@ afr_mknod_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (mknod, main_frame,
- local->op_ret, local->op_errno,
- local->cont.dir_fop.inode,
- &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (mknod, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
return 0;
}
@@ -409,129 +549,72 @@ afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *buf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- int call_count = -1;
- int child_index = -1;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, inode, buf,
- preparent, postparent, NULL, NULL);
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-int32_t
-afr_mknod_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_mknod_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->mknod,
- &local->loc, local->cont.mknod.mode,
- local->cont.mknod.dev,
- local->umask,
- local->xdata_req);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int
-afr_mknod_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
- AFR_STACK_DESTROY (frame);
-
+ STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->mknod,
+ &local->loc, local->cont.mknod.mode,
+ local->cont.mknod.dev, local->umask,
+ local->xdata_req);
return 0;
}
-
int
afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dev_t dev, mode_t umask, dict_t *params)
+ dev_t dev, mode_t umask, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(mknod,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
-
+ local->op = GF_FOP_MKNOD;
local->cont.mknod.mode = mode;
local->cont.mknod.dev = dev;
local->umask = umask;
- if (params)
- local->xdata_req = dict_ref (params);
- local->transaction.fop = afr_mknod_wind;
- local->transaction.done = afr_mknod_done;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_mknod_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_mknod_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
@@ -541,22 +624,30 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+ int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (mknod, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
return 0;
}
@@ -573,25 +664,14 @@ afr_mkdir_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (mkdir, main_frame,
- local->op_ret, local->op_errno,
- local->cont.dir_fop.inode,
- &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (mkdir, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
return 0;
}
@@ -602,129 +682,71 @@ afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *buf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- int call_count = -1;
- int child_index = -1;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, inode, buf,
- preparent, postparent, NULL, NULL);
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
int
-afr_mkdir_wind (call_frame_t *frame, xlator_t *this)
+afr_mkdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->mkdir,
- &local->loc, local->cont.mkdir.mode,
- local->umask,
- local->xdata_req);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int
-afr_mkdir_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
+ STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->mkdir, &local->loc,
+ local->cont.mkdir.mode, local->umask,
+ local->xdata_req);
return 0;
}
int
-afr_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, mode_t umask, dict_t *params)
+afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(mkdir,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
loc_copy (&local->loc, loc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
local->cont.mkdir.mode = mode;
local->umask = umask;
- if (params)
- local->xdata_req = dict_ref (params);
- local->transaction.fop = afr_mkdir_wind;
- local->transaction.done = afr_mkdir_done;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_MKDIR;
+ local->transaction.wind = afr_mkdir_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_mkdir_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
@@ -734,23 +756,30 @@ afr_mkdir (call_frame_t *frame, xlator_t *this,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+ int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
-
- AFR_STACK_UNWIND (mkdir, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
return 0;
}
@@ -767,25 +796,14 @@ afr_link_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (link, main_frame,
- local->op_ret, local->op_errno,
- local->cont.dir_fop.inode,
- &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (link, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
return 0;
}
@@ -796,124 +814,70 @@ afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *buf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- int call_count = -1;
- int child_index = -1;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, inode, buf,
- preparent, postparent, NULL, NULL);
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
int
-afr_link_wind (call_frame_t *frame, xlator_t *this)
+afr_link_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->link,
- &local->loc,
- &local->newloc, local->xdata_req);
-
- if (!--call_count)
- break;
- }
- }
-
+ STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->link,
+ &local->loc, &local->newloc, local->xdata_req);
return 0;
}
int
-afr_link_done (call_frame_t *frame, xlator_t *this)
+afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-
-int
-afr_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc, dict_t *xdata)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(link,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
loc_copy (&local->loc, oldloc);
loc_copy (&local->newloc, newloc);
+
+ local->inode = inode_ref (oldloc->inode);
+ local->parent = inode_ref (newloc->parent);
+
if (xdata)
- local->xdata_req = dict_ref (xdata);
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_LINK;
- local->transaction.fop = afr_link_wind;
- local->transaction.done = afr_link_done;
+ local->transaction.wind = afr_link_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_link_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc,
@@ -923,22 +887,30 @@ afr_link (call_frame_t *frame, xlator_t *this,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (newloc->path);
+ int_lock = &local->internal_lock;
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (link, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
return 0;
}
@@ -955,25 +927,14 @@ afr_symlink_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (symlink, main_frame,
- local->op_ret, local->op_errno,
- local->cont.dir_fop.inode,
- &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (symlink, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
return 0;
}
@@ -984,130 +945,71 @@ afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *buf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- int call_count = -1;
- int child_index = -1;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, inode, buf,
- preparent, postparent, NULL, NULL);
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
int
-afr_symlink_wind (call_frame_t *frame, xlator_t *this)
+afr_symlink_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->symlink,
- local->cont.symlink.linkpath,
- &local->loc,
- local->umask,
- local->xdata_req);
-
- if (!--call_count)
- break;
-
- }
- }
-
+ STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->symlink,
+ local->cont.symlink.linkpath, &local->loc,
+ local->umask, local->xdata_req);
return 0;
}
int
-afr_symlink_done (call_frame_t *frame, xlator_t *this)
+afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata)
{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-
-int
-afr_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkpath, loc_t *loc, mode_t umask, dict_t *params)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(symlink,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
loc_copy (&local->loc, loc);
-
- LOCK (&priv->read_child_lock);
- {
- local->read_child_index = (++priv->read_child_rr)
- % (priv->child_count);
- }
- UNLOCK (&priv->read_child_lock);
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
local->cont.symlink.linkpath = gf_strdup (linkpath);
local->umask = umask;
- if (params)
- local->xdata_req = dict_ref (params);
- local->transaction.fop = afr_symlink_wind;
- local->transaction.done = afr_symlink_done;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_SYMLINK;
+ local->transaction.wind = afr_symlink_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_symlink_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
@@ -1117,22 +1019,30 @@ afr_symlink (call_frame_t *frame, xlator_t *this,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+ int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (symlink, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (symlink, frame, -1, op_errno, NULL, NULL, NULL,
+ NULL, NULL);
return 0;
}
@@ -1148,26 +1058,16 @@ afr_rename_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (rename, main_frame,
- local->op_ret, local->op_errno,
- &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- &local->cont.dir_fop.prenewparent,
- &local->cont.dir_fop.postnewparent,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (rename, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent,
+ &local->cont.dir_fop.prenewparent,
+ &local->cont.dir_fop.postnewparent, local->xdata_rsp);
return 0;
}
@@ -1179,127 +1079,72 @@ afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *prenewparent, struct iatt *postnewparent,
dict_t *xdata)
{
- afr_local_t * local = NULL;
- int call_count = -1;
- int child_index = -1;
-
- local = frame->local;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (afr_fop_failed (op_ret, op_errno) && op_errno != ENOTEMPTY)
- afr_transaction_fop_failed (frame, this, child_index);
- local->op_errno = op_errno;
-
- if (op_ret > -1)
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, NULL, buf,
- preoldparent, postoldparent,
- prenewparent, postnewparent);
-
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
+ preoldparent, postoldparent, prenewparent,
+ postnewparent, xdata);
}
-int32_t
-afr_rename_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_rename_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_rename_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->rename,
- &local->loc,
- &local->newloc, NULL);
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int
-afr_rename_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = frame->local;
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
+ local = frame->local;
+ priv = this->private;
+ STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->rename,
+ &local->loc, &local->newloc, local->xdata_req);
return 0;
}
int
-afr_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+ int nlockee = 0;
priv = this->private;
QUORUM_CHECK(rename,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
+ if (!transaction_frame)
op_errno = ENOMEM;
- goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
loc_copy (&local->loc, oldloc);
loc_copy (&local->newloc, newloc);
- local->read_child_index = afr_inode_get_read_ctx (this, oldloc->inode, NULL);
+ local->inode = inode_ref (oldloc->inode);
+ local->parent = inode_ref (oldloc->parent);
+ local->parent2 = inode_ref (newloc->parent);
- local->transaction.fop = afr_rename_wind;
- local->transaction.done = afr_rename_done;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_RENAME;
+ local->transaction.wind = afr_rename_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_rename_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc,
@@ -1314,23 +1159,52 @@ afr_rename (call_frame_t *frame, xlator_t *this,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (oldloc->path);
local->transaction.new_basename = AFR_BASENAME (newloc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = nlockee = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->transaction.new_parent_loc,
+ local->transaction.new_basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ if (local->newloc.inode && IA_ISDIR (local->newloc.inode->ia_type)) {
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->newloc,
+ NULL,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ }
+ qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee),
+ afr_entry_lockee_cmp);
+ int_lock->lockee_count = nlockee;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
-
- AFR_STACK_UNWIND (rename, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL, NULL);
return 0;
}
@@ -1346,23 +1220,13 @@ afr_unlink_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (unlink, main_frame,
- local->op_ret, local->op_errno,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (unlink, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
return 0;
}
@@ -1372,121 +1236,69 @@ afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- afr_local_t * local = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (child_index == local->read_child_index) {
- local->read_child_returned = _gf_true;
- }
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, NULL, NULL,
- preparent, postparent, NULL, NULL);
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL,
+ preparent, postparent, NULL, NULL, xdata);
}
-int32_t
-afr_unlink_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_unlink_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->unlink,
- &local->loc, local->xflag,
- local->xdata_req);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int32_t
-afr_unlink_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
+ STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->unlink,
+ &local->loc, local->xflag, local->xdata_req);
return 0;
}
-int32_t
-afr_unlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int xflag, dict_t *xdata)
+int
+afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(unlink,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
loc_copy (&local->loc, loc);
local->xflag = xflag;
+
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
+
if (xdata)
- local->xdata_req = dict_ref (xdata);
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
- local->transaction.fop = afr_unlink_wind;
- local->transaction.done = afr_unlink_done;
+ local->op = GF_FOP_UNLINK;
+ local->transaction.wind = afr_unlink_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_unlink_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
@@ -1496,22 +1308,29 @@ afr_unlink (call_frame_t *frame, xlator_t *this,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+ int_lock->lockee_count++;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (unlink, frame, -1, op_errno,
- NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
@@ -1529,23 +1348,13 @@ afr_rmdir_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame) {
- main_frame = local->transaction.main_frame;
- }
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (rmdir, main_frame,
- local->op_ret, local->op_errno,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (rmdir, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
return 0;
}
@@ -1555,126 +1364,71 @@ afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- afr_local_t * local = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
- int read_child = 0;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
- if (afr_fop_failed (op_ret, op_errno) && (op_errno != ENOTEMPTY))
- afr_transaction_fop_failed (frame, this, child_index);
- local->op_errno = op_errno;
- if (op_ret > -1)
- __dir_entry_fop_common_cbk (frame, child_index, this,
- op_ret, op_errno, NULL, NULL,
- preparent, postparent, NULL,
- NULL);
-
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- afr_dir_fop_done (frame, this);
-
- return 0;
+ return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL,
+ preparent, postparent, NULL, NULL, xdata);
}
int
-afr_rmdir_wind (call_frame_t *frame, xlator_t *this)
+afr_rmdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->rmdir,
- &local->loc, local->cont.rmdir.flags,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
-
+ STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->rmdir,
+ &local->loc, local->cont.rmdir.flags, local->xdata_req);
return 0;
}
int
-afr_rmdir_done (call_frame_t *frame, xlator_t *this)
+afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-
-int
-afr_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags, dict_t *xdata)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t * transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+ int nlockee = 0;
priv = this->private;
QUORUM_CHECK(rmdir,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
- local->cont.rmdir.flags = flags;
loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+ local->parent = inode_ref (loc->parent);
- local->transaction.fop = afr_rmdir_wind;
- local->transaction.done = afr_rmdir_done;
+ local->cont.rmdir.flags = flags;
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_RMDIR;
+ local->transaction.wind = afr_rmdir_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_rmdir_unwind;
ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
@@ -1684,21 +1438,41 @@ afr_rmdir (call_frame_t *frame, xlator_t *this,
local->transaction.main_frame = frame;
local->transaction.basename = AFR_BASENAME (loc->path);
+ int_lock = &local->internal_lock;
+
+ int_lock->lockee_count = nlockee = 0;
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
+ &local->loc,
+ NULL,
+ priv->child_count);
+ if (ret)
+ goto out;
+
+ nlockee++;
+ qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee),
+ afr_entry_lockee_cmp);
+ int_lock->lockee_count = nlockee;
ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index 4679e4610..4cb219246 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -35,237 +35,153 @@
#include "compat-errno.h"
#include "compat.h"
-/**
- * Common algorithm for inode read calls:
- *
- * - Try the fop on the first child that is up
- * - if we have failed due to ENOTCONN:
- * try the next child
- *
- * Applicable to: access, stat, fstat, readlink, getxattr
- */
+#include "afr-transaction.h"
+
/* {{{ access */
-int32_t
-afr_access_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata)
+int
+afr_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
-
- priv = this->private;
- children = priv->children;
+ afr_local_t *local = NULL;
local = frame->local;
- read_child = (long) cookie;
-
- if (op_ret == -1) {
- last_index = &local->cont.access.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
-
- unwind = 0;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- STACK_WIND_COOKIE (frame, afr_access_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->access,
- &local->loc, local->cont.access.mask,
- NULL);
- }
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
-out:
- if (unwind) {
- AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata);
- }
+ AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata);
return 0;
}
-int32_t
-afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
- dict_t *xdata)
+int
+afr_access_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t *priv = NULL;
- xlator_t **children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
- int32_t op_errno = 0;
- int32_t read_child = -1;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
-
- children = priv->children;
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
-
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (access, frame, local->op_ret,
+ local->op_errno, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_access_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->access,
+ &local->loc, local->cont.access.mask,
+ local->xdata_req);
+ return 0;
+}
+int
+afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int mask, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int op_errno = 0;
- read_child = afr_inode_get_read_ctx (this, loc->inode,
- local->fresh_children);
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.access.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- loc_copy (&local->loc, loc);
- local->cont.access.mask = mask;
+ local->op = GF_FOP_ACCESS;
+ loc_copy (&local->loc, loc);
+ local->cont.access.mask = mask;
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- STACK_WIND_COOKIE (frame, afr_access_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->access,
- loc, mask, xdata);
+ afr_read_txn (frame, this, loc->inode, afr_access_wind,
+ AFR_METADATA_TRANSACTION);
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL);
+
return 0;
}
-
/* }}} */
/* {{{ stat */
-int32_t
+int
afr_stat_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
struct iatt *buf, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
-
- priv = this->private;
- children = priv->children;
-
- read_child = (long) cookie;
+ afr_local_t *local = NULL;
local = frame->local;
- if (op_ret == -1) {
- last_index = &local->cont.stat.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- unwind = 0;
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
- STACK_WIND_COOKIE (frame, afr_stat_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->stat,
- &local->loc, NULL);
- }
-
-out:
- if (unwind) {
- AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata);
- }
+ AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata);
return 0;
}
-int32_t
-afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+int
+afr_stat_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- xlator_t **children = NULL;
- int call_child = 0;
- int32_t op_errno = 0;
- int32_t read_child = -1;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
-
- children = priv->children;
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno,
+ 0, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->stat,
+ &local->loc, local->xdata_req);
+ return 0;
+}
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+int
+afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int op_errno = 0;
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- read_child = afr_inode_get_read_ctx (this, loc->inode,
- local->fresh_children);
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.stat.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
- loc_copy (&local->loc, loc);
+ local->op = GF_FOP_STAT;
+ loc_copy (&local->loc, loc);
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->stat,
- loc, xdata);
+ afr_read_txn (frame, this, loc->inode, afr_stat_wind,
+ AFR_DATA_TRANSACTION);
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL);
return 0;
}
@@ -275,52 +191,49 @@ out:
/* {{{ fstat */
-int32_t
+int
afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *buf,
dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- xlator_t **children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
-
- priv = this->private;
- children = priv->children;
+ afr_local_t *local = NULL;
local = frame->local;
- read_child = (long) cookie;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- if (op_ret == -1) {
- last_index = &local->cont.fstat.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
- unwind = 0;
+ AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata);
- STACK_WIND_COOKIE (frame, afr_fstat_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->fstat,
- local->fd, NULL);
- }
+ return 0;
+}
-out:
- if (unwind) {
- AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata);
- }
- return 0;
+int
+afr_fstat_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (fstat, frame, local->op_ret, local->op_errno,
+ 0, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fstat,
+ local->fd, local->xdata_req);
+ return 0;
}
@@ -328,69 +241,26 @@ int32_t
afr_fstat (call_frame_t *frame, xlator_t *this,
fd_t *fd, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- xlator_t **children = NULL;
- int call_child = 0;
- int32_t op_errno = 0;
- int32_t read_child = 0;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (fd, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
-
- children = priv->children;
-
- VALIDATE_OR_GOTO (fd->inode, out);
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
-
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
-
- read_child = afr_inode_get_read_ctx (this, fd->inode,
- local->fresh_children);
-
+ afr_local_t *local = NULL;
+ int op_errno = 0;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.fstat.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local->op = GF_FOP_FSTAT;
+ local->fd = fd_ref (fd);
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- local->fd = fd_ref (fd);
+ afr_fix_open (fd, this);
- ret = afr_open_fd_fix (frame, this, _gf_false);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
- STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->fstat,
- fd, xdata);
+ afr_read_txn (frame, this, fd->inode, afr_fstat_wind,
+ AFR_DATA_TRANSACTION);
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL);
return 0;
}
@@ -399,115 +269,77 @@ out:
/* {{{ readlink */
-int32_t
+int
afr_readlink_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
const char *buf, struct iatt *sbuf, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
-
- priv = this->private;
- children = priv->children;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- read_child = (long) cookie;
-
- if (op_ret == -1) {
- last_index = &local->cont.readlink.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- unwind = 0;
- STACK_WIND_COOKIE (frame, afr_readlink_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->readlink,
- &local->loc,
- local->cont.readlink.size, NULL);
- }
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
-out:
- if (unwind) {
- AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, sbuf,
- xdata);
- }
+ AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno,
+ buf, sbuf, xdata);
+ return 0;
+}
- return 0;
+int
+afr_readlink_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (readlink, frame, local->op_ret,
+ local->op_errno, 0, 0, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_readlink_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readlink,
+ &local->loc, local->cont.readlink.size,
+ local->xdata_req);
+ return 0;
}
-int32_t
+int
afr_readlink (call_frame_t *frame, xlator_t *this,
loc_t *loc, size_t size, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- xlator_t **children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
+ afr_local_t * local = NULL;
int32_t op_errno = 0;
- int32_t read_child = -1;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
- children = priv->children;
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
-
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
- read_child = afr_inode_get_read_ctx (this, loc->inode,
- local->fresh_children);
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.readlink.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+ local->op = GF_FOP_READLINK;
loc_copy (&local->loc, loc);
+ local->cont.readlink.size = size;
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- local->cont.readlink.size = size;
+ afr_read_txn (frame, this, loc->inode, afr_readlink_wind,
+ AFR_DATA_TRANSACTION);
- STACK_WIND_COOKIE (frame, afr_readlink_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readlink,
- loc, size, xdata);
-
- ret = 0;
-out:
- if (ret < 0)
- AFR_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
+out:
+ AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0);
+
+ return 0;
}
@@ -545,7 +377,7 @@ __gather_xattr_keys (dict_t *dict, char *key, data_t *value,
void
-__filter_xattrs (dict_t *dict)
+afr_filter_xattrs (dict_t *dict)
{
struct list_head keys = {0,};
struct _xattr_key *key = NULL;
@@ -566,59 +398,56 @@ __filter_xattrs (dict_t *dict)
}
-
-int32_t
+int
afr_getxattr_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
dict_t *dict, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
-
- priv = this->private;
- children = priv->children;
+ afr_local_t *local = NULL;
local = frame->local;
- read_child = (long) cookie;
-
- if (op_ret == -1) {
- last_index = &local->cont.getxattr.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- unwind = 0;
- STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->getxattr,
- &local->loc,
- local->cont.getxattr.name,
- NULL);
- }
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
-out:
- if (unwind) {
- if (op_ret >= 0 && dict)
- __filter_xattrs (dict);
+ if (dict)
+ afr_filter_xattrs (dict);
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata);
- }
+ AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata);
return 0;
}
+
+int
+afr_getxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (getxattr, frame, local->op_ret,
+ local->op_errno, NULL, NULL);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_getxattr_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->getxattr,
+ &local->loc, local->cont.getxattr.name,
+ local->xdata_req);
+ return 0;
+}
+
+
int32_t
afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno,
dict_t *dict, dict_t *xdata)
@@ -629,6 +458,94 @@ afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno,
}
int32_t
+afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t **children = NULL;
+ dict_t *xattr = NULL;
+ char *tmp_report = NULL;
+ char lk_summary[1024] = {0,};
+ int serz_len = 0;
+ int32_t callcnt = 0;
+ long int cky = 0;
+ int ret = 0;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+ cky = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+ if (op_ret == -1)
+ local->replies[cky].op_errno = op_errno;
+
+ if (!local->dict)
+ local->dict = dict_new ();
+ if (local->dict) {
+ ret = dict_get_str (dict, local->cont.getxattr.name,
+ &tmp_report);
+ if (ret)
+ goto unlock;
+ ret = dict_set_dynstr (local->dict,
+ children[cky]->name,
+ gf_strdup (tmp_report));
+ if (ret)
+ goto unlock;
+ }
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ xattr = dict_new ();
+ if (!xattr) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ ret = dict_serialize_value_with_delim (local->dict,
+ lk_summary,
+ &serz_len, '\n');
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Error serializing dictionary");
+ goto unwind;
+ }
+ if (serz_len == -1)
+ snprintf (lk_summary, sizeof (lk_summary),
+ "No locks cleared.");
+ ret = dict_set_dynstr (xattr, local->cont.getxattr.name,
+ gf_strdup (lk_summary));
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Error setting dictionary");
+ goto unwind;
+ }
+
+ unwind:
+ // Updating child_errno with more recent 'events'
+ op_errno = afr_final_errno (local, priv);
+
+ AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr,
+ xdata);
+ if (xattr)
+ dict_unref (xattr);
+ }
+
+ return ret;
+}
+
+int32_t
afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
dict_t *dict, dict_t *xdata)
@@ -654,7 +571,7 @@ afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie,
{
callcnt = --local->call_count;
if (op_ret == -1)
- local->child_errno[cky] = op_errno;
+ local->replies[cky].op_errno = op_errno;
if (!local->dict)
local->dict = dict_new ();
@@ -705,9 +622,8 @@ unlock:
unwind:
// Updating child_errno with more recent 'events'
- local->child_errno[cky] = op_errno;
- op_errno = afr_resultant_errno_get (NULL, local->child_errno,
- priv->child_count);
+ op_errno = afr_final_errno (local, priv);
+
AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata);
if (xattr)
@@ -762,12 +678,372 @@ afr_getxattr_node_uuid_cbk (call_frame_t *frame, void *cookie,
unwind:
if (unwind)
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL);
+ AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict,
+ NULL);
+
+ return 0;
+}
+
+int32_t
+afr_getxattr_lockinfo_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
+{
+ int call_cnt = 0, len = 0;
+ char *lockinfo_buf = NULL;
+ dict_t *lockinfo = NULL, *newdict = NULL;
+ afr_local_t *local = NULL;
+
+ LOCK (&frame->lock);
+ {
+ local = frame->local;
+
+ call_cnt = --local->call_count;
+
+ if ((op_ret < 0) || (!dict && !xdata)) {
+ goto unlock;
+ }
+
+ if (xdata) {
+ if (!local->xdata_rsp) {
+ local->xdata_rsp = dict_new ();
+ if (!local->xdata_rsp) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
+ }
+ }
+
+ if (!dict) {
+ goto unlock;
+ }
+
+ op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY,
+ (void **)&lockinfo_buf, &len);
+
+ if (!lockinfo_buf) {
+ goto unlock;
+ }
+
+ if (!local->dict) {
+ local->dict = dict_new ();
+ if (!local->dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
+ }
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ if (lockinfo_buf != NULL) {
+ lockinfo = dict_new ();
+ if (lockinfo == NULL) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ } else {
+ op_ret = dict_unserialize (lockinfo_buf, len,
+ &lockinfo);
+
+ if (lockinfo && local->dict) {
+ dict_copy (lockinfo, local->dict);
+ }
+ }
+ }
+
+ if (xdata && local->xdata_rsp) {
+ dict_copy (xdata, local->xdata_rsp);
+ }
+
+ if (!call_cnt) {
+ newdict = dict_new ();
+ if (!newdict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ len = dict_serialized_length (local->dict);
+ if (len == 0) {
+ goto unwind;
+ }
+
+ lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char);
+ if (!lockinfo_buf) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ op_ret = dict_serialize (local->dict, lockinfo_buf);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ }
+
+ op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY,
+ (void *)lockinfo_buf, len);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ goto unwind;
+ }
+
+ unwind:
+ AFR_STACK_UNWIND (getxattr, frame, op_ret,
+ op_errno, newdict,
+ local->xdata_rsp);
+ }
+
+ dict_unref (lockinfo);
+
+ return 0;
+}
+
+int32_t
+afr_fgetxattr_lockinfo_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
+{
+ int call_cnt = 0, len = 0;
+ char *lockinfo_buf = NULL;
+ dict_t *lockinfo = NULL, *newdict = NULL;
+ afr_local_t *local = NULL;
+
+ LOCK (&frame->lock);
+ {
+ local = frame->local;
+
+ call_cnt = --local->call_count;
+
+ if ((op_ret < 0) || (!dict && !xdata)) {
+ goto unlock;
+ }
+
+ if (xdata) {
+ if (!local->xdata_rsp) {
+ local->xdata_rsp = dict_new ();
+ if (!local->xdata_rsp) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
+ }
+ }
+
+ if (!dict) {
+ goto unlock;
+ }
+
+ op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY,
+ (void **)&lockinfo_buf, &len);
+
+ if (!lockinfo_buf) {
+ goto unlock;
+ }
+
+ if (!local->dict) {
+ local->dict = dict_new ();
+ if (!local->dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
+ }
+ }
+unlock:
+ UNLOCK (&frame->lock);
+
+ if (lockinfo_buf != NULL) {
+ lockinfo = dict_new ();
+ if (lockinfo == NULL) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ } else {
+ op_ret = dict_unserialize (lockinfo_buf, len,
+ &lockinfo);
+
+ if (lockinfo && local->dict) {
+ dict_copy (lockinfo, local->dict);
+ }
+ }
+ }
+
+ if (xdata && local->xdata_rsp) {
+ dict_copy (xdata, local->xdata_rsp);
+ }
+
+ if (!call_cnt) {
+ newdict = dict_new ();
+ if (!newdict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ len = dict_serialized_length (local->dict);
+ if (len <= 0) {
+ goto unwind;
+ }
+
+ lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char);
+ if (!lockinfo_buf) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ op_ret = dict_serialize (local->dict, lockinfo_buf);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ }
+
+ op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY,
+ (void *)lockinfo_buf, len);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ goto unwind;
+ }
+
+ unwind:
+ AFR_STACK_UNWIND (fgetxattr, frame, op_ret,
+ op_errno, newdict,
+ local->xdata_rsp);
+ }
+
+ dict_unref (lockinfo);
return 0;
}
int32_t
+afr_fgetxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
+ int ret = 0;
+ char *xattr = NULL;
+ char *xattr_serz = NULL;
+ char xattr_cky[1024] = {0,};
+ dict_t *nxattr = NULL;
+ long cky = 0;
+ int32_t padding = 0;
+ int32_t tlen = 0;
+
+ if (!frame || !frame->local || !this) {
+ gf_log ("", GF_LOG_ERROR, "possible NULL deref");
+ goto out;
+ }
+
+ local = frame->local;
+ cky = (long) cookie;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (op_ret < 0) {
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ }
+
+ if (!dict || (op_ret < 0))
+ goto out;
+
+ if (!local->dict)
+ local->dict = dict_new ();
+
+ if (local->dict) {
+ ret = dict_get_str (dict,
+ local->cont.getxattr.name,
+ &xattr);
+ if (ret)
+ goto out;
+
+ xattr = gf_strdup (xattr);
+
+ (void)snprintf (xattr_cky, 1024, "%s-%ld",
+ local->cont.getxattr.name, cky);
+ ret = dict_set_dynstr (local->dict,
+ xattr_cky, xattr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Cannot set xattr cookie key");
+ goto out;
+ }
+
+ local->cont.getxattr.xattr_len
+ += strlen (xattr) + 1;
+ }
+ }
+out:
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ if (!local->cont.getxattr.xattr_len)
+ goto unwind;
+
+ nxattr = dict_new ();
+ if (!nxattr)
+ goto unwind;
+
+ /* extra bytes for decorations (brackets and <>'s) */
+ padding += strlen (this->name)
+ + strlen (AFR_PATHINFO_HEADER) + 4;
+ local->cont.getxattr.xattr_len += (padding + 2);
+
+ xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len,
+ sizeof (char), gf_common_mt_char);
+
+ if (!xattr_serz)
+ goto unwind;
+
+ /* the xlator info */
+ (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ",
+ this->name);
+
+ /* actual series of pathinfo */
+ ret = dict_serialize_value_with_delim (local->dict,
+ xattr_serz
+ + strlen (xattr_serz),
+ &tlen, ' ');
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Error serializing"
+ " dictionary");
+ goto unwind;
+ }
+
+ /* closing part */
+ *(xattr_serz + padding + tlen) = ')';
+ *(xattr_serz + padding + tlen + 1) = '\0';
+
+ ret = dict_set_dynstr (nxattr, local->cont.getxattr.name,
+ xattr_serz);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Cannot set pathinfo"
+ " key in dict");
+
+ unwind:
+ AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret,
+ local->op_errno, nxattr, local->xdata_rsp);
+
+ if (nxattr)
+ dict_unref (nxattr);
+ }
+
+ return ret;
+}
+
+int32_t
afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
dict_t *dict, dict_t *xdata)
@@ -795,6 +1071,14 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
{
callcnt = --local->call_count;
+ if (op_ret < 0) {
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
+ }
+
if (!dict || (op_ret < 0))
goto out;
@@ -869,8 +1153,8 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
" key in dict");
unwind:
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, nxattr,
- xdata);
+ AFR_STACK_UNWIND (getxattr, frame, local->op_ret,
+ local->op_errno, nxattr, local->xdata_rsp);
if (nxattr)
dict_unref (nxattr);
@@ -879,51 +1163,133 @@ afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
return ret;
}
+static int
+afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data)
+{
+ int ret = 0;
+
+ if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0)
+ ret = gf_get_max_stime (THIS, data, key, value);
+
+ return ret;
+}
+
+int32_t
+afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
+
+ if (!frame || !frame->local || !this) {
+ gf_log ("", GF_LOG_ERROR, "possible NULL deref");
+ goto out;
+ }
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (!dict || (op_ret < 0)) {
+ local->op_errno = op_errno;
+ goto cleanup;
+ }
+
+ if (!local->dict)
+ local->dict = dict_copy_with_ref (dict, NULL);
+ else
+ dict_foreach (dict, afr_aggregate_stime_xattr,
+ local->dict);
+ local->op_ret = 0;
+ }
+
+cleanup:
+ UNLOCK (&frame->lock);
+
+ if (!callcnt) {
+ AFR_STACK_UNWIND (getxattr, frame, local->op_ret,
+ local->op_errno, local->dict, xdata);
+ }
+
+out:
+ return 0;
+}
+
+
static gf_boolean_t
-afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk)
+afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk,
+ gf_boolean_t is_fgetxattr)
{
gf_boolean_t is_spl = _gf_true;
GF_ASSERT (cbk);
- if (!cbk) {
+ if (!cbk || !name) {
is_spl = _gf_false;
goto out;
}
- if (!strcmp (name, GF_XATTR_PATHINFO_KEY))
- *cbk = afr_getxattr_pathinfo_cbk;
-
- else if (!strncmp (name, GF_XATTR_CLRLK_CMD,
- strlen (GF_XATTR_CLRLK_CMD)))
- *cbk = afr_getxattr_clrlk_cbk;
- else
+ if (!strcmp (name, GF_XATTR_PATHINFO_KEY) ||
+ !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_pathinfo_cbk;
+ } else {
+ *cbk = afr_getxattr_pathinfo_cbk;
+ }
+ } else if (!strncmp (name, GF_XATTR_CLRLK_CMD,
+ strlen (GF_XATTR_CLRLK_CMD))) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_clrlk_cbk;
+ } else {
+ *cbk = afr_getxattr_clrlk_cbk;
+ }
+ } else if (!strncmp (name, GF_XATTR_LOCKINFO_KEY,
+ strlen (GF_XATTR_LOCKINFO_KEY))) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_lockinfo_cbk;
+ } else {
+ *cbk = afr_getxattr_lockinfo_cbk;
+ }
+ } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) {
+ *cbk = afr_common_getxattr_stime_cbk;
+ } else {
is_spl = _gf_false;
+ }
out:
return is_spl;
}
static void
-afr_getxattr_frm_all_children (xlator_t *this, call_frame_t *frame,
- const char *name, loc_t *loc,
- fop_getxattr_cbk_t cbk)
+afr_getxattr_all_subvols (xlator_t *this, call_frame_t *frame,
+ const char *name, loc_t *loc,
+ fop_getxattr_cbk_t cbk)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
- xlator_t **children = NULL;
int i = 0;
+ int call_count = 0;
priv = this->private;
- children = priv->children;
local = frame->local;
- local->call_count = priv->child_count;
+ //local->call_count set in afr_local_init
+ call_count = local->call_count;
+
+ //If up-children count is 0, afr_local_init would have failed already
+ //and the call would have unwound so not handling it here.
for (i = 0; i < priv->child_count; i++) {
- STACK_WIND_COOKIE (frame, cbk,
- (void *) (long) i,
- children[i], children[i]->fops->getxattr,
- loc, name, NULL);
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->getxattr,
+ loc, name, NULL);
+ if (!--call_count)
+ break;
+ }
}
return;
}
@@ -934,39 +1300,41 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
{
afr_private_t *priv = NULL;
xlator_t **children = NULL;
- int call_child = 0;
afr_local_t *local = NULL;
xlator_list_t *trav = NULL;
xlator_t **sub_volumes = NULL;
int i = 0;
int32_t op_errno = 0;
- int32_t read_child = -1;
int ret = -1;
fop_getxattr_cbk_t cbk = NULL;
+ int afr_xtime_gauge[MCNT_MAX] = {0,};
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
children = priv->children;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
+ loc_copy (&local->loc, loc);
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local->op = GF_FOP_GETXATTR;
+
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- loc_copy (&local->loc, loc);
if (!name)
goto no_name;
local->cont.getxattr.name = gf_strdup (name);
+ if (!local->cont.getxattr.name) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
if (!strncmp (name, AFR_XATTR_PREFIX,
strlen (AFR_XATTR_PREFIX))) {
gf_log (this->name, GF_LOG_INFO,
@@ -976,7 +1344,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
goto out;
}
if ((strcmp (GF_XATTR_MARKER_KEY, name) == 0)
- && (-1 == frame->root->pid)) {
+ && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
local->marker.call_count = priv->child_count;
@@ -992,6 +1360,7 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
sub_volumes,
priv->child_count,
MARKER_UUID_TYPE,
+ marker_uuid_default_gauge,
priv->vol_uuid)) {
gf_log (this->name, GF_LOG_INFO,
@@ -1008,9 +1377,8 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
* if we are doing getxattr with pathinfo as the key then we
* collect information from all childs
*/
- if (afr_is_special_xattr (name, &cbk)) {
- afr_getxattr_frm_all_children (this, frame, name,
- loc, cbk);
+ if (afr_is_special_xattr (name, &cbk, 0)) {
+ afr_getxattr_all_subvols (this, frame, name, loc, cbk);
return 0;
}
@@ -1026,10 +1394,11 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
if (*priv->vol_uuid) {
if ((match_uuid_local (name, priv->vol_uuid) == 0)
- && (-1 == frame->root->pid)) {
+ && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
local->marker.call_count = priv->child_count;
- sub_volumes = alloca ( priv->child_count * sizeof (xlator_t *));
+ sub_volumes = alloca ( priv->child_count
+ * sizeof (xlator_t *));
for (i = 0, trav = this->children; trav ;
trav = trav->next, i++) {
@@ -1037,12 +1406,20 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
}
+ /* don't err out on getting ENOTCONN (brick down)
+ * from a subset of the bricks
+ */
+ memcpy (afr_xtime_gauge, marker_xtime_default_gauge,
+ sizeof (afr_xtime_gauge));
+ afr_xtime_gauge[MCNT_NOTFOUND] = 0;
+ afr_xtime_gauge[MCNT_ENOTCONN] = 0;
if (cluster_getmarkerattr (frame, this, loc,
name, local,
afr_getxattr_unwind,
sub_volumes,
priv->child_count,
MARKER_XTIME_TYPE,
+ afr_xtime_gauge,
priv->vol_uuid)) {
gf_log (this->name, GF_LOG_INFO,
"%s: failed to get marker attr (%s)",
@@ -1056,27 +1433,9 @@ afr_getxattr (call_frame_t *frame, xlator_t *this,
}
no_name:
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
- read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children);
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.getxattr.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- STACK_WIND_COOKIE (frame, afr_getxattr_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->getxattr,
- loc, name, xdata);
+ afr_read_txn (frame, this, local->loc.inode, afr_getxattr_wind,
+ AFR_METADATA_TRANSACTION);
ret = 0;
out:
@@ -1093,128 +1452,128 @@ afr_fgetxattr_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
dict_t *dict, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
+ afr_local_t *local = NULL;
- priv = this->private;
- children = priv->children;
-
- local = frame->local;
-
- read_child = (long) cookie;
+ local = frame->local;
- if (op_ret == -1) {
- last_index = &local->cont.getxattr.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- unwind = 0;
- STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->fgetxattr,
- local->fd,
- local->cont.getxattr.name,
- NULL);
- }
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
-out:
- if (unwind) {
- if (op_ret >= 0 && dict)
- __filter_xattrs (dict);
+ if (dict)
+ afr_filter_xattrs (dict);
- AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict,
- xdata);
- }
+ AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata);
- return 0;
+ return 0;
}
-int32_t
-afr_fgetxattr_unwind (call_frame_t *frame,
- int op_ret, int op_errno, dict_t *dict, dict_t *xdata)
-
+int
+afr_fgetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
- AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata);
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret,
+ local->op_errno, NULL, NULL);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fgetxattr,
+ local->fd, local->cont.getxattr.name,
+ local->xdata_req);
+ return 0;
}
-int32_t
-afr_fgetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, const char *name, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- xlator_t **children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int32_t read_child = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+static void
+afr_fgetxattr_all_subvols (xlator_t *this, call_frame_t *frame,
+ fop_fgetxattr_cbk_t cbk)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
- children = priv->children;
+ local = frame->local;
+ //local->call_count set in afr_local_init
+ call_count = local->call_count;
- AFR_LOCAL_ALLOC_OR_GOTO (local, out);
- frame->local = local;
+ //If up-children count is 0, afr_local_init would have failed already
+ //and the call would have unwound so not handling it here.
- op_ret = afr_local_init (local, priv, &op_errno);
- if (op_ret < 0) {
- op_errno = -op_ret;
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE (frame, cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fgetxattr,
+ local->fd, local->cont.getxattr.name,
+ NULL);
+ if (!--call_count)
+ break;
+ }
}
- local->fd = fd_ref (fd);
- if (name)
- local->cont.getxattr.name = gf_strdup (name);
+ return;
+}
- /* pathinfo gets handled only in getxattr() */
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
+int
+afr_fgetxattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ fop_fgetxattr_cbk_t cbk = NULL;
- read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children);
- op_ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.getxattr.last_index);
- if (op_ret < 0) {
- op_errno = -op_ret;
- op_ret = -1;
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_FGETXATTR;
+ local->fd = fd_ref (fd);
+ if (name) {
+ local->cont.getxattr.name = gf_strdup (name);
+ if (!local->cont.getxattr.name) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ }
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
+
+ /* pathinfo gets handled only in getxattr(), but we need to handle
+ * lockinfo.
+ * If we are doing fgetxattr with lockinfo as the key then we
+ * collect information from all children.
+ */
+ if (afr_is_special_xattr (name, &cbk, 1)) {
+ afr_fgetxattr_all_subvols (this, frame, cbk);
+ return 0;
}
- STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->fgetxattr,
- fd, name, xdata);
+ afr_fix_open (fd, this);
+
+ afr_read_txn (frame, this, fd->inode, afr_fgetxattr_wind,
+ AFR_METADATA_TRANSACTION);
- op_ret = 0;
+ return 0;
out:
- if (op_ret == -1) {
- AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, NULL, NULL);
- }
+ AFR_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL);
+
return 0;
}
@@ -1223,145 +1582,84 @@ out:
/* {{{ readv */
-/**
- * read algorithm:
- *
- * if the user has specified a read subvolume, use it
- * otherwise -
- * use the inode number to hash it to one of the subvolumes, and
- * read from there (to balance read load)
- *
- * if any of the above read's fail, try the children in sequence
- * beginning at the beginning
- */
-
-int32_t
+int
afr_readv_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret, int32_t op_errno,
struct iovec *vector, int32_t count, struct iatt *buf,
struct iobref *iobref, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t *fresh_children = NULL;
- int32_t read_child = -1;
+ afr_local_t *local = NULL;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ local = frame->local;
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- children = priv->children;
+ afr_read_txn_continue (frame, this, (long) cookie);
+ return 0;
+ }
- local = frame->local;
-
- read_child = (long) cookie;
-
- if (op_ret == -1) {
- last_index = &local->cont.readv.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
-
- unwind = 0;
-
- STACK_WIND_COOKIE (frame, afr_readv_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->readv,
- local->fd, local->cont.readv.size,
- local->cont.readv.offset,
- local->cont.readv.flags,
- NULL);
- }
+ AFR_STACK_UNWIND (readv, frame, op_ret, op_errno,
+ vector, count, buf, iobref, xdata);
+ return 0;
+}
-out:
- if (unwind) {
- AFR_STACK_UNWIND (readv, frame, op_ret, op_errno,
- vector, count, buf, iobref, xdata);
- }
- return 0;
+int
+afr_readv_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND (readv, frame, local->op_ret, local->op_errno,
+ 0, 0, 0, 0, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readv,
+ local->fd, local->cont.readv.size,
+ local->cont.readv.offset, local->cont.readv.flags,
+ local->xdata_req);
+ return 0;
}
-int32_t
-afr_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+int
+afr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
{
- afr_private_t * priv = NULL;
afr_local_t * local = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
int32_t op_errno = 0;
- int32_t read_child = -1;
- int ret = -1;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (fd, out);
- priv = this->private;
- children = priv->children;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
-
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children) {
- op_errno = ENOMEM;
- goto out;
- }
-
- read_child = afr_inode_get_read_ctx (this, fd->inode, local->fresh_children);
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.readv.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local->op = GF_FOP_READ;
+ local->fd = fd_ref (fd);
+ local->cont.readv.size = size;
+ local->cont.readv.offset = offset;
+ local->cont.readv.flags = flags;
+ if (xdata)
+ local->xdata_req = dict_ref (xdata);
- local->fd = fd_ref (fd);
+ afr_fix_open (fd, this);
- local->cont.readv.size = size;
- local->cont.readv.offset = offset;
- local->cont.readv.flags = flags;
+ afr_read_txn (frame, this, fd->inode, afr_readv_wind,
+ AFR_DATA_TRANSACTION);
- ret = afr_open_fd_fix (frame, this, _gf_false);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
- STACK_WIND_COOKIE (frame, afr_readv_cbk,
- (void *) (long) call_child,
- children[call_child],
- children[call_child]->fops->readv,
- fd, size, offset, flags, xdata);
-
- ret = 0;
-out:
- if (ret < 0) {
- AFR_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL,
- NULL, NULL);
- }
return 0;
+out:
+ AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0);
+
+ return 0;
}
/* }}} */
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 4c5502972..00e0d2676 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -37,145 +37,302 @@
#include "afr.h"
#include "afr-transaction.h"
-#include "afr-self-heal-common.h"
-/* {{{ writev */
-int
-afr_writev_unwind (call_frame_t *frame, xlator_t *this)
+static void
+__afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int read_subvol = 0;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (local->inode) {
+ if (local->transaction.type == AFR_METADATA_TRANSACTION)
+ read_subvol = afr_metadata_subvol_get (local->inode, this,
+ NULL, NULL);
+ else
+ read_subvol = afr_data_subvol_get (local->inode, this,
+ NULL, NULL);
+ }
+
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno (local, priv);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0) {
+ afr_inode_read_subvol_reset (local->inode, this);
+ continue;
+ }
+
+ /* Order of checks in the compound conditional
+ below is important.
+
+ - Highest precedence: largest op_ret
+ - Next precendence: if all op_rets are equal, read subvol
+ - Least precedence: any succeeded subvol
+ */
+ if ((local->op_ret < local->replies[i].op_ret) ||
+ ((local->op_ret == local->replies[i].op_ret) &&
+ (i == read_subvol))) {
+
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+
+ local->cont.inode_wfop.prebuf =
+ local->replies[i].prestat;
+ local->cont.inode_wfop.postbuf =
+ local->replies[i].poststat;
+
+ if (local->replies[i].xdata) {
+ if (local->xdata_rsp)
+ dict_unref (local->xdata_rsp);
+ local->xdata_rsp =
+ dict_ref (local->replies[i].xdata);
+ }
+ }
+ }
+}
+
+
+static void
+__afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index,
+ int op_ret, int op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+
+ if (op_ret >= 0) {
+ if (prebuf)
+ local->replies[child_index].prestat = *prebuf;
+ if (postbuf)
+ local->replies[child_index].poststat = *postbuf;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref (xdata);
+ } else {
+ afr_transaction_fop_failed (frame, this, child_index);
+ }
+
+ return;
+}
+
+
+static int
+__afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
local = frame->local;
LOCK (&frame->lock);
{
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
+ __afr_inode_write_fill (frame, this, child_index, op_ret,
+ op_errno, prebuf, postbuf, xdata);
}
UNLOCK (&frame->lock);
- if (main_frame) {
- AFR_STACK_UNWIND (writev, main_frame,
- local->op_ret, local->op_errno,
- &local->cont.writev.prebuf,
- &local->cont.writev.postbuf,
- NULL);
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0) {
+ __afr_inode_write_finalize (frame, this);
+
+ if (afr_txn_nothing_failed (frame, this))
+ local->transaction.unwind (frame, this);
+
+ local->transaction.resume (frame, this);
}
+
return 0;
}
+/* {{{ writev */
-int
-afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
+void
+afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame)
{
- afr_local_t * local = NULL;
- int child_index = (long) cookie;
- int call_count = -1;
- int read_child = 0;
+ afr_local_t *src_local = NULL;
+ afr_local_t *dst_local = NULL;
+
+ src_local = src_frame->local;
+ dst_local = dst_frame->local;
+
+ dst_local->op_ret = src_local->op_ret;
+ dst_local->op_errno = src_local->op_errno;
+ dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf;
+ dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf;
+ if (src_local->xdata_rsp)
+ dst_local->xdata_rsp = dict_ref (src_local->xdata_rsp);
+}
+void
+afr_writev_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
local = frame->local;
- read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
+ AFR_STACK_UNWIND (writev, frame,
+ local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ local->xdata_rsp);
+}
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.writev.prebuf = *prebuf;
- local->cont.writev.postbuf = *postbuf;
- }
+int
+afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *fop_frame = NULL;
- if (child_index == read_child) {
- local->cont.writev.prebuf = *prebuf;
- local->cont.writev.postbuf = *postbuf;
- }
- }
+ fop_frame = afr_transaction_detach_fop_frame (frame);
- local->op_errno = op_errno;
+ if (fop_frame) {
+ afr_writev_copy_outvars (frame, fop_frame);
+ afr_writev_unwind (fop_frame, this);
}
- UNLOCK (&frame->lock);
+ return 0;
+}
- call_count = afr_frame_return (frame);
+static void
+afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
- if (call_count == 0) {
- local->transaction.unwind (frame, this);
+ local = frame->local;
+ priv = this->private;
+ /*
+ * We already have the best case result of the writev calls staged
+ * as the return value. Any writev that returns some value less
+ * than the best case is now out of sync, so mark the fop as
+ * failed. Note that fops that have returned with errors have
+ * already been marked as failed.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if ((!local->replies[i].valid) ||
+ (local->replies[i].op_ret == -1))
+ continue;
- local->transaction.resume (frame, this);
+ if (local->replies[i].op_ret < local->op_ret)
+ afr_transaction_fop_failed(frame, this, i);
}
- return 0;
}
int
-afr_writev_wind (call_frame_t *frame, xlator_t *this)
+afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int call_count = -1;
+ afr_local_t * local = NULL;
+ call_frame_t *fop_frame = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+ int ret = 0;
+ uint32_t open_fd_count = 0;
+ uint32_t write_is_append = 0;
local = frame->local;
- priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
+ LOCK (&frame->lock);
+ {
+ __afr_inode_write_fill (frame, this, child_index, op_ret,
+ op_errno, prebuf, postbuf, xdata);
+ if (op_ret == -1 || !xdata)
+ goto unlock;
+
+ write_is_append = 0;
+ ret = dict_get_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND,
+ &write_is_append);
+ if (ret || !write_is_append)
+ local->append_write = _gf_false;
+
+ ret = dict_get_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT,
+ &open_fd_count);
+ if (ret == -1)
+ goto unlock;
+ if ((open_fd_count > local->open_fd_count)) {
+ local->open_fd_count = open_fd_count;
+ local->update_open_fd_count = _gf_true;
+ }
}
+unlock:
+ UNLOCK (&frame->lock);
- local->call_count = call_count;
+ call_count = afr_frame_return (frame);
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_writev_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->writev,
- local->fd,
- local->cont.writev.vector,
- local->cont.writev.count,
- local->cont.writev.offset,
- local->cont.writev.flags,
- local->cont.writev.iobref,
- NULL);
-
- if (!--call_count)
- break;
+ if (call_count == 0) {
+ if (!local->stable_write && !local->append_write)
+ /* An appended write removes the necessity to
+ fsync() the file. This is because self-heal
+ has the logic to check for larger file when
+ the xattrs are not reliably pointing at
+ a stale file.
+ */
+ afr_fd_report_unstable_write (this, local->fd);
+
+ __afr_inode_write_finalize (frame, this);
+
+ afr_writev_handle_short_writes (frame, this);
+
+ if (local->update_open_fd_count)
+ afr_handle_open_fd_count (frame, this);
+
+ if (!afr_txn_nothing_failed (frame, this)) {
+ //Don't unwind until post-op is complete
+ local->transaction.resume (frame, this);
+ } else {
+ /*
+ * Generally inode-write fops do transaction.unwind then
+ * transaction.resume, but writev needs to make sure that
+ * delayed post-op frame is placed in fdctx before unwind
+ * happens. This prevents the race of flush doing the
+ * changelog wakeup first in fuse thread and then this
+ * writev placing its delayed post-op frame in fdctx.
+ * This helps flush make sure all the delayed post-ops are
+ * completed.
+ */
+
+ fop_frame = afr_transaction_detach_fop_frame (frame);
+ afr_writev_copy_outvars (frame, fop_frame);
+ local->transaction.resume (frame, this);
+ afr_writev_unwind (fop_frame, this);
}
}
-
return 0;
}
int
-afr_writev_done (call_frame_t *frame, xlator_t *this)
+afr_writev_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
local = frame->local;
+ priv = this->private;
- iobref_unref (local->cont.writev.iobref);
- local->cont.writev.iobref = NULL;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
+ STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->writev,
+ local->fd, local->cont.writev.vector,
+ local->cont.writev.count, local->cont.writev.offset,
+ local->cont.writev.flags, local->cont.writev.iobref,
+ local->xdata_req);
return 0;
}
@@ -185,30 +342,37 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)
{
call_frame_t *transaction_frame = NULL;
afr_local_t *local = NULL;
- int op_ret = -1;
- int op_errno = 0;
-
- local = frame->local;
+ int ret = -1;
+ int op_errno = ENOMEM;
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
+ local = frame->local;
transaction_frame->local = local;
- frame->local = NULL;
+ frame->local = NULL;
- local->op = GF_FOP_WRITE;
+ if (!AFR_FRAME_INIT (frame, op_errno))
+ goto out;
- local->success_count = 0;
+ local->op = GF_FOP_WRITE;
- local->transaction.fop = afr_writev_wind;
- local->transaction.done = afr_writev_done;
- local->transaction.unwind = afr_writev_unwind;
+ local->transaction.wind = afr_writev_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_transaction_writev_unwind;
local->transaction.main_frame = frame;
+
if (local->fd->flags & O_APPEND) {
+ /*
+ * Backend vfs ignores the 'offset' for append mode fd so
+ * locking just the region provided for the writev does not
+ * give consistency gurantee. The actual write may happen at a
+ * completely different range than the one provided by the
+ * offset, len in the fop. So lock the entire file.
+ */
local->transaction.start = 0;
local->transaction.len = 0;
} else {
@@ -217,272 +381,86 @@ afr_do_writev (call_frame_t *frame, xlator_t *this)
local->cont.writev.count);
}
- op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
- if (op_ret < 0) {
- op_errno = -op_ret;
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
goto out;
}
- op_ret = 0;
+ return 0;
out:
- if (op_ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
-static int
-afr_prepare_loc (call_frame_t *frame, fd_t *fd)
-{
- afr_local_t *local = NULL;
- char *name = NULL;
- char *path = NULL;
- int ret = 0;
-
- if ((!fd) || (!fd->inode))
- return -1;
-
- local = frame->local;
- ret = inode_path (fd->inode, NULL, (char **)&path);
- if (ret <= 0) {
- gf_log (frame->this->name, GF_LOG_DEBUG,
- "Unable to get path for gfid: %s",
- uuid_utoa (fd->inode->gfid));
- return -1;
- }
-
- if (local->loc.path) {
- if (strcmp (path, local->loc.path))
- gf_log (frame->this->name, GF_LOG_DEBUG,
- "overwriting old loc->path %s with %s",
- local->loc.path, path);
- GF_FREE ((char *)local->loc.path);
- }
- local->loc.path = path;
-
- name = strrchr (local->loc.path, '/');
- if (name)
- name++;
- local->loc.name = name;
-
- if (local->loc.inode) {
- inode_unref (local->loc.inode);
- }
- local->loc.inode = inode_ref (fd->inode);
-
- if (local->loc.parent) {
- inode_unref (local->loc.parent);
- }
-
- local->loc.parent = inode_parent (local->loc.inode, 0, NULL);
-
- return 0;
-}
-
-afr_fd_paused_call_t*
-afr_paused_call_create (call_frame_t *frame)
-{
- afr_local_t *local = NULL;
- afr_fd_paused_call_t *paused_call = NULL;
-
- local = frame->local;
- GF_ASSERT (local->fop_call_continue);
-
- paused_call = GF_CALLOC (1, sizeof (*paused_call),
- gf_afr_fd_paused_call_t);
- if (paused_call) {
- INIT_LIST_HEAD (&paused_call->call_list);
- paused_call->frame = frame;
- }
-
- return paused_call;
-}
-
-static int
-afr_pause_fd_fop (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx)
-{
- afr_fd_paused_call_t *paused_call = NULL;
- int ret = 0;
-
- paused_call = afr_paused_call_create (frame);
- if (paused_call)
- list_add (&paused_call->call_list, &fd_ctx->paused_calls);
- else
- ret = -ENOMEM;
-
- return ret;
-}
-
-static void
-afr_trigger_open_fd_self_heal (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- inode_t *inode = NULL;
- char *reason = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- inode = local->fd->inode;
-
- sh->do_missing_entry_self_heal = _gf_true;
- sh->do_gfid_self_heal = _gf_true;
- sh->do_data_self_heal = _gf_true;
-
- reason = "subvolume came online";
- afr_launch_self_heal (frame, this, inode, _gf_true, inode->ia_type,
- reason, NULL, NULL);
-}
-
-int
-afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop)
-{
- int ret = 0;
- int i = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- inode_t *inode = NULL;
- gf_boolean_t need_self_heal = _gf_false;
- int *need_open = NULL;
- int need_open_count = 0;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- gf_boolean_t fop_continue = _gf_true;
-
- local = frame->local;
- priv = this->private;
-
- GF_ASSERT (local->fd);
-
- inode = local->fd->inode;
- //gfid is not set in rebalance, that case needs to be handled.
- if (fd_is_anonymous (local->fd) ||
- !inode || uuid_is_null (inode->gfid)) {
- fop_continue = _gf_true;
- goto out;
- }
-
- if (pause_fop)
- GF_ASSERT (local->fop_call_continue);
-
- ret = afr_prepare_loc (frame, local->fd);
- if (ret < 0) {
- //File does not exist we cant open it.
- ret = 0;
- goto out;
- }
-
- fd_ctx = afr_fd_ctx_get (local->fd, this);
- if (!fd_ctx) {
- ret = -EINVAL;
- goto out;
- }
-
- LOCK (&local->fd->lock);
- {
- if (fd_ctx->up_count < priv->up_count) {
- need_self_heal = _gf_true;
- fd_ctx->up_count = priv->up_count;
- fd_ctx->down_count = priv->down_count;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if ((fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED) &&
- local->child_up[i]) {
- fd_ctx->opened_on[i] = AFR_FD_OPENING;
- if (!need_open)
- need_open = GF_CALLOC (priv->child_count,
- sizeof (*need_open),
- gf_afr_mt_int32_t);
- need_open[i] = 1;
- need_open_count++;
- } else if (pause_fop && local->child_up[i] &&
- (fd_ctx->opened_on[i] == AFR_FD_OPENING)) {
- local->fop_paused = _gf_true;
- }
- }
-
- if (local->fop_paused) {
- GF_ASSERT (pause_fop);
- gf_log (this->name, GF_LOG_INFO, "Pause fd %p",
- local->fd);
- ret = afr_pause_fd_fop (frame, this, fd_ctx);
- if (ret)
- goto unlock;
- fop_continue = _gf_false;
- }
- }
-unlock:
- UNLOCK (&local->fd->lock);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to fix fd for %s",
- local->loc.path);
- fop_continue = _gf_false;
- goto out;
- }
-
- if (need_self_heal)
- afr_trigger_open_fd_self_heal (frame, this);
-
- if (!need_open_count)
- goto out;
-
- gf_log (this->name, GF_LOG_INFO, "Opening fd %p", local->fd);
- afr_fix_open (frame, this, fd_ctx, need_open_count, need_open);
- fop_continue = _gf_false;
-out:
- GF_FREE (need_open);
- if (fop_continue && local->fop_call_continue)
- local->fop_call_continue (frame, this);
- return ret;
-}
int
afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t offset,
uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(writev,out);
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.writev.vector = iov_dup (vector, count);
+ local->cont.writev.vector = iov_dup (vector, count);
+ if (!local->cont.writev.vector)
+ goto out;
local->cont.writev.count = count;
local->cont.writev.offset = offset;
local->cont.writev.flags = flags;
local->cont.writev.iobref = iobref_ref (iobref);
- local->fd = fd_ref (fd);
- local->fop_call_continue = afr_do_writev;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- ret = afr_open_fd_fix (frame, this, _gf_true);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
+ if (!local->xdata_req)
+ goto out;
+
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
+
+ if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ if (dict_set_uint32 (local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- ret = 0;
+ /* Set append_write to be true speculatively. If on any
+ server it turns not be true, we unset it in the
+ callback.
+ */
+ local->append_write = _gf_true;
+
+ /* detect here, but set it in writev_wind_cbk *after* the unstable
+ write is performed
+ */
+ local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC));
+
+ afr_fix_open (fd, this);
+
+ afr_do_writev (frame, this);
+
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
+ AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
@@ -500,22 +478,13 @@ afr_truncate_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (truncate, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.truncate.prebuf,
- &local->cont.truncate.postbuf,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
return 0;
}
@@ -525,114 +494,32 @@ afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int child_index = (long) cookie;
- int read_child = 0;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL);
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
+ afr_local_t *local = NULL;
- if (afr_fop_failed (op_ret, op_errno) && op_errno != EFBIG)
- afr_transaction_fop_failed (frame, this, child_index);
+ local = frame->local;
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.truncate.prebuf = *prebuf;
- local->cont.truncate.postbuf = *postbuf;
- }
-
- if (child_index == read_child) {
- local->cont.truncate.prebuf = *prebuf;
- local->cont.truncate.postbuf = *postbuf;
- }
-
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
}
-int32_t
-afr_truncate_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_truncate_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->truncate,
- &local->loc,
- local->cont.truncate.offset,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-int
-afr_truncate_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
+ STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->truncate,
+ &local->loc, local->cont.truncate.offset,
+ local->xdata_req);
return 0;
}
@@ -644,56 +531,60 @@ afr_truncate (call_frame_t *frame, xlator_t *this,
afr_private_t * priv = NULL;
afr_local_t * local = NULL;
call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int ret = -1;
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(truncate,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
local->cont.truncate.offset = offset;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- local->transaction.fop = afr_truncate_wind;
- local->transaction.done = afr_truncate_done;
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_truncate_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_truncate_unwind;
loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+
+ local->op = GF_FOP_TRUNCATE;
local->transaction.main_frame = frame;
local->transaction.start = offset;
local->transaction.len = 0;
+ /* Set it true speculatively, will get reset in afr_truncate_wind_cbk
+ if truncate was not a NOP */
+ local->stable_write = _gf_true;
+
ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
@@ -711,21 +602,13 @@ afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.ftruncate.prebuf,
- &local->cont.ftruncate.postbuf,
- NULL);
- }
+ AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
return 0;
}
@@ -735,140 +618,75 @@ afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int child_index = (long) cookie;
- int call_count = -1;
- int need_unwind = 0;
- int read_child = 0;
-
- local = frame->local;
- priv = this->private;
-
- read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
-
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.ftruncate.prebuf = *prebuf;
- local->cont.ftruncate.postbuf = *postbuf;
- }
-
- if (child_index == read_child) {
- local->cont.ftruncate.prebuf = *prebuf;
- local->cont.ftruncate.postbuf = *postbuf;
- }
+ afr_local_t *local = NULL;
- local->success_count++;
+ local = frame->local;
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
}
int
-afr_ftruncate_wind (call_frame_t *frame, xlator_t *this)
+afr_ftruncate_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->ftruncate,
- local->fd,
- local->cont.ftruncate.offset,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
+ local = frame->local;
+ priv = this->private;
+ STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->ftruncate,
+ local->fd, local->cont.ftruncate.offset,
+ local->xdata_req);
return 0;
}
int
-afr_ftruncate_done (call_frame_t *frame, xlator_t *this)
+afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
{
+ afr_private_t *priv = NULL;
afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
+ priv = this->private;
- return 0;
-}
+ QUORUM_CHECK(ftruncate,out);
+ transaction_frame = copy_frame (frame);
+ if (!frame)
+ goto out;
-int
-afr_do_ftruncate (call_frame_t *frame, xlator_t *this)
-{
- call_frame_t * transaction_frame = NULL;
- afr_local_t * local = NULL;
- int op_ret = -1;
- int op_errno = 0;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local = frame->local;
+ local->cont.ftruncate.offset = offset;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
+ if (!local->xdata_req)
+ goto out;
- transaction_frame->local = local;
- frame->local = NULL;
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
local->op = GF_FOP_FTRUNCATE;
- local->transaction.fop = afr_ftruncate_wind;
- local->transaction.done = afr_ftruncate_done;
+ local->transaction.wind = afr_ftruncate_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_ftruncate_unwind;
local->transaction.main_frame = frame;
@@ -876,68 +694,21 @@ afr_do_ftruncate (call_frame_t *frame, xlator_t *this)
local->transaction.start = local->cont.ftruncate.offset;
local->transaction.len = 0;
- op_ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
- if (op_ret < 0) {
- op_errno = -op_ret;
- goto out;
- }
-
- op_ret = 0;
-out:
- if (op_ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL,
- NULL, NULL);
- }
-
- return 0;
-}
-
+ afr_fix_open (fd, this);
-int
-afr_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset, dict_t *xdata)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
+ /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk
+ if truncate was not a NOP */
+ local->stable_write = _gf_true;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- priv = this->private;
-
- QUORUM_CHECK(ftruncate,out);
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
-
- local->cont.ftruncate.offset = offset;
-
- local->fd = fd_ref (fd);
- local->fop_call_continue = afr_do_ftruncate;
-
- ret = afr_open_fd_fix (frame, this, _gf_true);
- if (ret) {
- op_errno = -ret;
- goto out;
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
- }
+ AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
@@ -949,188 +720,92 @@ out:
int
afr_setattr_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (setattr, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.setattr.preop_buf,
- &local->cont.setattr.postop_buf,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf,
+ local->xdata_rsp);
return 0;
}
int
afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
+ int op_ret, int op_errno,
struct iatt *preop, struct iatt *postop, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int child_index = (long) cookie;
- int read_child = 0;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- read_child = afr_inode_get_read_ctx (this, local->loc.inode, NULL);
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
-
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.setattr.preop_buf = *preop;
- local->cont.setattr.postop_buf = *postop;
- }
-
- if (child_index == read_child) {
- local->cont.setattr.preop_buf = *preop;
- local->cont.setattr.postop_buf = *postop;
- }
-
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ preop, postop, xdata);
}
-int32_t
-afr_setattr_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_setattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setattr,
- &local->loc,
- &local->cont.setattr.in_buf,
- local->cont.setattr.valid,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
-
+ STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->setattr,
+ &local->loc, &local->cont.setattr.in_buf,
+ local->cont.setattr.valid, local->xdata_req);
return 0;
}
int
-afr_setattr_done (call_frame_t *frame, xlator_t *this)
+afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
+ int32_t valid, dict_t *xdata)
{
+ afr_private_t *priv = NULL;
afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-
-int
-afr_setattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ call_frame_t *transaction_frame = NULL;
int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(setattr,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
local->cont.setattr.in_buf = *buf;
local->cont.setattr.valid = valid;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- local->transaction.fop = afr_setattr_wind;
- local->transaction.done = afr_setattr_done;
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_setattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_setattr_unwind;
loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+
+ local->op = GF_FOP_SETATTR;
local->transaction.main_frame = frame;
local->transaction.start = LLONG_MAX - 1;
@@ -1138,18 +813,16 @@ afr_setattr (call_frame_t *frame, xlator_t *this,
ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
@@ -1163,22 +836,13 @@ afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret,
- local->op_errno,
- &local->cont.fsetattr.preop_buf,
- &local->cont.fsetattr.postop_buf,
- NULL);
- }
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
return 0;
}
@@ -1188,163 +852,72 @@ afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *preop, struct iatt *postop, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int child_index = (long) cookie;
- int read_child = 0;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- read_child = afr_inode_get_read_ctx (this, local->fd->inode, NULL);
-
- LOCK (&frame->lock);
- {
- if (child_index == read_child) {
- local->read_child_returned = _gf_true;
- }
-
- if (afr_fop_failed (op_ret, op_errno))
- afr_transaction_fop_failed (frame, this, child_index);
-
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- local->cont.fsetattr.preop_buf = *preop;
- local->cont.fsetattr.postop_buf = *postop;
- }
-
- if (child_index == read_child) {
- local->cont.fsetattr.preop_buf = *preop;
- local->cont.fsetattr.postop_buf = *postop;
- }
-
- local->success_count++;
-
- if ((local->success_count >= priv->wait_count)
- && local->read_child_returned) {
- need_unwind = 1;
- }
- }
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ preop, postop, xdata);
}
-int32_t
-afr_fsetattr_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_fsetattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fsetattr,
- local->fd,
- &local->cont.fsetattr.in_buf,
- local->cont.fsetattr.valid,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
-
+ STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsetattr,
+ local->fd, &local->cont.fsetattr.in_buf,
+ local->cont.fsetattr.valid, local->xdata_req);
return 0;
}
int
-afr_fsetattr_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-int
afr_fsetattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
int ret = -1;
- int op_errno = 0;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int op_errno = ENOMEM;
priv = this->private;
QUORUM_CHECK(fsetattr,out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
local->cont.fsetattr.in_buf = *buf;
local->cont.fsetattr.valid = valid;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
- local->transaction.fop = afr_fsetattr_wind;
- local->transaction.done = afr_fsetattr_done;
+ local->transaction.wind = afr_fsetattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_fsetattr_unwind;
local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
- ret = afr_open_fd_fix (transaction_frame, this, _gf_false);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
+ local->op = GF_FOP_FSETATTR;
+
+ afr_fix_open (fd, this);
local->transaction.main_frame = frame;
local->transaction.start = LLONG_MAX - 1;
@@ -1352,18 +925,16 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this,
ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
@@ -1379,19 +950,12 @@ afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- AFR_STACK_UNWIND (setxattr, main_frame,
- local->op_ret, local->op_errno,
- NULL);
- }
+ AFR_STACK_UNWIND (setxattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
return 0;
}
@@ -1400,100 +964,32 @@ int
afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->child_count) {
- need_unwind = 1;
- }
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ NULL, NULL, xdata);
}
int
-afr_setxattr_wind (call_frame_t *frame, xlator_t *this)
+afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc,
- local->cont.setxattr.dict,
- local->cont.setxattr.flags,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
-
+ STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->setxattr,
+ &local->loc, local->cont.setxattr.dict,
+ local->cont.setxattr.flags, local->xdata_req);
return 0;
}
int
-afr_setxattr_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-int
-afr_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata)
+afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
{
afr_private_t *priv = NULL;
afr_local_t *local = NULL;
@@ -1501,59 +997,60 @@ afr_setxattr (call_frame_t *frame, xlator_t *this,
int ret = -1;
int op_errno = EINVAL;
- VALIDATE_OR_GOTO (this, out);
-
GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict,
op_errno, out);
GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict,
op_errno, out);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this->private, out);
-
priv = this->private;
QUORUM_CHECK(setxattr,out);
+
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
goto out;
local->cont.setxattr.dict = dict_ref (dict);
local->cont.setxattr.flags = flags;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
- local->transaction.fop = afr_setxattr_wind;
- local->transaction.done = afr_setxattr_done;
+ local->transaction.wind = afr_setxattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_setxattr_unwind;
loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
local->transaction.main_frame = frame;
local->transaction.start = LLONG_MAX - 1;
local->transaction.len = 0;
+ local->op = GF_FOP_SETXATTR;
+
ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
return 0;
}
@@ -1569,19 +1066,12 @@ afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- AFR_STACK_UNWIND (fsetxattr, main_frame,
- local->op_ret, local->op_errno,
- NULL);
- }
+ AFR_STACK_UNWIND (fsetxattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
return 0;
}
@@ -1590,98 +1080,30 @@ int
afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->child_count) {
- need_unwind = 1;
- }
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ NULL, NULL, xdata);
}
int
-afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this)
+afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fsetxattr,
- local->fd,
- local->cont.fsetxattr.dict,
- local->cont.fsetxattr.flags,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
-
+ STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsetxattr,
+ local->fd, local->cont.fsetxattr.dict,
+ local->cont.fsetxattr.flags, local->xdata_req);
return 0;
}
int
-afr_fsetxattr_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-int
afr_fsetxattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata)
{
@@ -1689,11 +1111,7 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,
afr_local_t *local = NULL;
call_frame_t *transaction_frame = NULL;
int ret = -1;
- int op_errno = EINVAL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ int op_errno = ENOMEM;
GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict,
op_errno, out);
@@ -1701,36 +1119,38 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,
GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict,
op_errno, out);
- if (ret)
- goto out;
-
priv = this->private;
QUORUM_CHECK(fsetxattr,out);
- AFR_LOCAL_ALLOC_OR_GOTO (local, out);
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
-
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
+ if (!transaction_frame)
goto out;
- }
-
- transaction_frame->local = local;
- local->op_ret = -1;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
local->cont.fsetxattr.dict = dict_ref (dict);
local->cont.fsetxattr.flags = flags;
- local->transaction.fop = afr_fsetxattr_wind;
- local->transaction.done = afr_fsetxattr_done;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_fsetxattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
local->transaction.unwind = afr_fsetxattr_unwind;
local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
+
+ local->op = GF_FOP_FSETXATTR;
local->transaction.main_frame = frame;
local->transaction.start = LLONG_MAX - 1;
@@ -1738,18 +1158,16 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this,
ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
return 0;
}
@@ -1767,19 +1185,12 @@ afr_removexattr_unwind (call_frame_t *frame, xlator_t *this)
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- AFR_STACK_UNWIND (removexattr, main_frame,
- local->op_ret, local->op_errno,
- NULL);
- }
+ AFR_STACK_UNWIND (removexattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
return 0;
}
@@ -1788,108 +1199,151 @@ int
afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ NULL, NULL, xdata);
+}
- if (need_unwind)
- local->transaction.unwind (frame, this);
- call_count = afr_frame_return (frame);
+int
+afr_removexattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
+ local = frame->local;
+ priv = this->private;
+ STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->removexattr,
+ &local->loc, local->cont.removexattr.name,
+ local->xdata_req);
return 0;
}
-int32_t
-afr_removexattr_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*",
+ name, op_errno, out);
+
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*",
+ name, op_errno, out);
- local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
+ QUORUM_CHECK(removexattr,out);
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- local->call_count = call_count;
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->removexattr,
- &local->loc,
- local->cont.removexattr.name,
- NULL);
-
- if (!--call_count)
- break;
- }
+ local->cont.removexattr.name = gf_strdup (name);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_removexattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_removexattr_unwind;
+
+ loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+
+ local->op = GF_FOP_REMOVEXATTR;
+
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
+
+ ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
}
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL);
return 0;
}
-
+/* ffremovexattr */
int
-afr_removexattr_done (call_frame_t *frame, xlator_t *this)
+afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = frame->local;
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
- local->transaction.unwind (frame, this);
+ local = frame->local;
- AFR_STACK_DESTROY (frame);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (fremovexattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
return 0;
}
int
-afr_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name, dict_t *xdata)
+afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ NULL, NULL, xdata);
+}
+
+
+int
+afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fremovexattr,
+ local->fd, local->cont.removexattr.name,
+ local->xdata_req);
+ return 0;
+}
- VALIDATE_OR_GOTO (this, out);
+
+int
+afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*",
name, op_errno, out);
@@ -1897,34 +1351,36 @@ afr_removexattr (call_frame_t *frame, xlator_t *this,
GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*",
name, op_errno, out);
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
-
- priv = this->private;
+ priv = this->private;
- QUORUM_CHECK(removexattr,out);
+ QUORUM_CHECK(fremovexattr, out);
transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
+ if (!transaction_frame)
goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (transaction_frame->local, out);
- local = transaction_frame->local;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
goto out;
local->cont.removexattr.name = gf_strdup (name);
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- local->transaction.fop = afr_removexattr_wind;
- local->transaction.done = afr_removexattr_done;
- local->transaction.unwind = afr_removexattr_unwind;
+ if (!local->xdata_req)
+ goto out;
- loc_copy (&local->loc, loc);
+ local->transaction.wind = afr_fremovexattr_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_fremovexattr_unwind;
+
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
+
+ local->op = GF_FOP_FREMOVEXATTR;
local->transaction.main_frame = frame;
local->transaction.start = LLONG_MAX - 1;
@@ -1932,210 +1388,364 @@ afr_removexattr (call_frame_t *frame, xlator_t *this,
ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
if (ret < 0) {
- op_errno = -ret;
- goto out;
+ op_errno = -ret;
+ goto out;
}
- ret = 0;
+ return 0;
out:
- if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL);
return 0;
}
-/* ffremovexattr */
+
int
-afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_fallocate_unwind (call_frame_t *frame, xlator_t *this)
{
afr_local_t * local = NULL;
call_frame_t *main_frame = NULL;
local = frame->local;
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
- if (main_frame) {
- AFR_STACK_UNWIND (fremovexattr, main_frame,
- local->op_ret, local->op_errno,
- NULL);
- }
+ AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
return 0;
}
int
-afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int call_count = -1;
- int need_unwind = 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+}
+
+
+int
+afr_fallocate_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
local = frame->local;
priv = this->private;
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->wait_count) {
- need_unwind = 1;
- }
- }
+ STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fallocate,
+ local->fd, local->cont.fallocate.mode,
+ local->cont.fallocate.offset,
+ local->cont.fallocate.len, local->xdata_req);
+ return 0;
+}
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
- if (need_unwind)
- local->transaction.unwind (frame, this);
+int
+afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- call_count = afr_frame_return (frame);
+ priv = this->private;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
+ QUORUM_CHECK(fallocate,out);
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->cont.fallocate.mode = mode;
+ local->cont.fallocate.offset = offset;
+ local->cont.fallocate.len = len;
+
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_FALLOCATE;
+
+ local->transaction.wind = afr_fallocate_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_fallocate_unwind;
+
+ local->transaction.main_frame = frame;
+
+ local->transaction.start = local->cont.fallocate.offset;
+ local->transaction.len = 0;
+
+ afr_fix_open (fd, this);
+
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
}
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+
+/* }}} */
+
+/* {{{ discard */
+
+int
+afr_discard_unwind (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
+
+ local = frame->local;
+
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+
+ AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
return 0;
}
-int32_t
-afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this)
+int
+afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+}
+
+
+int
+afr_discard_wind (call_frame_t *frame, xlator_t *this, int subvol)
{
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- int call_count = -1;
- int i = 0;
local = frame->local;
priv = this->private;
- call_count = afr_pre_op_done_children_count (local->transaction.pre_op,
- priv->child_count);
+ STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->discard,
+ local->fd, local->cont.discard.offset,
+ local->cont.discard.len, local->xdata_req);
+ return 0;
+}
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
- local->call_count = call_count;
+int
+afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i]) {
- STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fremovexattr,
- local->fd,
- local->cont.removexattr.name,
- NULL);
-
- if (!--call_count)
- break;
- }
+ priv = this->private;
+
+ QUORUM_CHECK(discard, out);
+
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->cont.discard.offset = offset;
+ local->cont.discard.len = len;
+
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_DISCARD;
+
+ local->transaction.wind = afr_discard_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_discard_unwind;
+
+ local->transaction.main_frame = frame;
+
+ local->transaction.start = local->cont.discard.offset;
+ local->transaction.len = 0;
+
+ afr_fix_open (fd, this);
+
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
}
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+
+ AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
+/* {{{ zerofill */
+
int
-afr_fremovexattr_done (call_frame_t *frame, xlator_t *this)
+afr_zerofill_unwind (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = frame->local;
+ afr_local_t * local = NULL;
+ call_frame_t *main_frame = NULL;
- local->transaction.unwind (frame, this);
+ local = frame->local;
- AFR_STACK_DESTROY (frame);
+ main_frame = afr_transaction_detach_fop_frame (frame);
+ if (!main_frame)
+ return 0;
+ AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
return 0;
}
int
-afr_fremovexattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, const char *name, dict_t *xdata)
+afr_zerofill_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_ret = -1;
- int op_errno = 0;
+ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+}
- VALIDATE_OR_GOTO (this, out);
- GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*",
- name, op_errno, out);
+int
+afr_zerofill_wind (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*",
- name, op_errno, out);
+ local = frame->local;
+ priv = this->private;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this->private, out);
+ STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, (void *) (long) subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->zerofill,
+ local->fd, local->cont.zerofill.offset,
+ local->cont.zerofill.len, local->xdata_req);
+ return 0;
+}
+
+int
+afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
priv = this->private;
- QUORUM_CHECK(fremovexattr, out);
+ QUORUM_CHECK(discard, out);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- goto out;
- }
+ transaction_frame = copy_frame (frame);
+ if (!transaction_frame)
+ goto out;
- AFR_LOCAL_ALLOC_OR_GOTO (local, out);
+ local = AFR_FRAME_INIT (transaction_frame, op_errno);
+ if (!local)
+ goto out;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ local->cont.zerofill.offset = offset;
+ local->cont.zerofill.len = len;
- transaction_frame->local = local;
+ local->fd = fd_ref (fd);
+ local->inode = inode_ref (fd->inode);
- local->op_ret = -1;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref (xdata, NULL);
+ else
+ local->xdata_req = dict_new ();
- local->cont.removexattr.name = gf_strdup (name);
+ if (!local->xdata_req)
+ goto out;
- local->transaction.fop = afr_fremovexattr_wind;
- local->transaction.done = afr_fremovexattr_done;
- local->transaction.unwind = afr_fremovexattr_unwind;
+ local->op = GF_FOP_ZEROFILL;
- local->fd = fd_ref (fd);
+ local->transaction.wind = afr_zerofill_wind;
+ local->transaction.fop = __afr_txn_write_fop;
+ local->transaction.done = __afr_txn_write_done;
+ local->transaction.unwind = afr_zerofill_unwind;
local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
- op_ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
- if (op_ret < 0) {
- op_errno = -op_ret;
- goto out;
+ local->transaction.start = local->cont.discard.offset;
+ local->transaction.len = len;
+
+ afr_fix_open (fd, this);
+
+ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
}
- op_ret = 0;
+ return 0;
out:
- if (op_ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
- AFR_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL);
- }
+ if (transaction_frame)
+ AFR_STACK_DESTROY (transaction_frame);
+ AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
}
+
+/* }}} */
diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h
index ed11079fd..7b1fc5528 100644
--- a/xlators/cluster/afr/src/afr-inode-write.h
+++ b/xlators/cluster/afr/src/afr-inode-write.h
@@ -68,4 +68,15 @@ int32_t
afr_fremovexattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, const char *name, dict_t *xdata);
+int
+afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata);
+
+int
+afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata);
+
+int
+afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata);
#endif /* __INODE_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
index 5e61be4d4..a2a758f35 100644
--- a/xlators/cluster/afr/src/afr-lk-common.c
+++ b/xlators/cluster/afr/src/afr-lk-common.c
@@ -55,7 +55,33 @@
} while (0);
int
-afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index);
+afr_entry_lockee_cmp (const void *l1, const void *l2)
+{
+ const afr_entry_lockee_t *r1 = l1;
+ const afr_entry_lockee_t *r2 = l2;
+ int ret = 0;
+ uuid_t gfid1 = {0};
+ uuid_t gfid2 = {0};
+
+ loc_gfid ((loc_t*)&r1->loc, gfid1);
+ loc_gfid ((loc_t*)&r2->loc, gfid2);
+ ret = uuid_compare (gfid1, gfid2);
+ /*Entrylks with NULL basename are the 'smallest'*/
+ if (ret == 0) {
+ if (!r1->basename)
+ return -1;
+ if (!r2->basename)
+ return 1;
+ ret = strcmp (r1->basename, r2->basename);
+ }
+
+ if (ret <= 0)
+ return -1;
+ else
+ return 1;
+}
+
+int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index);
static int
afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this);
@@ -125,16 +151,9 @@ internal_lock_count (call_frame_t *frame, xlator_t *this)
local = frame->local;
priv = this->private;
- if (local->fd) {
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] && local->fd_open_on[i])
- ++call_count;
- }
- } else {
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i])
- ++call_count;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i])
+ ++call_count;
}
return call_count;
@@ -332,10 +351,13 @@ static void
afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this,
afr_lock_call_type_t lock_call_type,
afr_lock_op_type_t lk_op_type, const char *basename,
- int32_t child_index)
+ int32_t cookie)
{
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = 0;
+ int lockee_no = 0;
char lock[256];
char lockee[256];
@@ -343,28 +365,40 @@ afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this,
local = frame->local;
int_lock = &local->internal_lock;
+ priv = this->private;
+
+ if (!priv->entrylk_trace) {
+ return;
+ }
+ lockee_no = cookie / priv->child_count;
+ child_index = cookie % priv->child_count;
afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner);
- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
+ afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd,
+ child_index);
afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
gf_log (this->name, GF_LOG_INFO,
- "[%s %s] Lock={%s} Lockee={%s} Number={%llu}",
+ "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}",
lock_call_type_str,
lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST",
lock, lockee,
- (unsigned long long) int_lock->lock_number);
+ (unsigned long long) int_lock->lock_number,
+ cookie);
}
static void
afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this,
afr_lock_call_type_t lock_call_type,
afr_lock_op_type_t lk_op_type, const char *basename,
- int op_ret, int op_errno, int32_t child_index)
+ int op_ret, int op_errno, int32_t cookie)
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int lockee_no = 0;
+ int child_index = 0;
char lock[256];
char lockee[256];
@@ -373,20 +407,30 @@ afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this,
local = frame->local;
int_lock = &local->internal_lock;
+ priv = this->private;
- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
+ if (!priv->entrylk_trace) {
+ return;
+ }
+ lockee_no = cookie / priv->child_count;
+ child_index = cookie % priv->child_count;
+
+ afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner);
+ afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd,
+ child_index);
afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
afr_print_verdict (op_ret, op_errno, verdict);
gf_log (this->name, GF_LOG_INFO,
- "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu}",
+ "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}",
lock_call_type_str,
lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY",
verdict,
lock, lockee,
- (unsigned long long) int_lock->lock_number);
+ (unsigned long long) int_lock->lock_number,
+ cookie);
}
@@ -439,6 +483,47 @@ is_afr_lock_transaction (afr_local_t *local)
return ret;
}
+int
+afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local,
+ loc_t *loc, char *basename, int child_count)
+{
+ int ret = -1;
+
+ loc_copy (&lockee->loc, loc);
+ lockee->basename = (basename)? gf_strdup (basename): NULL;
+ if (basename && !lockee->basename)
+ goto out;
+
+ lockee->locked_count = 0;
+ lockee->locked_nodes = GF_CALLOC (child_count,
+ sizeof (*lockee->locked_nodes),
+ gf_afr_mt_afr_node_character);
+
+ if (!lockee->locked_nodes)
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+
+}
+
+void
+afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock)
+{
+ int i = 0;
+
+ for (i = 0; i < int_lock->lockee_count; i++) {
+ loc_wipe (&int_lock->lockee[i].loc);
+ if (int_lock->lockee[i].basename)
+ GF_FREE (int_lock->lockee[i].basename);
+ if (int_lock->lockee[i].locked_nodes)
+ GF_FREE (int_lock->lockee[i].locked_nodes);
+ }
+
+ return;
+}
+
static int
initialize_entrylk_variables (call_frame_t *frame, xlator_t *this)
{
@@ -456,8 +541,13 @@ initialize_entrylk_variables (call_frame_t *frame, xlator_t *this)
int_lock->lock_op_ret = -1;
int_lock->lock_op_errno = 0;
- for (i = 0; i < priv->child_count; i++) {
- int_lock->entry_locked_nodes[i] = 0;
+ for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) {
+ if (!int_lock->lockee[i].locked_nodes)
+ break;
+ int_lock->lockee[i].locked_count = 0;
+ memset (int_lock->lockee[i].locked_nodes, 0,
+ sizeof (*int_lock->lockee[i].locked_nodes) *
+ priv->child_count);
}
return 0;
@@ -469,37 +559,37 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this)
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
afr_private_t *priv = NULL;
- int i = 0;
+ afr_inodelk_t *inodelk = NULL;
priv = this->private;
local = frame->local;
int_lock = &local->internal_lock;
- int_lock->inodelk_lock_count = 0;
- int_lock->lock_op_ret = -1;
- int_lock->lock_op_errno = 0;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- for (i = 0; i < priv->child_count; i++) {
- int_lock->inode_locked_nodes[i] = 0;
- }
+ inodelk->lock_count = 0;
+ int_lock->lk_attempted_count = 0;
+ int_lock->lock_op_ret = -1;
+ int_lock->lock_op_errno = 0;
+
+ memset (inodelk->locked_nodes, 0,
+ sizeof (*inodelk->locked_nodes) * priv->child_count);
+ memset (int_lock->locked_nodes, 0,
+ sizeof (*int_lock->locked_nodes) * priv->child_count);
return 0;
}
-loc_t *
-lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2)
+int
+afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock)
{
- int ret = 0;
-
- ret = uuid_compare (l1->inode->gfid, l2->inode->gfid);
+ int call_count = 0;
+ int i = 0;
- if (ret == 0)
- ret = strcmp (b1, b2);
+ for (i = 0; i < int_lock->lockee_count; i++)
+ call_count += int_lock->lockee[i].locked_count;
- if (ret <= 0)
- return l1;
- else
- return l2;
+ return call_count;
}
int
@@ -550,7 +640,9 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
afr_local_t *local = NULL;
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
int32_t child_index = (long)cookie;
+ afr_private_t *priv = NULL;
local = frame->local;
int_lock = &local->internal_lock;
@@ -559,14 +651,18 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
AFR_UNLOCK_OP, NULL, op_ret,
op_errno, child_index);
+ priv = this->private;
+
if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
- gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on %d "
- "unlock by %s", local->loc.path, child_index,
+ gf_log (this->name, GF_LOG_INFO, "%s: unlock failed on subvolume %s "
+ "with lock owner %s", local->loc.path,
+ priv->children[child_index]->name,
lkowner_utoa (&frame->root->lk_owner));
}
- int_lock->inode_locked_nodes[child_index] &= LOCKED_NO;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ inodelk->locked_nodes[child_index] &= LOCKED_NO;
if (local->transaction.eager_lock)
local->transaction.eager_lock[child_index] = 0;
@@ -580,6 +676,7 @@ static int
afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
struct gf_flock flock = {0,};
@@ -595,12 +692,14 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
priv = this->private;
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
flock.l_type = F_UNLCK;
full_flock.l_type = F_UNLCK;
- call_count = afr_locked_nodes_count (int_lock->inode_locked_nodes,
+ call_count = afr_locked_nodes_count (inodelk->locked_nodes,
priv->child_count);
int_lock->lk_call_count = call_count;
@@ -616,8 +715,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
fd_ctx = afr_fd_ctx_get (local->fd, this);
for (i = 0; i < priv->child_count; i++) {
- if ((int_lock->inode_locked_nodes[i] & LOCKED_YES)
- != LOCKED_YES)
+ if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
continue;
if (local->fd) {
@@ -658,7 +756,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long)i,
priv->children[i],
priv->children[i]->fops->finodelk,
- this->name, local->fd,
+ int_lock->domain, local->fd,
F_SETLK, flock_use, NULL);
if (!--call_count)
@@ -673,7 +771,7 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long)i,
priv->children[i],
priv->children[i]->fops->inodelk,
- this->name, &local->loc,
+ int_lock->domain, &local->loc,
F_SETLK, &flock, NULL);
if (!--call_count)
@@ -689,13 +787,22 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_local_t *local = NULL;
- int32_t child_index = (long)cookie;
+ afr_private_t *priv = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ int32_t child_index = 0;
+ int lockee_no = 0;
+
+ priv = this->private;
+ lockee_no = (int)((long) cookie) / priv->child_count;
+ child_index = (int) ((long) cookie) % priv->child_count;
local = frame->local;
+ int_lock = &local->internal_lock;
AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION,
- AFR_UNLOCK_OP, NULL, op_ret,
- op_errno, child_index);
+ AFR_UNLOCK_OP,
+ int_lock->lockee[lockee_no].basename, op_ret,
+ op_errno, (int) ((long)cookie));
if (op_ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
@@ -703,6 +810,7 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->loc.path, child_index, strerror (op_errno));
}
+ int_lock->lockee[lockee_no].locked_nodes[child_index] &= LOCKED_NO;
afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, NULL);
return 0;
@@ -711,24 +819,22 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
static int
afr_unlock_entrylk (call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- const char *basename = NULL;
- loc_t *loc = NULL;
- int call_count = 0;
- int i = -1;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int index = 0;
+ int lockee_no = 0;
+ int copies = 0;
+ int i = -1;
local = frame->local;
int_lock = &local->internal_lock;
priv = this->private;
+ copies = priv->child_count;
- basename = int_lock->lk_basename;
- if (int_lock->lk_loc)
- loc = int_lock->lk_loc;
+ call_count = afr_lockee_locked_nodes_count (int_lock);
- call_count = afr_locked_nodes_count (int_lock->entry_locked_nodes,
- priv->child_count);
int_lock->lk_call_count = call_count;
if (!call_count){
@@ -738,18 +844,22 @@ afr_unlock_entrylk (call_frame_t *frame, xlator_t *this)
goto out;
}
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->entry_locked_nodes[i] & LOCKED_YES) {
- AFR_TRACE_ENTRYLK_IN (frame, this,
- AFR_ENTRYLK_NB_TRANSACTION,
- AFR_UNLOCK_OP, basename, i);
+ for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) {
+ lockee_no = i / copies;
+ index = i % copies;
+ if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) {
+ AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION,
+ AFR_UNLOCK_OP,
+ int_lock->lockee[lockee_no].basename,
+ i);
STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk,
(void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name,
- loc, basename,
+ priv->children[index],
+ priv->children[index]->fops->entrylk,
+ int_lock->domain,
+ &int_lock->lockee[lockee_no].loc,
+ int_lock->lockee[lockee_no].basename,
ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
if (!--call_count)
@@ -766,13 +876,20 @@ static int32_t
afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- int child_index = (long) cookie;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int cky = (long) cookie;
+ int child_index = 0;
+ int lockee_no = 0;
+ priv = this->private;
local = frame->local;
int_lock = &local->internal_lock;
+ child_index = ((int)cky) % priv->child_count;
+ lockee_no = ((int)cky) / priv->child_count;
+
LOCK (&frame->lock);
{
if (op_ret == -1) {
@@ -788,6 +905,8 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->op_errno = op_errno;
int_lock->lock_op_errno = op_errno;
}
+
+ int_lock->lk_attempted_count++;
}
UNLOCK (&frame->lock);
@@ -796,10 +915,17 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
afr_unlock (frame, this);
} else {
if (op_ret == 0) {
- int_lock->locked_nodes[child_index] |= LOCKED_YES;
- int_lock->lock_count++;
+ if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ int_lock->lockee[lockee_no].locked_nodes[child_index] |= LOCKED_YES;
+ int_lock->lockee[lockee_no].locked_count++;
+ int_lock->entrylk_lock_count++;
+ } else {
+ int_lock->locked_nodes[child_index] |= LOCKED_YES;
+ int_lock->lock_count++;
+ }
}
- afr_lock_blocking (frame, this, child_index + 1);
+ afr_lock_blocking (frame, this, cky + 1);
}
return 0;
@@ -819,79 +945,6 @@ afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
static int32_t
-afr_lock_lower_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- loc_t *lower = NULL;
- loc_t *higher = NULL;
- const char *higher_name = NULL;
- int child_index = (long) cookie;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
-
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/locks xlator on server");
-
- local->op_ret = op_ret;
- }
-
- local->op_errno = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- if (op_ret != 0) {
- afr_copy_locked_nodes (frame, this);
- afr_unlock (frame, this);
- goto out;
- } else {
- int_lock->lower_locked_nodes[child_index] |= LOCKED_LOWER;
- int_lock->lock_count++;
- }
-
- /* The lower path has been locked. Now lock the higher path */
-
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- higher = (lower == &local->transaction.parent_loc ?
- &local->transaction.new_parent_loc :
- &local->transaction.parent_loc);
-
- higher_name = (higher == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, higher_name, child_index);
-
-
- STACK_WIND_COOKIE (frame, afr_lock_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- this->name, higher, higher_name,
- ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
-
-out:
- return 0;
-}
-
-static int32_t
afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
@@ -907,6 +960,7 @@ static int
afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
@@ -917,18 +971,16 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- memcpy (int_lock->inode_locked_nodes,
- int_lock->locked_nodes,
- priv->child_count);
- int_lock->inodelk_lock_count = int_lock->lock_count;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ memcpy (inodelk->locked_nodes, int_lock->locked_nodes,
+ sizeof (*inodelk->locked_nodes) * priv->child_count);
+ inodelk->lock_count = int_lock->lock_count;
break;
case AFR_ENTRY_RENAME_TRANSACTION:
case AFR_ENTRY_TRANSACTION:
- memcpy (int_lock->entry_locked_nodes,
- int_lock->locked_nodes,
- priv->child_count);
- int_lock->entrylk_lock_count = int_lock->lock_count;
+ /*entrylk_count is being used in both non-blocking and blocking
+ * modes */
break;
}
@@ -936,25 +988,67 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
}
+static inline gf_boolean_t
+afr_is_entrylk (afr_internal_lock_t *int_lock,
+ afr_transaction_type trans_type)
+{
+ gf_boolean_t is_entrylk = _gf_false;
+
+ if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) &&
+ int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) {
+
+ is_entrylk = _gf_true;
+
+ } else if ((int_lock->transaction_lk_type == AFR_TRANSACTION_LK) &&
+ (trans_type == AFR_ENTRY_TRANSACTION ||
+ trans_type == AFR_ENTRY_RENAME_TRANSACTION)) {
+
+ is_entrylk = _gf_true;
+
+ } else {
+ is_entrylk = _gf_false;
+ }
+
+ return is_entrylk;
+}
+
+static gf_boolean_t
+_is_lock_wind_needed (afr_local_t *local, int child_index)
+{
+ if (!local->child_up[child_index])
+ return _gf_false;
+
+ return _gf_true;
+}
+
int
-afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
+afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
- loc_t *lower = NULL;
- const char *lower_name = NULL;
struct gf_flock flock = {0,};
uint64_t ctx = 0;
int ret = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
- flock.l_type = int_lock->lk_flock.l_type;
+ int child_index = 0;
+ int lockee_no = 0;
+ gf_boolean_t is_entrylk = _gf_false;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+ child_index = cookie % priv->child_count;
+ lockee_no = cookie / priv->child_count;
+ is_entrylk = afr_is_entrylk (int_lock, local->transaction.type);
+
+
+ if (!is_entrylk) {
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
+ flock.l_type = inodelk->flock.l_type;
+ }
if (local->fd) {
ret = fd_ctx_get (local->fd, this, &ctx);
@@ -973,42 +1067,26 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
return 0;
}
-
- /* skip over children that or down
- or don't have the fd open */
-
- while ((child_index < priv->child_count)
- && (!local->child_up[child_index] ||
- !local->fd_open_on[child_index]))
-
- child_index++;
- } else {
- /* skip over children that are down */
- while ((child_index < priv->child_count)
- && !local->child_up[child_index])
- child_index++;
}
- if ((child_index == priv->child_count) &&
- int_lock->lock_count == 0) {
-
- gf_log (this->name, GF_LOG_INFO,
- "unable to lock on even one child");
-
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
+ if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
+ if ((is_entrylk && int_lock->entrylk_lock_count == 0) ||
+ (!is_entrylk && int_lock->lock_count == 0)) {
+ gf_log (this->name, GF_LOG_INFO,
+ "unable to lock on even one child");
- afr_copy_locked_nodes (frame, this);
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
- afr_unlock(frame, this);
+ afr_copy_locked_nodes (frame, this);
- return 0;
+ afr_unlock(frame, this);
+ return 0;
+ }
}
- if ((child_index == priv->child_count)
- || (int_lock->lock_count == int_lock->lk_expected_count)) {
-
+ if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
/* we're done locking */
gf_log (this->name, GF_LOG_DEBUG,
@@ -1021,6 +1099,11 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
return 0;
}
+ if (!_is_lock_wind_needed (local, child_index)) {
+ afr_lock_blocking (frame, this, cookie + 1);
+ return 0;
+ }
+
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
@@ -1035,7 +1118,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
(void *) (long) child_index,
priv->children[child_index],
priv->children[child_index]->fops->finodelk,
- this->name, local->fd,
+ int_lock->domain, local->fd,
F_SETLKW, &flock, NULL);
} else {
@@ -1048,50 +1131,29 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
(void *) (long) child_index,
priv->children[child_index],
priv->children[child_index]->fops->inodelk,
- this->name, &local->loc,
+ int_lock->domain, &local->loc,
F_SETLKW, &flock, NULL);
}
break;
case AFR_ENTRY_RENAME_TRANSACTION:
- {
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- lower_name = (lower == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, lower_name, child_index);
-
-
- STACK_WIND_COOKIE (frame, afr_lock_lower_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- this->name, lower, lower_name,
- ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
-
- break;
- }
-
case AFR_ENTRY_TRANSACTION:
+ /*Accounting for child_index increments on 'down'
+ *and 'fd-less' children */
+
if (local->fd) {
- AFR_TRACE_ENTRYLK_IN (frame, this,
- AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, local->transaction.basename,
- child_index);
+ AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION,
+ AFR_LOCK_OP,
+ int_lock->lockee[lockee_no].basename,
+ cookie);
STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
- (void *) (long) child_index,
+ (void *) (long) cookie,
priv->children[child_index],
priv->children[child_index]->fops->fentrylk,
- this->name, local->fd,
- local->transaction.basename,
+ int_lock->domain, local->fd,
+ int_lock->lockee[lockee_no].basename,
ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
} else {
AFR_TRACE_ENTRYLK_IN (frame, this,
@@ -1100,12 +1162,12 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index)
child_index);
STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
- (void *) (long) child_index,
+ (void *) (long) cookie,
priv->children[child_index],
priv->children[child_index]->fops->entrylk,
- this->name,
- &local->transaction.parent_loc,
- local->transaction.basename,
+ int_lock->domain,
+ &int_lock->lockee[lockee_no].loc,
+ int_lock->lockee[lockee_no].basename,
ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
}
@@ -1134,11 +1196,11 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this)
break;
case AFR_ENTRY_RENAME_TRANSACTION:
- up_count = afr_up_children_count (local->child_up,
- priv->child_count);
- int_lock->lk_expected_count = 2 * up_count;
- //fallthrough
case AFR_ENTRY_TRANSACTION:
+ up_count = AFR_COUNT (local->child_up, priv->child_count);
+ int_lock->lk_call_count = int_lock->lk_expected_count
+ = (int_lock->lockee_count *
+ up_count);
initialize_entrylk_variables (frame, this);
break;
}
@@ -1154,35 +1216,48 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
+ int call_count = 0;
+ int child_index = (long) cookie;
+ int copies = 0;
+ int index = 0;
+ int lockee_no = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ copies = priv->child_count;
+ index = child_index % copies;
+ lockee_no = child_index / copies;
local = frame->local;
int_lock = &local->internal_lock;
AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, NULL, op_ret,
+ AFR_LOCK_OP,
+ int_lock->lockee[lockee_no].basename, op_ret,
op_errno, (long) cookie);
- if (op_ret < 0 ) {
- if (op_errno == ENOSYS) {
+ LOCK (&frame->lock);
+ {
+ if (op_ret < 0 ) {
+ if (op_errno == ENOSYS) {
/* return ENOTSUP */
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/locks xlator on server");
- local->op_ret = op_ret;
- int_lock->lock_op_ret = op_ret;
-
- int_lock->lock_op_errno = op_errno;
- local->op_errno = op_errno;
- }
- } else if (op_ret == 0) {
- int_lock->entry_locked_nodes[child_index] |= LOCKED_YES;
- int_lock->entrylk_lock_count++;
- }
+ gf_log (this->name, GF_LOG_ERROR,
+ "subvolume does not support locking. "
+ "please load features/locks xlator on server");
+ local->op_ret = op_ret;
+ int_lock->lock_op_ret = op_ret;
+
+ int_lock->lock_op_errno = op_errno;
+ local->op_errno = op_errno;
+ }
+ } else if (op_ret == 0) {
+ int_lock->lockee[lockee_no].locked_nodes[index] |= \
+ LOCKED_YES;
+ int_lock->lockee[lockee_no].locked_count++;
+ int_lock->entrylk_lock_count++;
+ }
- LOCK (&frame->lock);
- {
call_count = --int_lock->lk_call_count;
}
UNLOCK (&frame->lock);
@@ -1212,42 +1287,26 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-void
-afr_mark_fd_open_on (afr_local_t *local, afr_fd_ctx_t *fd_ctx,
- size_t child_count)
-{
- int i = 0;
-
- GF_ASSERT (local->fd_open_on);
-
- memset (local->fd_open_on, 0, sizeof (*local->fd_open_on)*child_count);
- for (i = 0; i < child_count; i++)
- if (fd_ctx->opened_on[i] == AFR_FD_OPENED)
- local->fd_open_on[i] = 1;
-}
-
int
afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- const char *basename = NULL;
- loc_t *loc = NULL;
- int32_t call_count = 0;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int copies = 0;
+ int index = 0;
+ int lockee_no = 0;
+ int32_t call_count = 0;
int i = 0;
local = frame->local;
int_lock = &local->internal_lock;
priv = this->private;
+ copies = priv->child_count;
initialize_entrylk_variables (frame, this);
- basename = int_lock->lk_basename;
- if (int_lock->lk_loc)
- loc = int_lock->lk_loc;
-
if (local->fd) {
fd_ctx = afr_fd_ctx_get (local->fd, this);
if (!fd_ctx) {
@@ -1260,11 +1319,11 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
local->op_errno = EINVAL;
int_lock->lock_op_errno = EINVAL;
+ afr_unlock (frame, this);
return -1;
}
- afr_mark_fd_open_on (local, fd_ctx, priv->child_count);
- call_count = internal_lock_count (frame, this);
+ call_count = int_lock->lockee_count * internal_lock_count (frame, this);
int_lock->lk_call_count = call_count;
int_lock->lk_expected_count = call_count;
@@ -1277,46 +1336,52 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
/* Send non-blocking entrylk calls only on up children
and where the fd has been opened */
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] && local->fd_open_on[i]) {
- AFR_TRACE_ENTRYLK_IN (frame, this,
- AFR_ENTRYLK_NB_TRANSACTION,
- AFR_LOCK_OP, basename, i);
+ for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) {
+ index = i%copies;
+ lockee_no = i/copies;
+ if (local->child_up[index]) {
+ AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION,
+ AFR_LOCK_OP,
+ int_lock->lockee[lockee_no].basename,
+ i);
STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
(void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fentrylk,
+ priv->children[index],
+ priv->children[index]->fops->fentrylk,
this->name, local->fd,
- basename,
+ int_lock->lockee[lockee_no].basename,
ENTRYLK_LOCK_NB, ENTRYLK_WRLCK,
NULL);
+ if (!--call_count)
+ break;
}
}
} else {
- GF_ASSERT (loc);
-
- call_count = internal_lock_count (frame, this);
+ call_count = int_lock->lockee_count * internal_lock_count (frame, this);
int_lock->lk_call_count = call_count;
int_lock->lk_expected_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- AFR_TRACE_ENTRYLK_IN (frame, this,
- AFR_ENTRYLK_NB_TRANSACTION,
- AFR_LOCK_OP, basename, i);
+ for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) {
+ index = i%copies;
+ lockee_no = i/copies;
+ if (local->child_up[index]) {
+ AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION,
+ AFR_LOCK_OP,
+ int_lock->lockee[lockee_no].basename,
+ i);
STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
(void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name, loc, basename,
+ priv->children[index],
+ priv->children[index]->fops->entrylk,
+ this->name, &int_lock->lockee[lockee_no].loc,
+ int_lock->lockee[lockee_no].basename,
ENTRYLK_LOCK_NB, ENTRYLK_WRLCK,
NULL);
if (!--call_count)
break;
-
}
}
}
@@ -1329,6 +1394,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
int call_count = 0;
int child_index = (long) cookie;
@@ -1337,58 +1403,57 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
int_lock = &local->internal_lock;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION,
AFR_LOCK_OP, NULL, op_ret,
op_errno, (long) cookie);
- if (op_ret < 0) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
- gf_log (this->name, GF_LOG_ERROR,
- "subvolume does not support locking. "
- "please load features/locks xlator on server");
- local->op_ret = op_ret;
- int_lock->lock_op_ret = op_ret;
- int_lock->lock_op_errno = op_errno;
- local->op_errno = op_errno;
- }
- if (local->transaction.eager_lock)
- local->transaction.eager_lock[child_index] = 0;
- } else {
- int_lock->inode_locked_nodes[child_index]
- |= LOCKED_YES;
- int_lock->inodelk_lock_count++;
-
- if (local->transaction.eager_lock &&
- local->transaction.eager_lock[child_index] && local->fd) {
- fd_ctx = afr_fd_ctx_get (local->fd, this);
- /* piggybacked */
-
- if (op_ret == 1) {
- /* piggybacked */
- } else if (op_ret == 0) {
- /* lock acquired from server */
- LOCK (&local->fd->lock);
- {
- fd_ctx->lock_acquired[child_index]++;
- }
- UNLOCK (&local->fd->lock);
- }
- }
- }
+ if (local->fd)
+ fd_ctx = afr_fd_ctx_get (local->fd, this);
LOCK (&frame->lock);
{
+ if (op_ret < 0) {
+ if (op_errno == ENOSYS) {
+ /* return ENOTSUP */
+ gf_log (this->name, GF_LOG_ERROR,
+ "subvolume does not support locking. "
+ "please load features/locks xlator on "
+ "server");
+ local->op_ret = op_ret;
+ int_lock->lock_op_ret = op_ret;
+ int_lock->lock_op_errno = op_errno;
+ local->op_errno = op_errno;
+ }
+ if (local->transaction.eager_lock)
+ local->transaction.eager_lock[child_index] = 0;
+ } else {
+ inodelk->locked_nodes[child_index] |= LOCKED_YES;
+ inodelk->lock_count++;
+
+ if (local->transaction.eager_lock &&
+ local->transaction.eager_lock[child_index] &&
+ local->fd) {
+ /* piggybacked */
+ if (op_ret == 1) {
+ /* piggybacked */
+ } else if (op_ret == 0) {
+ /* lock acquired from server */
+ fd_ctx->lock_acquired[child_index]++;
+ }
+ }
+ }
+
call_count = --int_lock->lk_call_count;
}
UNLOCK (&frame->lock);
+
if (call_count == 0) {
gf_log (this->name, GF_LOG_TRACE,
"Last inode locking reply received");
/* all locks successful. Proceed to call FOP */
- if (int_lock->inodelk_lock_count ==
- int_lock->lk_expected_count) {
+ if (inodelk->lock_count == int_lock->lk_expected_count) {
gf_log (this->name, GF_LOG_TRACE,
"All servers locked. Calling the cbk");
int_lock->lock_op_ret = 0;
@@ -1412,6 +1477,7 @@ int
afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
afr_local_t *local = NULL;
afr_private_t *priv = NULL;
afr_fd_ctx_t *fd_ctx = NULL;
@@ -1427,11 +1493,13 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
priv = this->private;
- flock.l_start = int_lock->lk_flock.l_start;
- flock.l_len = int_lock->lk_flock.l_len;
- flock.l_type = int_lock->lk_flock.l_type;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+
+ flock.l_start = inodelk->flock.l_start;
+ flock.l_len = inodelk->flock.l_len;
+ flock.l_type = inodelk->flock.l_type;
- full_flock.l_type = int_lock->lk_flock.l_type;
+ full_flock.l_type = inodelk->flock.l_type;
initialize_inodelk_variables (frame, this);
@@ -1447,11 +1515,11 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
local->op_errno = EINVAL;
int_lock->lock_op_errno = EINVAL;
+ afr_unlock (frame, this);
ret = -1;
goto out;
}
- afr_mark_fd_open_on (local, fd_ctx, priv->child_count);
call_count = internal_lock_count (frame, this);
int_lock->lk_call_count = call_count;
int_lock->lk_expected_count = call_count;
@@ -1466,11 +1534,11 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
/* Send non-blocking inodelk calls only on up children
and where the fd has been opened */
for (i = 0; i < priv->child_count; i++) {
- if (!local->child_up[i] || !local->fd_open_on[i])
+ if (!local->child_up[i])
continue;
flock_use = &flock;
- if (!priv->eager_lock) {
+ if (!local->transaction.eager_lock_on) {
goto wind;
}
@@ -1506,7 +1574,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long) i,
priv->children[i],
priv->children[i]->fops->finodelk,
- this->name, local->fd,
+ int_lock->domain, local->fd,
F_SETLK, flock_use, NULL);
if (!--call_count)
@@ -1528,7 +1596,7 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
(void *) (long) i,
priv->children[i],
priv->children[i]->fops->inodelk,
- this->name, &local->loc,
+ int_lock->domain, &local->loc,
F_SETLK, &flock, NULL);
if (!--call_count)
@@ -1539,201 +1607,6 @@ out:
return ret;
}
-static int
-__is_lower_locked (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int count = 0;
- int i = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER)
- count++;
- }
-
- return count;
-
-}
-
-static int
-__is_higher_locked (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int count = 0;
- int i = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->locked_nodes[i] & LOCKED_YES)
- count++;
- }
-
- return count;
-
-}
-
-static int
-afr_unlock_lower_entrylk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- const char *basename = NULL;
- loc_t *loc = NULL;
- int call_count = 0;
- int i = -1;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- basename = int_lock->lk_basename;
- if (int_lock->lk_loc)
- loc = int_lock->lk_loc;
-
- call_count = __is_lower_locked (frame, this);
- int_lock->lk_call_count = call_count;
-
- if (!call_count){
- gf_log (this->name, GF_LOG_TRACE,
- "No internal locks unlocked");
- int_lock->lock_cbk (frame, this);
- goto out;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (int_lock->lower_locked_nodes[i] & LOCKED_LOWER) {
- AFR_TRACE_ENTRYLK_IN (frame, this,
- AFR_ENTRYLK_NB_TRANSACTION,
- AFR_UNLOCK_OP, basename, i);
-
- STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- this->name,
- loc, basename,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
-
- if (!--call_count)
- break;
-
- }
- }
-
-out:
- return 0;
-
-}
-
-
-static int
-afr_post_unlock_higher_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->transaction.done (frame, this);
- return 0;
-}
-
-static int
-afr_post_unlock_lower_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- loc_t *lower = NULL;
- loc_t *higher = NULL;
- const char *higher_name = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- higher = (lower == &local->transaction.parent_loc ?
- &local->transaction.new_parent_loc :
- &local->transaction.parent_loc);
-
- higher_name = (higher == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- if (__is_higher_locked (frame, this)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlocking higher");
- int_lock->lk_basename = higher_name;
- int_lock->lk_loc = higher;
- int_lock->lock_cbk = afr_post_unlock_higher_cbk;
-
- afr_unlock_entrylk (frame, this);
- } else
- local->transaction.done (frame, this);
-
- return 0;
-}
-
-static int
-afr_rename_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- loc_t *lower = NULL;
- const char *lower_name = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- lower = lower_path (&local->transaction.parent_loc,
- local->transaction.basename,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename);
-
- lower_name = (lower == &local->transaction.parent_loc ?
- local->transaction.basename :
- local->transaction.new_basename);
-
- if (__is_lower_locked (frame, this)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "unlocking lower");
- int_lock->lk_basename = lower_name;
- int_lock->lk_loc = lower;
- int_lock->lock_cbk = afr_post_unlock_lower_cbk;
-
- afr_unlock_lower_entrylk (frame, this);
- } else
- afr_post_unlock_lower_cbk (frame, this);
-
- return 0;
-}
-
-static int
-afr_rename_transaction (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- return (local->transaction.type ==
- AFR_ENTRY_RENAME_TRANSACTION);
-
-}
-
int32_t
afr_unlock (call_frame_t *frame, xlator_t *this)
{
@@ -1745,10 +1618,8 @@ afr_unlock (call_frame_t *frame, xlator_t *this)
if (is_afr_lock_transaction (local))
afr_unlock_inodelk (frame, this);
else
- if (!afr_rename_transaction (frame, this))
- afr_unlock_entrylk (frame, this);
- else
- afr_rename_unlock (frame, this);
+ afr_unlock_entrylk (frame, this);
+
} else {
if (is_afr_lock_selfheal (local))
afr_unlock_inodelk (frame, this);
@@ -1760,518 +1631,37 @@ afr_unlock (call_frame_t *frame, xlator_t *this)
}
int
-afr_mark_locked_nodes (xlator_t *this, fd_t *fd,
- unsigned char *locked_nodes)
-{
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fdctx = NULL;
- uint64_t tmp = 0;
- int ret = 0;
-
- priv = this->private;
-
- ret = afr_fd_ctx_set (this, fd);
- if (ret)
- goto out;
-
- ret = fd_ctx_get (fd, this, &tmp);
- if (ret) {
- gf_log (this->name, GF_LOG_INFO,
- "failed to get the fd ctx");
- goto out;
- }
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- GF_ASSERT (fdctx->locked_on);
-
- memcpy (fdctx->locked_on, locked_nodes,
- priv->child_count);
-
-out:
- return ret;
-}
-
-static int
-__is_fd_saved (xlator_t *this, fd_t *fd)
-{
- afr_locked_fd_t *locked_fd = NULL;
- afr_private_t *priv = NULL;
- int found = 0;
-
- priv = this->private;
-
- list_for_each_entry (locked_fd, &priv->saved_fds, list) {
- if (locked_fd->fd == fd) {
- found = 1;
- break;
- }
- }
-
- return found;
-}
-
-static int
-__afr_save_locked_fd (xlator_t *this, fd_t *fd)
-{
- afr_private_t *priv = NULL;
- afr_locked_fd_t *locked_fd = NULL;
- int ret = 0;
-
- priv = this->private;
-
- locked_fd = GF_CALLOC (1, sizeof (*locked_fd),
- gf_afr_mt_locked_fd);
- if (!locked_fd) {
- ret = -1;
- goto out;
- }
-
- locked_fd->fd = fd;
- INIT_LIST_HEAD (&locked_fd->list);
-
- list_add_tail (&locked_fd->list, &priv->saved_fds);
-
-out:
- return ret;
-}
-
-int
-afr_save_locked_fd (xlator_t *this, fd_t *fd)
-{
- afr_private_t *priv = NULL;
- int ret = 0;
-
- priv = this->private;
-
- pthread_mutex_lock (&priv->mutex);
- {
- if (__is_fd_saved (this, fd)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "fd=%p already saved", fd);
- goto unlock;
- }
-
- ret = __afr_save_locked_fd (this, fd);
- if (ret) {
- gf_log (this->name, GF_LOG_INFO,
- "fd=%p could not be saved", fd);
- goto unlock;
- }
- }
-unlock:
- pthread_mutex_unlock (&priv->mutex);
-
- return ret;
-}
-
-static int
-afr_lock_recovery_cleanup (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_locked_fd_t *locked_fd = NULL;
-
- local = frame->local;
-
- locked_fd = local->locked_fd;
-
- STACK_DESTROY (frame->root);
- afr_local_cleanup (local, this);
-
- afr_save_locked_fd (this, locked_fd->fd);
-
- return 0;
-
-}
-
-static int
-afr_get_source_lock_recovery (xlator_t *this, fd_t *fd)
-{
- afr_fd_ctx_t *fdctx = NULL;
- afr_private_t *priv = NULL;
- uint64_t tmp = 0;
- int i = 0;
- int source_child = -1;
- int ret = 0;
-
- priv = this->private;
-
- ret = fd_ctx_get (fd, this, &tmp);
- if (ret)
- goto out;
-
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- for (i = 0; i < priv->child_count; i++) {
- if (fdctx->locked_on[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Found lock recovery source=%d", i);
- source_child = i;
- break;
- }
- }
-
-out:
- return source_child;
-
-}
-
-int32_t
-afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
- dict_t *xdata);
-int32_t
-afr_recover_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
- dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int32_t source_child = 0;
- struct gf_flock flock = {0,};
-
- local = frame->local;
- priv = this->private;
-
- if (op_ret) {
- gf_log (this->name, GF_LOG_INFO,
- "lock recovery failed");
- goto cleanup;
- }
-
- source_child = local->source_child;
-
- memcpy (&flock, lock, sizeof (*lock));
-
- STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk,
- (void *) (long) source_child,
- priv->children[source_child],
- priv->children[source_child]->fops->lk,
- local->fd, F_GETLK_FD, &flock, NULL);
-
- return 0;
-
-cleanup:
- afr_lock_recovery_cleanup (frame, this);
- return 0;
-}
-
-int
-afr_recover_lock (call_frame_t *frame, xlator_t *this,
- struct gf_flock *flock)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int32_t lock_recovery_child = 0;
-
- priv = this->private;
- local = frame->local;
-
- lock_recovery_child = local->lock_recovery_child;
-
- frame->root->lk_owner = flock->l_owner;
-
- STACK_WIND_COOKIE (frame, afr_recover_lock_cbk,
- (void *) (long) lock_recovery_child,
- priv->children[lock_recovery_child],
- priv->children[lock_recovery_child]->fops->lk,
- local->fd, F_SETLK, flock, NULL);
-
- return 0;
-}
-
-static int
-is_afr_lock_eol (struct gf_flock *lock)
-{
- int ret = 0;
-
- if ((lock->l_type == GF_LK_EOL))
- ret = 1;
-
- return ret;
-}
-
-int32_t
-afr_get_locks_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
- dict_t *xdata)
-{
- if (op_ret) {
- gf_log (this->name, GF_LOG_INFO,
- "Failed to get locks on fd");
- goto cleanup;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Got a lock on fd");
-
- if (is_afr_lock_eol (lock)) {
- gf_log (this->name, GF_LOG_INFO,
- "Reached EOL on locks on fd");
- goto cleanup;
- }
-
- afr_recover_lock (frame, this, lock);
-
- return 0;
-
-cleanup:
- afr_lock_recovery_cleanup (frame, this);
-
- return 0;
-}
-
-static int
-afr_lock_recovery (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- fd_t *fd = NULL;
- int ret = 0;
- int32_t source_child = 0;
- struct gf_flock flock = {0,};
-
- priv = this->private;
- local = frame->local;
-
- fd = local->fd;
-
- source_child = afr_get_source_lock_recovery (this, fd);
- if (source_child < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not recover locks due to lock "
- "split brain");
- ret = -1;
- goto out;
- }
-
- local->source_child = source_child;
-
- /* the flock can be zero filled as we're querying incrementally
- the locks held on the fd.
- */
- STACK_WIND_COOKIE (frame, afr_get_locks_fd_cbk,
- (void *) (long) source_child,
- priv->children[source_child],
- priv->children[source_child]->fops->lk,
- local->fd, F_GETLK_FD, &flock, NULL);
-
-out:
- return ret;
-}
-
-
-static int
-afr_mark_fd_opened (xlator_t *this, fd_t *fd, int32_t child_index)
-{
- afr_fd_ctx_t *fdctx = NULL;
- uint64_t tmp = 0;
- int ret = 0;
-
- ret = fd_ctx_get (fd, this, &tmp);
- if (ret)
- goto out;
-
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- fdctx->opened_on[child_index] = AFR_FD_OPENED;
-
-out:
- return ret;
-}
-
-int32_t
-afr_lock_recovery_preopen_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd,
- dict_t *xdata)
-{
- int32_t child_index = (long )cookie;
- int ret = 0;
-
- if (op_ret) {
- gf_log (this->name, GF_LOG_INFO,
- "Reopen during lock-recovery failed");
- goto cleanup;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Open succeeded => proceed to recover locks");
-
- ret = afr_lock_recovery (frame, this);
- if (ret) {
- gf_log (this->name, GF_LOG_INFO,
- "Lock recovery failed");
- goto cleanup;
- }
-
- ret = afr_mark_fd_opened (this, fd, child_index);
- if (ret) {
- gf_log (this->name, GF_LOG_INFO,
- "Marking fd open failed");
- goto cleanup;
- }
-
- return 0;
-
-cleanup:
- afr_lock_recovery_cleanup (frame, this);
- return 0;
-}
-
-static int
-afr_lock_recovery_preopen (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- uint64_t tmp = 0;
- afr_fd_ctx_t *fdctx = NULL;
- loc_t loc = {0,};
- int32_t child_index = 0;
- int ret = 0;
-
- priv = this->private;
- local = frame->local;
-
- GF_ASSERT (local && local->fd);
-
- ret = fd_ctx_get (local->fd, this, &tmp);
- if (ret)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to get the context of fd",
- uuid_utoa (local->fd->inode->gfid));
- fdctx = (afr_fd_ctx_t *) (long) tmp;
- /* TODO: instead we should return from the function */
- GF_ASSERT (fdctx);
-
- child_index = local->lock_recovery_child;
-
- inode_path (local->fd->inode, NULL, (char **)&loc.path);
- loc.name = strrchr (loc.path, '/');
- loc.inode = inode_ref (local->fd->inode);
- loc.parent = inode_parent (local->fd->inode, 0, NULL);
-
-
- STACK_WIND_COOKIE (frame, afr_lock_recovery_preopen_cbk,
- (void *)(long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->open,
- &loc, fdctx->flags, local->fd, NULL);
-
- return 0;
-}
-
-static int
-is_fd_opened (fd_t *fd, int32_t child_index)
-{
- afr_fd_ctx_t *fdctx = NULL;
- uint64_t tmp = 0;
- int ret = 0;
-
- ret = fd_ctx_get (fd, THIS, &tmp);
- if (ret)
- goto out;
-
- fdctx = (afr_fd_ctx_t *) (long) tmp;
-
- if (fdctx->opened_on[child_index] == AFR_FD_OPENED)
- ret = 1;
-
-out:
- return ret;
-}
-
-int
-afr_attempt_lock_recovery (xlator_t *this, int32_t child_index)
-{
- call_frame_t *frame = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_locked_fd_t *locked_fd = NULL;
- afr_locked_fd_t *tmp = NULL;
- int ret = -1;
- struct list_head locks_list = {0,};
- int32_t op_errno = 0;
-
-
- priv = this->private;
-
- if (list_empty (&priv->saved_fds))
- goto out;
-
- frame = create_frame (this, this->ctx->pool);
- if (!frame) {
- ret = -1;
- goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0) {
- ret = -1;
- goto out;
- }
-
- frame->local = local;
-
- INIT_LIST_HEAD (&locks_list);
-
- pthread_mutex_lock (&priv->mutex);
- {
- list_splice_init (&priv->saved_fds, &locks_list);
- }
- pthread_mutex_unlock (&priv->mutex);
-
- list_for_each_entry_safe (locked_fd, tmp,
- &locks_list, list) {
-
- list_del_init (&locked_fd->list);
-
- local->fd = fd_ref (locked_fd->fd);
- local->lock_recovery_child = child_index;
- local->locked_fd = locked_fd;
-
- if (!is_fd_opened (locked_fd->fd, child_index)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "attempting open before lock "
- "recovery");
- afr_lock_recovery_preopen (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "attempting lock recovery "
- "without a preopen");
- afr_lock_recovery (frame, this);
- }
- }
-
-out:
- if ((ret < 0) && frame)
- AFR_STACK_DESTROY (frame);
- return ret;
-}
-
-void
-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src,
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
unsigned int child_count)
{
- afr_local_t *dst_local = NULL;
- afr_local_t *src_local = NULL;
- afr_internal_lock_t *dst_lock = NULL;
- afr_internal_lock_t *src_lock = NULL;
+ afr_local_t *dst_local = NULL;
+ afr_local_t *src_local = NULL;
+ afr_internal_lock_t *dst_lock = NULL;
+ afr_internal_lock_t *src_lock = NULL;
+ afr_inodelk_t *dst_inodelk = NULL;
+ afr_inodelk_t *src_inodelk = NULL;
+ int ret = -1;
- dst_local = dst->local;
- dst_lock = &dst_local->internal_lock;
src_local = src->local;
src_lock = &src_local->internal_lock;
- if (src_lock->inode_locked_nodes) {
- memcpy (dst_lock->inode_locked_nodes,
- src_lock->inode_locked_nodes,
- sizeof (*dst_lock->inode_locked_nodes) * child_count);
- memset (src_lock->inode_locked_nodes, 0,
- sizeof (*src_lock->inode_locked_nodes) * child_count);
+ src_inodelk = afr_get_inodelk (src_lock, dom);
+ dst_local = dst->local;
+ dst_lock = &dst_local->internal_lock;
+ dst_inodelk = afr_get_inodelk (dst_lock, dom);
+ if (!dst_inodelk || !src_inodelk)
+ goto out;
+ if (src_inodelk->locked_nodes) {
+ memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes,
+ sizeof (*dst_inodelk->locked_nodes) * child_count);
+ memset (src_inodelk->locked_nodes, 0,
+ sizeof (*src_inodelk->locked_nodes) * child_count);
}
dst_lock->transaction_lk_type = src_lock->transaction_lk_type;
dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type;
- dst_lock->inodelk_lock_count = src_lock->inodelk_lock_count;
- src_lock->inodelk_lock_count = 0;
+ dst_inodelk->lock_count = src_inodelk->lock_count;
+ src_inodelk->lock_count = 0;
+ ret = 0;
+out:
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index 7e684801f..05df90cc0 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -41,6 +41,8 @@ enum gf_afr_mem_types_ {
gf_afr_mt_shd_event_t,
gf_afr_mt_time_t,
gf_afr_mt_pos_data_t,
+ gf_afr_mt_reply_t,
+ gf_afr_mt_subvol_healer_t,
gf_afr_mt_end
};
#endif
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
index c0be197f2..f86aa7fd8 100644
--- a/xlators/cluster/afr/src/afr-open.c
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -43,85 +43,29 @@
#include "afr-dir-read.h"
#include "afr-dir-write.h"
#include "afr-transaction.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-int
-afr_stale_child_up (afr_local_t *local, xlator_t *this)
-{
- int i = 0;
- afr_private_t *priv = NULL;
- int up = -1;
-
- priv = this->private;
- if (!local->fresh_children)
- local->fresh_children = afr_children_create (priv->child_count);
- if (!local->fresh_children)
- goto out;
-
- afr_inode_get_read_ctx (this, local->fd->inode, local->fresh_children);
- if (priv->child_count == afr_get_children_count (local->fresh_children,
- priv->child_count))
- goto out;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!local->child_up[i])
- continue;
- if (afr_is_child_present (local->fresh_children,
- priv->child_count, i))
- continue;
- up = i;
- break;
- }
-out:
- return up;
-}
-
-void
-afr_perform_data_self_heal (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_is_fd_fixable (fd_t *fd)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- inode_t *inode = NULL;
- int st_child = -1;
- char reason[64] = {0};
-
- local = frame->local;
- sh = &local->self_heal;
- inode = local->fd->inode;
-
- if (!IA_ISREG (inode->ia_type))
- goto out;
-
- st_child = afr_stale_child_up (local, this);
- if (st_child < 0)
- goto out;
-
- sh->do_data_self_heal = _gf_true;
- sh->do_metadata_self_heal = _gf_true;
- sh->do_gfid_self_heal = _gf_true;
- sh->do_missing_entry_self_heal = _gf_true;
-
- snprintf (reason, sizeof (reason), "stale subvolume %d detected",
- st_child);
- afr_launch_self_heal (frame, this, inode, _gf_true, inode->ia_type,
- reason, NULL, NULL);
-out:
- return;
+ if (!fd || !fd->inode)
+ return _gf_false;
+ else if (fd_is_anonymous (fd))
+ return _gf_false;
+ else if (uuid_is_null (fd->inode->gfid))
+ return _gf_false;
+
+ return _gf_true;
}
+
int
afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
afr_local_t * local = frame->local;
- afr_private_t *priv = NULL;
- priv = this->private;
- if (afr_open_only_data_self_heal (priv->data_self_heal))
- afr_perform_data_self_heal (frame, this);
AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno,
local->fd, xdata);
return 0;
@@ -134,49 +78,38 @@ afr_open_cbk (call_frame_t *frame, void *cookie,
fd_t *fd, dict_t *xdata)
{
afr_local_t * local = NULL;
- int ret = 0;
int call_count = -1;
int child_index = (long) cookie;
- afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
- priv = this->private;
local = frame->local;
+ fd_ctx = local->fd_ctx;
LOCK (&frame->lock);
{
if (op_ret == -1) {
local->op_errno = op_errno;
- }
-
- if (op_ret >= 0) {
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ } else {
local->op_ret = op_ret;
- local->success_count++;
-
- ret = afr_child_fd_ctx_set (this, fd, child_index,
- local->cont.open.flags);
- if (ret) {
- local->op_ret = -1;
- local->op_errno = -ret;
- goto unlock;
- }
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref (xdata);
}
}
-unlock:
UNLOCK (&frame->lock);
call_count = afr_frame_return (frame);
if (call_count == 0) {
- if ((local->cont.open.flags & O_TRUNC)
- && (local->op_ret >= 0)) {
+ if ((fd_ctx->flags & O_TRUNC) && (local->op_ret >= 0)) {
STACK_WIND (frame, afr_open_ftruncate_cbk,
this, this->fops->ftruncate,
fd, 0, NULL);
} else {
- if (afr_open_only_data_self_heal (priv->data_self_heal))
- afr_perform_data_self_heal (frame, this);
AFR_STACK_UNWIND (open, frame, local->op_ret,
- local->op_errno, local->fd, xdata);
+ local->op_errno, local->fd,
+ local->xdata_rsp);
}
}
@@ -190,16 +123,11 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
afr_private_t * priv = NULL;
afr_local_t * local = NULL;
int i = 0;
- int ret = -1;
int32_t call_count = 0;
int32_t op_errno = 0;
- int32_t wind_flags = flags & (~O_TRUNC);
- //We can't let truncation to happen outside transaction.
+ afr_fd_ctx_t *fd_ctx = NULL;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (loc, out);
+ //We can't let truncation to happen outside transaction.
priv = this->private;
@@ -207,240 +135,203 @@ afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
QUORUM_CHECK(open,out);
}
- if (afr_is_split_brain (this, loc->inode)) {
- /* self-heal failed */
- gf_log (this->name, GF_LOG_WARNING,
- "failed to open as split brain seen, returning EIO");
- op_errno = EIO;
- goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- call_count = local->call_count;
- loc_copy (&local->loc, loc);
+ local->fd = fd_ref (fd);
+ local->fd_ctx = fd_ctx;
+ fd_ctx->flags = flags;
- local->cont.open.flags = flags;
+ call_count = local->call_count;
- local->fd = fd_ref (fd);
+ local->cont.open.flags = flags;
for (i = 0; i < priv->child_count; i++) {
if (local->child_up[i]) {
STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i,
priv->children[i],
priv->children[i]->fops->open,
- loc, wind_flags, fd, xdata);
-
+ loc, (flags & ~O_TRUNC), fd, xdata);
if (!--call_count)
break;
}
}
- ret = 0;
+ return 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, xdata);
+ AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, NULL);
return 0;
}
-//NOTE: this function should be called with holding the lock on
-//fd to which fd_ctx belongs
-void
-afr_get_resumable_calls (xlator_t *this, afr_fd_ctx_t *fd_ctx,
- struct list_head *list)
-{
- afr_fd_paused_call_t *paused_call = NULL;
- afr_fd_paused_call_t *tmp = NULL;
- afr_local_t *call_local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- gf_boolean_t call = _gf_false;
-
- priv = this->private;
- list_for_each_entry_safe (paused_call, tmp, &fd_ctx->paused_calls,
- call_list) {
- call = _gf_true;
- call_local = paused_call->frame->local;
- for (i = 0; i < priv->child_count; i++) {
- if (call_local->child_up[i] &&
- (fd_ctx->opened_on[i] == AFR_FD_OPENING))
- call = _gf_false;
- }
-
- if (call) {
- list_del_init (&paused_call->call_list);
- list_add (&paused_call->call_list, list);
- }
- }
-}
-
-void
-afr_resume_calls (xlator_t *this, struct list_head *list)
-{
- afr_fd_paused_call_t *paused_call = NULL;
- afr_fd_paused_call_t *tmp = NULL;
- afr_local_t *call_local = NULL;
-
- list_for_each_entry_safe (paused_call, tmp, list, call_list) {
- list_del_init (&paused_call->call_list);
- call_local = paused_call->frame->local;
- call_local->fop_call_continue (paused_call->frame, this);
- GF_FREE (paused_call);
- }
-}
-
int
afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
- struct list_head paused_calls = {0};
- gf_boolean_t fop_paused = _gf_false;
- fd_t *local_fd = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int call_count = 0;
+ int child_index = (long) cookie;
priv = this->private;
local = frame->local;
- fop_paused = local->fop_paused;
if (op_ret >= 0) {
- gf_log (this->name, GF_LOG_INFO, "fd for %s opened "
+ gf_log (this->name, GF_LOG_DEBUG, "fd for %s opened "
"successfully on subvolume %s", local->loc.path,
priv->children[child_index]->name);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to open %s "
+ "on subvolume %s", local->loc.path,
+ priv->children[child_index]->name);
}
- local_fd = fd_ref (local->fd);
- call_count = afr_frame_return (frame);
- //Note: Do not access any thing using the frame outside call_count 0
+ fd_ctx = local->fd_ctx;
- //Note: No frame locking needed for this block of code
- fd_ctx = afr_fd_ctx_get (local_fd, this);
- if (!fd_ctx) {
- gf_log (this->name, GF_LOG_WARNING,
- "failed to get fd context, %p", local_fd);
- goto out;
- }
-
- LOCK (&local_fd->lock);
+ LOCK (&local->fd->lock);
{
if (op_ret >= 0) {
fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
} else {
fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
}
- if (call_count == 0) {
- INIT_LIST_HEAD (&paused_calls);
- afr_get_resumable_calls (this, fd_ctx, &paused_calls);
- }
- }
- UNLOCK (&local_fd->lock);
-out:
- if (call_count == 0) {
- afr_resume_calls (this, &paused_calls);
- //If the fop is paused then resume_calls will continue the fop
- if (fop_paused)
- goto done;
-
- if (local->fop_call_continue)
- local->fop_call_continue (frame, this);
- else
- AFR_STACK_DESTROY (frame);
}
+ UNLOCK (&local->fd->lock);
+
+ call_count = afr_frame_return (frame);
+ if (call_count == 0)
+ AFR_STACK_DESTROY (frame);
-done:
- fd_unref (local_fd);
return 0;
}
-int
-afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx,
- int need_open_count, int *need_open)
+
+static int
+afr_fd_ctx_need_open (fd_t *fd, xlator_t *this, unsigned char *need_open)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- call_frame_t *open_frame = NULL;
- afr_local_t *open_local = NULL;
- int ret = -1;
- ia_type_t ia_type = IA_INVAL;
- int32_t op_errno = 0;
-
- GF_ASSERT (fd_ctx);
- GF_ASSERT (need_open_count > 0);
- GF_ASSERT (need_open);
+ afr_fd_ctx_t *fd_ctx = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int count = 0;
+
+ priv = this->private;
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ return 0;
+
+ LOCK (&fd->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED &&
+ priv->child_up[i]) {
+ fd_ctx->opened_on[i] = AFR_FD_OPENING;
+ need_open[i] = 1;
+ count++;
+ } else {
+ need_open[i] = 0;
+ }
+ }
+ }
+ UNLOCK (&fd->lock);
+
+ return count;
+}
+
+
+void
+afr_fix_open (fd_t *fd, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int32_t op_errno = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ unsigned char *need_open = NULL;
+ int call_count = 0;
- local = frame->local;
priv = this->private;
- if (!local->fop_call_continue) {
- open_frame = copy_frame (frame);
- if (!open_frame) {
- ret = -ENOMEM;
- goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (open_frame->local, out);
- open_local = open_frame->local;
- ret = afr_local_init (open_local, priv, &op_errno);
- if (ret < 0)
- goto out;
- loc_copy (&open_local->loc, &local->loc);
- open_local->fd = fd_ref (local->fd);
- } else {
- ret = 0;
- open_frame = frame;
- open_local = local;
- }
- open_local->call_count = need_open_count;
+ if (!afr_is_fd_fixable (fd))
+ goto out;
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ goto out;
+
+ need_open = alloca0 (priv->child_count);
+
+ call_count = afr_fd_ctx_need_open (fd, this, need_open);
+ if (!call_count)
+ goto out;
+
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame)
+ goto out;
+
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->loc.inode = inode_ref (fd->inode);
+ ret = loc_path (&local->loc, NULL);
+ if (ret < 0)
+ goto out;
+
+ local->fd = fd_ref (fd);
+ local->fd_ctx = fd_ctx;
+
+ local->call_count = call_count;
gf_log (this->name, GF_LOG_DEBUG, "need open count: %d",
- need_open_count);
+ call_count);
- ia_type = open_local->fd->inode->ia_type;
- GF_ASSERT (ia_type != IA_INVAL);
for (i = 0; i < priv->child_count; i++) {
if (!need_open[i])
continue;
- if (IA_IFDIR == ia_type) {
+
+ if (IA_IFDIR == fd->inode->ia_type) {
gf_log (this->name, GF_LOG_DEBUG,
"opening fd for dir %s on subvolume %s",
local->loc.path, priv->children[i]->name);
- STACK_WIND_COOKIE (open_frame, afr_openfd_fix_open_cbk,
+ STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk,
(void*) (long) i,
priv->children[i],
priv->children[i]->fops->opendir,
- &open_local->loc, open_local->fd,
+ &local->loc, local->fd,
NULL);
} else {
gf_log (this->name, GF_LOG_DEBUG,
"opening fd for file %s on subvolume %s",
local->loc.path, priv->children[i]->name);
- STACK_WIND_COOKIE (open_frame, afr_openfd_fix_open_cbk,
+ STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk,
(void *)(long) i,
priv->children[i],
priv->children[i]->fops->open,
- &open_local->loc,
+ &local->loc,
fd_ctx->flags & (~O_TRUNC),
- open_local->fd, NULL);
+ local->fd, NULL);
}
+ if (!--call_count)
+ break;
}
- op_errno = 0;
- ret = 0;
+
+ return;
out:
- if (op_errno)
- ret = -op_errno;
- if (ret && open_frame)
- AFR_STACK_DESTROY (open_frame);
- return ret;
+ if (frame)
+ AFR_STACK_DESTROY (frame);
}
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
new file mode 100644
index 000000000..186f68c33
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -0,0 +1,239 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "afr.h"
+#include "afr-transaction.h"
+
+int
+afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int subvol = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->readable[i]) {
+ /* don't even bother trying here.
+ just mark as attempted and move on. */
+ local->read_attempted[i] = 1;
+ continue;
+ }
+
+ if (!local->read_attempted[i]) {
+ subvol = i;
+ break;
+ }
+ }
+
+ /* If no more subvols were available for reading, we leave
+ @subvol as -1, which is an indication we have run out of
+ readable subvols. */
+ if (subvol != -1)
+ local->read_attempted[subvol] = 1;
+ local->readfn (frame, this, subvol);
+
+ return 0;
+}
+
+
+int
+afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
+{
+ afr_local_t *local = NULL;
+ int read_subvol = 0;
+ int event_generation = 0;
+ inode_t *inode = NULL;
+ int ret = -1;
+
+ local = frame->local;
+ inode = local->inode;
+
+ if (err) {
+ local->op_errno = -err;
+ local->op_ret = -1;
+ read_subvol = -1;
+ goto readfn;
+ }
+
+ ret = afr_inode_read_subvol_type_get (inode, this, local->readable,
+ &event_generation,
+ local->transaction.type);
+
+ if (ret == -1 || !event_generation) {
+ /* Even after refresh, we don't have a good
+ read subvolume. Time to bail */
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ read_subvol = -1;
+ goto readfn;
+ }
+
+ read_subvol = afr_read_subvol_select_by_policy (inode, this,
+ local->readable);
+
+ if (read_subvol == -1) {
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto readfn;
+ }
+
+ if (local->read_attempted[read_subvol]) {
+ afr_read_txn_next_subvol (frame, this);
+ return 0;
+ }
+
+ local->read_attempted[read_subvol] = 1;
+readfn:
+ local->readfn (frame, this, read_subvol);
+
+ return 0;
+}
+
+
+int
+afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (!local->refreshed) {
+ local->refreshed = _gf_true;
+ afr_inode_refresh (frame, this, local->inode,
+ afr_read_txn_refresh_done);
+ } else {
+ afr_read_txn_next_subvol (frame, this);
+ }
+
+ return 0;
+}
+
+
+/* afr_read_txn_wipe:
+
+ clean internal variables in @local in order to make
+ it possible to call afr_read_txn() multiple times from
+ the same frame
+*/
+
+void
+afr_read_txn_wipe (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ local->readfn = NULL;
+
+ if (local->inode)
+ inode_unref (local->inode);
+
+ for (i = 0; i < priv->child_count; i++) {
+ local->read_attempted[i] = 0;
+ local->readable[i] = 0;
+ }
+}
+
+
+/*
+ afr_read_txn:
+
+ This is the read transaction function. The way it works:
+
+ - Determine read-subvolume from inode ctx.
+
+ - If read-subvolume's generation was stale, refresh ctx once by
+ calling afr_inode_refresh()
+
+ Else make an attempt to read on read-subvolume.
+
+ - If attempted read on read-subvolume fails, refresh ctx once
+ by calling afr_inode_refresh()
+
+ - After ctx refresh, query read-subvolume freshly and attempt
+ read once.
+
+ - If read fails, try every other readable[] subvolume before
+ finally giving up. readable[] elements are set by afr_inode_refresh()
+ based on dirty and pending flags.
+
+ - If file is in split brain in the backend, generation will be
+ kept 0 by afr_inode_refresh() and readable[] will be set 0 for
+ all elements. Therefore reads always fail.
+*/
+
+int
+afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_read_txn_wind_t readfn, afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int read_subvol = -1;
+ int event_generation = 0;
+ int ret = -1;
+
+ priv = this->private;
+ local = frame->local;
+
+ afr_read_txn_wipe (frame, this);
+
+ local->readfn = readfn;
+ local->inode = inode_ref (inode);
+
+ local->transaction.type = type;
+ ret = afr_inode_read_subvol_type_get (inode, this, local->readable,
+ &event_generation, type);
+ if (ret == -1)
+ /* very first transaction on this inode */
+ goto refresh;
+
+ if (local->event_generation != event_generation)
+ /* servers have disconnected / reconnected, and possibly
+ rebooted, very likely changing the state of freshness
+ of copies */
+ goto refresh;
+
+ read_subvol = afr_read_subvol_select_by_policy (inode, this,
+ local->readable);
+
+ if (read_subvol < 0 || read_subvol > priv->child_count) {
+ gf_log (this->name, GF_LOG_WARNING, "Unreadable subvolume %d "
+ "found with event generation %d", read_subvol,
+ event_generation);
+ goto refresh;
+ }
+
+ if (!local->child_up[read_subvol]) {
+ /* should never happen, just in case */
+ gf_log (this->name, GF_LOG_WARNING, "subvolume %d is the "
+ "read subvolume in this generation, but is not up",
+ read_subvol);
+ goto refresh;
+ }
+
+ local->read_attempted[read_subvol] = 1;
+
+ local->readfn (frame, this, read_subvol);
+
+ return 0;
+
+refresh:
+ afr_inode_refresh (frame, this, inode, afr_read_txn_refresh_done);
+
+ return 0;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.c b/xlators/cluster/afr/src/afr-self-heal-algorithm.c
deleted file mode 100644
index 9ad405920..000000000
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.c
+++ /dev/null
@@ -1,809 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-
-#include <openssl/md5.h>
-#include "glusterfs.h"
-#include "afr.h"
-#include "xlator.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-#include "afr-self-heal-algorithm.h"
-
-/*
- This file contains the various self-heal algorithms
-*/
-
-static int
-sh_loop_driver (call_frame_t *sh_frame, xlator_t *this,
- gf_boolean_t is_first_call, call_frame_t *old_loop_frame);
-static int
-sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame,
- int32_t op_ret, int32_t op_errno);
-static int
-sh_destroy_frame (call_frame_t *frame, xlator_t *this)
-{
- if (!frame)
- goto out;
-
- AFR_STACK_DESTROY (frame);
-out:
- return 0;
-}
-
-static void
-sh_private_cleanup (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_private_t *sh_priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh_priv = sh->private;
- GF_FREE (sh_priv);
-}
-
-static int
-sh_number_of_writes_needed (unsigned char *write_needed, int child_count)
-{
- int writes = 0;
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (write_needed[i])
- writes++;
- }
-
- return writes;
-}
-
-
-static int
-sh_loop_driver_done (call_frame_t *sh_frame, xlator_t *this,
- call_frame_t *last_loop_frame)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_private_t *sh_priv = NULL;
- int32_t total_blocks = 0;
- int32_t diff_blocks = 0;
-
- local = sh_frame->local;
- sh = &local->self_heal;
- sh_priv = sh->private;
- if (sh_priv) {
- total_blocks = sh_priv->total_blocks;
- diff_blocks = sh_priv->diff_blocks;
- }
-
- sh_private_cleanup (sh_frame, this);
- if (sh->op_failed) {
- GF_ASSERT (!last_loop_frame);
- //loop_finish should have happened and the old_loop should be NULL
- gf_log (this->name, GF_LOG_DEBUG,
- "self-heal aborting on %s",
- local->loc.path);
-
- local->self_heal.algo_abort_cbk (sh_frame, this);
- } else {
- GF_ASSERT (last_loop_frame);
- if (diff_blocks == total_blocks) {
- gf_log (this->name, GF_LOG_DEBUG, "full self-heal "
- "completed on %s",local->loc.path);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "diff self-heal on %s: completed. "
- "(%d blocks of %d were different (%.2f%%))",
- local->loc.path, diff_blocks, total_blocks,
- ((diff_blocks * 1.0)/total_blocks) * 100);
- }
-
- sh->old_loop_frame = last_loop_frame;
- local->self_heal.algo_completion_cbk (sh_frame, this);
- }
-
- return 0;
-}
-
-int
-sh_loop_finish (call_frame_t *loop_frame, xlator_t *this)
-{
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
-
- if (!loop_frame)
- goto out;
-
- loop_local = loop_frame->local;
- if (loop_local) {
- loop_sh = &loop_local->self_heal;
- }
-
- if (loop_sh && loop_sh->data_lock_held) {
- afr_sh_data_unlock (loop_frame, this,
- sh_destroy_frame);
- } else {
- sh_destroy_frame (loop_frame, this);
- }
-out:
- return 0;
-}
-
-static int
-sh_loop_lock_success (call_frame_t *loop_frame, xlator_t *this)
-{
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
-
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- sh_loop_finish (loop_sh->old_loop_frame, this);
- loop_sh->old_loop_frame = NULL;
-
- gf_log (this->name, GF_LOG_DEBUG, "Acquired lock for range %"PRIu64
- " %"PRIu64, loop_sh->offset, loop_sh->block_size);
- loop_sh->data_lock_held = _gf_true;
- loop_sh->sh_data_algo_start (loop_frame, this);
- return 0;
-}
-
-static int
-sh_loop_lock_failure (call_frame_t *loop_frame, xlator_t *this)
-{
- call_frame_t *sh_frame = NULL;
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
-
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
- sh_frame = loop_sh->sh_frame;
-
- gf_log (this->name, GF_LOG_ERROR, "failed lock for range %"PRIu64
- " %"PRIu64, loop_sh->offset, loop_sh->block_size);
- sh_loop_finish (loop_sh->old_loop_frame, this);
- loop_sh->old_loop_frame = NULL;
- sh_loop_return (sh_frame, this, loop_frame, -1, ENOTCONN);
- return 0;
-}
-
-static int
-sh_loop_frame_create (call_frame_t *sh_frame, xlator_t *this,
- call_frame_t *old_loop_frame, call_frame_t **loop_frame)
-{
- call_frame_t *new_loop_frame = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_local_t *new_loop_local = NULL;
- afr_self_heal_t *new_loop_sh = NULL;
- afr_private_t *priv = NULL;
-
- GF_ASSERT (sh_frame);
- GF_ASSERT (loop_frame);
-
- *loop_frame = NULL;
- local = sh_frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- new_loop_frame = copy_frame (sh_frame);
- if (!new_loop_frame)
- goto out;
- //We want the frame to have same lk_owner as sh_frame
- //so that locks translator allows conflicting locks
- new_loop_local = afr_local_copy (local, this);
- if (!new_loop_local)
- goto out;
- new_loop_frame->local = new_loop_local;
-
- new_loop_sh = &new_loop_local->self_heal;
- new_loop_sh->sources = memdup (sh->sources,
- priv->child_count * sizeof (*sh->sources));
- if (!new_loop_sh->sources)
- goto out;
- new_loop_sh->write_needed = GF_CALLOC (priv->child_count,
- sizeof (*new_loop_sh->write_needed),
- gf_afr_mt_char);
- if (!new_loop_sh->write_needed)
- goto out;
- new_loop_sh->checksum = GF_CALLOC (priv->child_count, MD5_DIGEST_LENGTH,
- gf_afr_mt_uint8_t);
- if (!new_loop_sh->checksum)
- goto out;
- new_loop_sh->inode = inode_ref (sh->inode);
- new_loop_sh->sh_data_algo_start = sh->sh_data_algo_start;
- new_loop_sh->source = sh->source;
- new_loop_sh->active_sinks = sh->active_sinks;
- new_loop_sh->healing_fd = fd_ref (sh->healing_fd);
- new_loop_sh->file_has_holes = sh->file_has_holes;
- new_loop_sh->old_loop_frame = old_loop_frame;
- new_loop_sh->sh_frame = sh_frame;
- *loop_frame = new_loop_frame;
- return 0;
-out:
- sh_destroy_frame (new_loop_frame, this);
- return -ENOMEM;
-}
-
-static int
-sh_loop_start (call_frame_t *sh_frame, xlator_t *this, off_t offset,
- call_frame_t *old_loop_frame)
-{
- call_frame_t *new_loop_frame = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_local_t *new_loop_local = NULL;
- afr_self_heal_t *new_loop_sh = NULL;
- int ret = 0;
-
- GF_ASSERT (sh_frame);
-
- local = sh_frame->local;
- sh = &local->self_heal;
-
- ret = sh_loop_frame_create (sh_frame, this, old_loop_frame,
- &new_loop_frame);
- if (ret)
- goto out;
- new_loop_local = new_loop_frame->local;
- new_loop_sh = &new_loop_local->self_heal;
- new_loop_sh->offset = offset;
- new_loop_sh->block_size = sh->block_size;
- afr_sh_data_lock (new_loop_frame, this, offset, new_loop_sh->block_size,
- sh_loop_lock_success, sh_loop_lock_failure);
- return 0;
-out:
- sh->op_failed = 1;
- if (old_loop_frame)
- sh_loop_finish (old_loop_frame, this);
- sh_loop_return (sh_frame, this, new_loop_frame, -1, ENOMEM);
- return 0;
-}
-
-static int
-sh_loop_driver (call_frame_t *sh_frame, xlator_t *this,
- gf_boolean_t is_first_call, call_frame_t *old_loop_frame)
-{
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- afr_sh_algo_private_t *sh_priv = NULL;
- gf_boolean_t is_driver_done = _gf_false;
- blksize_t block_size = 0;
- int loop = 0;
- off_t offset = 0;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- local = sh_frame->local;
- sh = &local->self_heal;
- sh_priv = sh->private;
-
- LOCK (&sh_priv->lock);
- {
- if (!is_first_call)
- sh_priv->loops_running--;
- offset = sh_priv->offset;
- block_size = sh->block_size;
- while ((!sh->eof_reached) && (0 == sh->op_failed) &&
- (sh_priv->loops_running < priv->data_self_heal_window_size)
- && (sh_priv->offset < sh->file_size)) {
-
- loop++;
- sh_priv->offset += block_size;
- sh_priv->loops_running++;
-
- if (!is_first_call)
- break;
- }
- if (0 == sh_priv->loops_running) {
- is_driver_done = _gf_true;
- }
- }
- UNLOCK (&sh_priv->lock);
-
- if (0 == loop) {
- //loop finish does unlock, but the erasing of the pending
- //xattrs needs to happen before that so do not finish the loop
- if (is_driver_done && !sh->op_failed)
- goto driver_done;
- if (old_loop_frame) {
- sh_loop_finish (old_loop_frame, this);
- old_loop_frame = NULL;
- }
- }
-
- //If we have more loops to form we should finish previous loop after
- //the next loop lock
- while (loop--) {
- if (sh->op_failed) {
- // op failed in other loop, stop spawning more loops
- if (old_loop_frame) {
- sh_loop_finish (old_loop_frame, this);
- old_loop_frame = NULL;
- }
- sh_loop_driver (sh_frame, this, _gf_false, NULL);
- } else {
- gf_log (this->name, GF_LOG_TRACE, "spawning a loop "
- "for offset %"PRId64, offset);
-
- sh_loop_start (sh_frame, this, offset, old_loop_frame);
- old_loop_frame = NULL;
- offset += block_size;
- }
- }
-
-driver_done:
- if (is_driver_done) {
- sh_loop_driver_done (sh_frame, this, old_loop_frame);
- }
- return 0;
-}
-
-static int
-sh_loop_return (call_frame_t *sh_frame, xlator_t *this, call_frame_t *loop_frame,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t * loop_local = NULL;
- afr_self_heal_t * loop_sh = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
-
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- if (loop_frame) {
- loop_local = loop_frame->local;
- if (loop_local)
- loop_sh = &loop_local->self_heal;
- if (loop_sh)
- gf_log (this->name, GF_LOG_TRACE, "loop for offset "
- "%"PRId64" returned", loop_sh->offset);
- }
-
- if (op_ret == -1) {
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- if (loop_frame) {
- sh_loop_finish (loop_frame, this);
- loop_frame = NULL;
- }
- }
-
- sh_loop_driver (sh_frame, this, _gf_false, loop_frame);
-
- return 0;
-}
-
-static int
-sh_loop_write_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *postbuf, dict_t *xdata)
-{
- afr_private_t * priv = NULL;
- afr_local_t * loop_local = NULL;
- afr_self_heal_t * loop_sh = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = 0;
- int child_index = 0;
-
- priv = this->private;
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- sh_frame = loop_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- child_index = (long) cookie;
-
- gf_log (this->name, GF_LOG_TRACE,
- "wrote %d bytes of data from %s to child %d, offset %"PRId64"",
- op_ret, sh_local->loc.path, child_index, loop_sh->offset);
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "write to %s failed on subvolume %s (%s)",
- sh_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->op_failed = 1;
- afr_sh_set_error (loop_sh, op_errno);
- }
-
- call_count = afr_frame_return (loop_frame);
-
- if (call_count == 0) {
- sh_loop_return (sh_frame, this, loop_frame,
- loop_sh->op_ret, loop_sh->op_errno);
- }
-
- return 0;
-}
-
-static void
-sh_prune_writes_needed (call_frame_t *sh_frame, call_frame_t *loop_frame,
- afr_private_t *priv)
-{
- afr_local_t *sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
- int i = 0;
-
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- if (!strcmp (sh->algo->name, "diff"))
- return;
-
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- /* full self-heal guarantees there exists atleast 1 file with size 0
- * That means for other files we can preserve holes that come after
- * its size before 'trim'
- */
- for (i = 0; i < priv->child_count; i++) {
- if (loop_sh->write_needed[i] &&
- ((loop_sh->offset + 1) > sh->buf[i].ia_size))
- loop_sh->write_needed[i] = 0;
- }
-}
-
-static int
-sh_loop_read_cbk (call_frame_t *loop_frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iovec *vector, int32_t count, struct iatt *buf,
- struct iobref *iobref, dict_t *xdata)
-{
- afr_private_t * priv = NULL;
- afr_local_t * loop_local = NULL;
- afr_self_heal_t * loop_sh = NULL;
- call_frame_t *sh_frame = NULL;
- int i = 0;
- int call_count = 0;
- afr_local_t * sh_local = NULL;
- afr_self_heal_t * sh = NULL;
-
- priv = this->private;
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- sh_frame = loop_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- gf_log (this->name, GF_LOG_TRACE,
- "read %d bytes of data from %s, offset %"PRId64"",
- op_ret, loop_local->loc.path, loop_sh->offset);
-
- if (op_ret <= 0) {
- if (op_ret < 0) {
- sh->op_failed = 1;
- gf_log (this->name, GF_LOG_ERROR, "read failed on %d "
- "for %s reason :%s", sh->source,
- sh_local->loc.path, strerror (errno));
- } else {
- sh->eof_reached = _gf_true;
- gf_log (this->name, GF_LOG_DEBUG, "Eof reached for %s",
- sh_local->loc.path);
- }
- sh_loop_return (sh_frame, this, loop_frame, op_ret, op_errno);
- goto out;
- }
-
- if (loop_sh->file_has_holes && iov_0filled (vector, count) == 0)
- sh_prune_writes_needed (sh_frame, loop_frame, priv);
-
- call_count = sh_number_of_writes_needed (loop_sh->write_needed,
- priv->child_count);
- if (call_count == 0) {
- sh_loop_return (sh_frame, this, loop_frame, 0, 0);
- goto out;
- }
- loop_local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!loop_sh->write_needed[i])
- continue;
- STACK_WIND_COOKIE (loop_frame, sh_loop_write_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->writev,
- loop_sh->healing_fd, vector, count,
- loop_sh->offset, 0, iobref, NULL);
-
- if (!--call_count)
- break;
- }
-
-out:
- return 0;
-}
-
-
-static int
-sh_loop_read (call_frame_t *loop_frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
-
- priv = this->private;
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- STACK_WIND_COOKIE (loop_frame, sh_loop_read_cbk,
- (void *) (long) loop_sh->source,
- priv->children[loop_sh->source],
- priv->children[loop_sh->source]->fops->readv,
- loop_sh->healing_fd, loop_sh->block_size,
- loop_sh->offset, 0, NULL);
-
- return 0;
-}
-
-
-static int
-sh_diff_checksum_cbk (call_frame_t *loop_frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- uint32_t weak_checksum, uint8_t *strong_checksum,
- dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t *sh_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_sh_algo_private_t *sh_priv = NULL;
- int child_index = 0;
- int call_count = 0;
- int i = 0;
- int write_needed = 0;
-
- priv = this->private;
-
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- sh_frame = loop_sh->sh_frame;
- sh_local = sh_frame->local;
- sh = &sh_local->self_heal;
-
- sh_priv = sh->private;
-
- child_index = (long) cookie;
-
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "checksum on %s failed on subvolume %s (%s)",
- sh_local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- } else {
- memcpy (loop_sh->checksum + child_index * MD5_DIGEST_LENGTH,
- strong_checksum, MD5_DIGEST_LENGTH);
- }
-
- call_count = afr_frame_return (loop_frame);
-
- if (call_count == 0) {
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] || !sh_local->child_up[i])
- continue;
-
- if (memcmp (loop_sh->checksum + (i * MD5_DIGEST_LENGTH),
- loop_sh->checksum + (sh->source * MD5_DIGEST_LENGTH),
- MD5_DIGEST_LENGTH)) {
- /*
- Checksums differ, so this block
- must be written to this sink
- */
-
- gf_log (this->name, GF_LOG_DEBUG,
- "checksum on subvolume %s at offset %"
- PRId64" differs from that on source",
- priv->children[i]->name, loop_sh->offset);
-
- write_needed = loop_sh->write_needed[i] = 1;
- }
- }
-
- LOCK (&sh_priv->lock);
- {
- sh_priv->total_blocks++;
- if (write_needed)
- sh_priv->diff_blocks++;
- }
- UNLOCK (&sh_priv->lock);
-
- if (write_needed && !sh->op_failed) {
- sh_loop_read (loop_frame, this);
- } else {
- sh_loop_return (sh_frame, this, loop_frame,
- op_ret, op_errno);
- }
- }
-
- return 0;
-}
-
-static int
-sh_diff_checksum (call_frame_t *loop_frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
- int call_count = 0;
- int i = 0;
-
- priv = this->private;
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- call_count = loop_sh->active_sinks + 1; /* sinks and source */
-
- loop_local->call_count = call_count;
-
- STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk,
- (void *) (long) loop_sh->source,
- priv->children[loop_sh->source],
- priv->children[loop_sh->source]->fops->rchecksum,
- loop_sh->healing_fd,
- loop_sh->offset, loop_sh->block_size, NULL);
-
- for (i = 0; i < priv->child_count; i++) {
- if (loop_sh->sources[i] || !loop_local->child_up[i])
- continue;
-
- STACK_WIND_COOKIE (loop_frame, sh_diff_checksum_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->rchecksum,
- loop_sh->healing_fd,
- loop_sh->offset, loop_sh->block_size, NULL);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-}
-
-static int
-sh_full_read_write_to_sinks (call_frame_t *loop_frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *loop_local = NULL;
- afr_self_heal_t *loop_sh = NULL;
- int i = 0;
-
- priv = this->private;
- loop_local = loop_frame->local;
- loop_sh = &loop_local->self_heal;
-
- for (i = 0; i < priv->child_count; i++) {
- if (loop_sh->sources[i] || !loop_local->child_up[i])
- continue;
- loop_sh->write_needed[i] = 1;
- }
- sh_loop_read (loop_frame, this);
- return 0;
-}
-
-afr_sh_algo_private_t*
-afr_sh_priv_init ()
-{
- afr_sh_algo_private_t *sh_priv = NULL;
-
- sh_priv = GF_CALLOC (1, sizeof (*sh_priv),
- gf_afr_mt_afr_private_t);
- if (!sh_priv)
- goto out;
-
- LOCK_INIT (&sh_priv->lock);
-out:
- return sh_priv;
-}
-
-void
-afr_sh_transfer_lock (call_frame_t *dst, call_frame_t *src,
- unsigned int child_count)
-{
- afr_local_t *dst_local = NULL;
- afr_self_heal_t *dst_sh = NULL;
- afr_local_t *src_local = NULL;
- afr_self_heal_t *src_sh = NULL;
-
- dst_local = dst->local;
- dst_sh = &dst_local->self_heal;
- src_local = src->local;
- src_sh = &src_local->self_heal;
- GF_ASSERT (src_sh->data_lock_held);
- GF_ASSERT (!dst_sh->data_lock_held);
- afr_lk_transfer_datalock (dst, src, child_count);
- src_sh->data_lock_held = _gf_false;
- dst_sh->data_lock_held = _gf_true;
-}
-
-int
-afr_sh_start_loops (call_frame_t *sh_frame, xlator_t *this,
- afr_sh_algo_fn sh_data_algo_start)
-{
- call_frame_t *first_loop_frame = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int ret = 0;
- afr_private_t *priv = NULL;
-
- local = sh_frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- sh->sh_data_algo_start = sh_data_algo_start;
- local->call_count = 0;
- ret = sh_loop_frame_create (sh_frame, this, NULL, &first_loop_frame);
- if (ret)
- goto out;
- afr_sh_transfer_lock (first_loop_frame, sh_frame, priv->child_count);
- sh->private = afr_sh_priv_init ();
- if (!sh->private) {
- ret = -1;
- goto out;
- }
- sh_loop_driver (sh_frame, this, _gf_true, first_loop_frame);
- ret = 0;
-out:
- if (ret) {
- sh->op_failed = 1;
- sh_loop_driver_done (sh_frame, this, NULL);
- }
- return 0;
-}
-
-int
-afr_sh_algo_diff (call_frame_t *sh_frame, xlator_t *this)
-{
- afr_sh_start_loops (sh_frame, this, sh_diff_checksum);
- return 0;
-}
-
-int
-afr_sh_algo_full (call_frame_t *sh_frame, xlator_t *this)
-{
- afr_sh_start_loops (sh_frame, this, sh_full_read_write_to_sinks);
- return 0;
-}
-
-struct afr_sh_algorithm afr_self_heal_algorithms[] = {
- {.name = "full", .fn = afr_sh_algo_full},
- {.name = "diff", .fn = afr_sh_algo_diff},
- {0, 0},
-};
diff --git a/xlators/cluster/afr/src/afr-self-heal-algorithm.h b/xlators/cluster/afr/src/afr-self-heal-algorithm.h
deleted file mode 100644
index 6b20789b1..000000000
--- a/xlators/cluster/afr/src/afr-self-heal-algorithm.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef __AFR_SELF_HEAL_ALGORITHM_H__
-#define __AFR_SELF_HEAL_ALGORITHM_H__
-
-typedef int (*afr_sh_algo_fn) (call_frame_t *frame,
- xlator_t *this);
-
-struct afr_sh_algorithm {
- const char *name;
- afr_sh_algo_fn fn;
-};
-
-extern struct afr_sh_algorithm afr_self_heal_algorithms[3];
-typedef struct {
- gf_lock_t lock;
- unsigned int loops_running;
- off_t offset;
-
- int32_t total_blocks;
- int32_t diff_blocks;
-} afr_sh_algo_private_t;
-
-#endif /* __AFR_SELF_HEAL_ALGORITHM_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index b6593a7ec..4dac83113 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,2517 +8,1002 @@
cases as published by the Free Software Foundation.
*/
-#include "glusterfs.h"
-#include "xlator.h"
-#include "byte-order.h"
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
#include "afr.h"
-#include "afr-transaction.h"
-#include "afr-self-heal-common.h"
#include "afr-self-heal.h"
-#include "pump.h"
+#include "byte-order.h"
-void
-afr_sh_reset (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- memset (sh->child_errno, 0,
- sizeof (*sh->child_errno) * priv->child_count);
- memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count);
- memset (sh->parentbufs, 0,
- sizeof (*sh->parentbufs) * priv->child_count);
- memset (sh->success, 0, sizeof (*sh->success) * priv->child_count);
- memset (sh->locked_nodes, 0,
- sizeof (*sh->locked_nodes) * priv->child_count);
- sh->active_sinks = 0;
-
- afr_reset_xattr (sh->xattr, priv->child_count);
-}
-//Intersection[child]=1 if child is part of intersection
-void
-afr_children_intersection_get (int32_t *set1, int32_t *set2,
- int *intersection, unsigned int child_count)
+int
+afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
{
- int i = 0;
-
- memset (intersection, 0, sizeof (*intersection) * child_count);
- for (i = 0; i < child_count; i++) {
- intersection[i] = afr_is_child_present (set1, child_count, i)
- && afr_is_child_present (set2, child_count,
- i);
- }
-}
+ afr_local_t *local = NULL;
-/**
- * select_source - select a source and return it
- */
+ local = frame->local;
-int
-afr_sh_select_source (int sources[], int child_count)
-{
- int i = 0;
- for (i = 0; i < child_count; i++)
- if (sources[i])
- return i;
+ syncbarrier_wake (&local->barrier);
- return -1;
+ return 0;
}
-void
-afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this)
-{
- int i = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int active_sinks = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->sources[i] == 0 && local->child_up[i] == 1) {
- active_sinks++;
- sh->success[i] = 1;
- } else if (sh->sources[i] == 1 && local->child_up[i] == 1) {
- sh->success[i] = 1;
- }
- }
- sh->active_sinks = active_sinks;
-}
int
-afr_sh_source_count (int sources[], int child_count)
+afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int subvol, dict_t *xattr)
{
- int i = 0;
- int nsource = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ loc_t loc = {0, };
- for (i = 0; i < child_count; i++)
- if (sources[i])
- nsource++;
- return nsource;
-}
+ priv = this->private;
+ local = frame->local;
-void
-afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno)
-{
- sh->op_ret = -1;
- if (afr_error_more_important (sh->op_errno, op_errno))
- sh->op_errno = op_errno;
-}
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
-void
-afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this)
-{
- afr_private_t * priv = this->private;
- char *buf = NULL;
- char *ptr = NULL;
- int i = 0;
- int j = 0;
-
- /* 10 digits per entry + 1 space + '[' and ']' */
- buf = GF_MALLOC (priv->child_count * 11 + 8, gf_afr_mt_char);
-
- for (i = 0; i < priv->child_count; i++) {
- ptr = buf;
- ptr += sprintf (ptr, "[ ");
- for (j = 0; j < priv->child_count; j++) {
- ptr += sprintf (ptr, "%d ", pending_matrix[i][j]);
- }
- sprintf (ptr, "]");
- gf_log (this->name, GF_LOG_DEBUG, "pending_matrix: %s", buf);
- }
-
- GF_FREE (buf);
-}
+ STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol],
+ priv->children[subvol]->fops->xattrop, &loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL);
-void
-afr_init_pending_matrix (int32_t **pending_matrix, size_t child_count)
-{
- int i = 0;
- int j = 0;
-
- GF_ASSERT (pending_matrix);
+ syncbarrier_wait (&local->barrier, 1);
- for (i = 0; i < child_count; i++) {
- for (j = 0; j < child_count; j++) {
- pending_matrix[i][j] = 0;
- }
- }
+ return 0;
}
-void
-afr_mark_ignorant_subvols_as_pending (int32_t **pending_matrix,
- unsigned char *ignorant_subvols,
- size_t child_count)
-{
- int i = 0;
- int j = 0;
-
- GF_ASSERT (pending_matrix);
- GF_ASSERT (ignorant_subvols);
-
- for (i = 0; i < child_count; i++) {
- if (ignorant_subvols[i]) {
- for (j = 0; j < child_count; j++) {
- if (!ignorant_subvols[j])
- pending_matrix[j][i] += 1;
- }
- }
- }
-}
-int
-afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix,
- unsigned char *ignorant_subvols,
- dict_t *xattr[], afr_transaction_type type,
- size_t child_count)
+dict_t *
+afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type,
+ int *output_dirty, int **output_matrix, int subvol)
{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
- int k = 0;
-
- afr_init_pending_matrix (pending_matrix, child_count);
-
- for (i = 0; i < child_count; i++) {
- pending_raw = NULL;
-
- for (j = 0; j < child_count; j++) {
- ret = dict_get_ptr (xattr[i], pending_key[j],
- &pending_raw);
-
- if (ret != 0) {
- /*
- * There is no xattr present. This means this
- * subvolume should be considered an 'ignorant'
- * subvolume.
- */
-
- if (ignorant_subvols)
- ignorant_subvols[i] = 1;
- continue;
- }
-
- memcpy (pending, pending_raw, sizeof(pending));
- k = afr_index_for_transaction_type (type);
-
- pending_matrix[i][j] = ntoh32 (pending[k]);
- }
- }
-
- return ret;
-}
+ dict_t *xattr = NULL;
+ afr_private_t *priv = NULL;
+ int j = 0;
+ int idx = 0;
+ int ret = 0;
+ int *raw = 0;
-typedef enum {
- AFR_NODE_INVALID,
- AFR_NODE_INNOCENT,
- AFR_NODE_FOOL,
- AFR_NODE_WISE,
-} afr_node_type;
+ priv = this->private;
+ idx = afr_index_for_transaction_type (type);
-typedef struct {
- afr_node_type type;
- int wisdom;
-} afr_node_character;
+ xattr = dict_new ();
+ if (!xattr)
+ return NULL;
+ if (output_dirty[subvol]) {
+ /* clear dirty */
+ raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
+ if (!raw)
+ goto err;
-static int
-afr_sh_is_innocent (int32_t *array, int child_count)
-{
- int i = 0;
- int ret = 1; /* innocent until proven guilty */
-
- for (i = 0; i < child_count; i++) {
- if (array[i]) {
- ret = 0;
- break;
- }
- }
+ raw[idx] = hton32 (output_dirty[subvol]);
+ ret = dict_set_bin (xattr, AFR_DIRTY, raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret)
+ goto err;
+ }
- return ret;
-}
+ /* clear/set pending */
+ for (j = 0; j < priv->child_count; j++) {
+ if (!output_matrix[subvol][j])
+ continue;
+ raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS,
+ gf_afr_mt_int32_t);
+ if (!raw)
+ goto err;
-static int
-afr_sh_is_fool (int32_t *array, int i, int child_count)
-{
- return array[i]; /* fool if accuses itself */
-}
+ raw[idx] = hton32 (output_matrix[subvol][j]);
+ ret = dict_set_bin (xattr, priv->pending_key[j],
+ raw, sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret)
+ goto err;
+ }
-static int
-afr_sh_is_wise (int32_t *array, int i, int child_count)
-{
- return !array[i]; /* wise if does not accuse itself */
+ return xattr;
+err:
+ if (xattr)
+ dict_unref (xattr);
+ return NULL;
}
-static int
-afr_sh_all_nodes_innocent (afr_node_character *characters,
- int child_count)
-{
- int i = 0;
- int ret = 1;
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_INNOCENT) {
- ret = 0;
- break;
- }
- }
-
- return ret;
+int
+afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks, afr_transaction_type type,
+ struct afr_reply *replies, unsigned char *locked_on)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int j = 0;
+ unsigned char *pending = NULL;
+ int *input_dirty = NULL;
+ int **input_matrix = NULL;
+ int *output_dirty = NULL;
+ int **output_matrix = NULL;
+ dict_t *xattr = NULL;
+
+ priv = this->private;
+
+ pending = alloca0 (priv->child_count);
+
+ input_dirty = alloca0 (priv->child_count * sizeof (int));
+ input_matrix = ALLOC_MATRIX (priv->child_count, int);
+ output_dirty = alloca0 (priv->child_count * sizeof (int));
+ output_matrix = ALLOC_MATRIX (priv->child_count, int);
+
+ afr_selfheal_extract_xattr (this, replies, type, input_dirty,
+ input_matrix);
+
+ for (i = 0; i < priv->child_count; i++)
+ if (sinks[i] && !healed_sinks[i])
+ pending[i] = 1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (pending[j])
+ output_matrix[i][j] = 1;
+ else
+ output_matrix[i][j] = -input_matrix[i][j];
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!pending[i])
+ output_dirty[i] = -input_dirty[i];
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!locked_on[i])
+ /* perform post-op only on subvols we had locked
+ and inspected on.
+ */
+ continue;
+
+ xattr = afr_selfheal_output_xattr (this, type, output_dirty,
+ output_matrix, i);
+ if (!xattr) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "unable to allocate xdata for subvol %d", i);
+ continue;
+ }
+
+ afr_selfheal_post_op (frame, this, inode, i, xattr);
+
+ dict_unref (xattr);
+ }
+
+ return 0;
}
-static int
-afr_sh_wise_nodes_exist (afr_node_character *characters, int child_count)
-{
- int i = 0;
- int ret = 0;
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_WISE) {
- ret = 1;
- break;
- }
- }
-
- return ret;
+void
+afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count)
+{
+ int i = 0;
+ dict_t *xdata = NULL;
+
+ if (dst == src)
+ return;
+
+ for (i = 0; i < count; i++) {
+ dst[i].valid = src[i].valid;
+ dst[i].op_ret = src[i].op_ret;
+ dst[i].op_errno = src[i].op_errno;
+ dst[i].prestat = src[i].prestat;
+ dst[i].poststat = src[i].poststat;
+ dst[i].preparent = src[i].preparent;
+ dst[i].postparent = src[i].postparent;
+ dst[i].preparent2 = src[i].preparent2;
+ dst[i].postparent2 = src[i].postparent2;
+ if (src[i].xdata)
+ xdata = dict_ref (src[i].xdata);
+ else
+ xdata = NULL;
+ if (dst[i].xdata)
+ dict_unref (dst[i].xdata);
+ dst[i].xdata = xdata;
+ memcpy (dst[i].checksum, src[i].checksum,
+ MD5_DIGEST_LENGTH);
+ }
}
-/*
- * The 'wisdom' of a wise node is 0 if any other wise node accuses it.
- * It is 1 if no other wise node accuses it.
- * Only wise nodes with wisdom 1 are sources.
- *
- * If no nodes with wisdom 1 exist, a split-brain has occurred.
- */
-
-static void
-afr_sh_compute_wisdom (int32_t *pending_matrix[],
- afr_node_character characters[], int child_count)
+int
+afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol,
+ int idx, dict_t *xdata)
{
- int i = 0;
- int j = 0;
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type == AFR_NODE_WISE) {
- characters[i].wisdom = 1;
-
- for (j = 0; j < child_count; j++) {
- if ((characters[j].type == AFR_NODE_WISE)
- && pending_matrix[j][i]) {
-
- characters[i].wisdom = 0;
- }
- }
- }
- }
-}
+ void *pending_raw = NULL;
+ int pending[3] = {0, };
+ if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw))
+ return -1;
-static int
-afr_sh_wise_nodes_conflict (afr_node_character *characters,
- int child_count)
-{
- int i = 0;
- int ret = 1;
+ if (!pending_raw)
+ return -1;
- for (i = 0; i < child_count; i++) {
- if ((characters[i].type == AFR_NODE_WISE)
- && characters[i].wisdom == 1) {
+ memcpy (pending, pending_raw, sizeof(pending));
- /* There is atleast one bona-fide wise node */
- ret = 0;
- break;
- }
- }
+ dirty[subvol] = ntoh32 (pending[idx]);
- return ret;
+ return 0;
}
-static int
-afr_sh_mark_wisest_as_sources (int sources[],
- afr_node_character *characters,
- int child_count)
-{
- int nsources = 0;
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].wisdom == 1) {
- sources[i] = 1;
- nsources++;
- }
- }
-
- return nsources;
-}
-
-static void
-afr_compute_witness_of_fools (int32_t *witnesses, int32_t **pending_matrix,
- afr_node_character *characters,
- int32_t child_count)
+int
+afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol,
+ int idx, dict_t *xdata)
{
- int i = 0;
- int j = 0;
- int witness = 0;
-
- GF_ASSERT (witnesses);
- GF_ASSERT (pending_matrix);
- GF_ASSERT (characters);
- GF_ASSERT (child_count > 0);
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_FOOL)
- continue;
-
- witness = 0;
- for (j = 0; j < child_count; j++) {
- if (i == j)
- continue;
- witness += pending_matrix[i][j];
- }
- witnesses[i] = witness;
- }
-}
+ int i = 0;
+ void *pending_raw = NULL;
+ int pending[3] = {0, };
+ afr_private_t *priv = NULL;
-static int32_t
-afr_find_biggest_witness_among_fools (int32_t *witnesses,
- afr_node_character *characters,
- int32_t child_count)
-{
- int i = 0;
- int biggest_witness = -1;
+ priv = this->private;
- GF_ASSERT (witnesses);
- GF_ASSERT (characters);
- GF_ASSERT (child_count > 0);
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw))
+ continue;
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_FOOL)
- continue;
+ if (!pending_raw)
+ continue;
- if (biggest_witness < witnesses[i])
- biggest_witness = witnesses[i];
- }
- return biggest_witness;
-}
+ memcpy (pending, pending_raw, sizeof(pending));
-int
-afr_mark_fool_as_source_by_witness (int32_t *sources, int32_t *witnesses,
- afr_node_character *characters,
- int32_t child_count, int32_t witness)
-{
- int i = 0;
- int nsources = 0;
-
- GF_ASSERT (sources);
- GF_ASSERT (witnesses);
- GF_ASSERT (characters);
- GF_ASSERT (child_count > 0);
-
- for (i = 0; i < child_count; i++) {
- if (characters[i].type != AFR_NODE_FOOL)
- continue;
-
- if (witness == witnesses[i]) {
- sources[i] = 1;
- nsources++;
- }
- }
- return nsources;
-}
+ matrix[subvol][i] = ntoh32 (pending[idx]);
+ }
-static int
-afr_mark_biggest_of_fools_as_source (int32_t *sources, int32_t **pending_matrix,
- afr_node_character *characters,
- int child_count)
-{
- int32_t biggest_witness = 0;
- int nsources = 0;
- int32_t *witnesses = NULL;
-
- GF_ASSERT (child_count > 0);
-
- witnesses = GF_CALLOC (child_count, sizeof (*witnesses),
- gf_afr_mt_int32_t);
- if (NULL == witnesses) {
- nsources = -1;
- goto out;
- }
-
- afr_compute_witness_of_fools (witnesses, pending_matrix, characters,
- child_count);
- biggest_witness = afr_find_biggest_witness_among_fools (witnesses,
- characters,
- child_count);
- nsources = afr_mark_fool_as_source_by_witness (sources, witnesses,
- characters, child_count,
- biggest_witness);
-out:
- GF_FREE (witnesses);
- return nsources;
+ return 0;
}
-int
-afr_mark_child_as_source_by_uid (int32_t *sources, struct iatt *bufs,
- int32_t *success_children,
- unsigned int child_count, uint32_t uid)
-{
- int i = 0;
- int nsources = 0;
- int child = 0;
-
- for (i = 0; i < child_count; i++) {
- if (-1 == success_children[i])
- break;
-
- child = success_children[i];
- if (uid == bufs[child].ia_uid) {
- sources[child] = 1;
- nsources++;
- }
- }
- return nsources;
-}
int
-afr_get_child_with_lowest_uid (struct iatt *bufs, int32_t *success_children,
- unsigned int child_count)
+afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type, int *dirty, int **matrix)
{
- int i = 0;
- int smallest = -1;
- int child = 0;
-
- for (i = 0; i < child_count; i++) {
- if (-1 == success_children[i])
- break;
- child = success_children[i];
- if ((smallest == -1) ||
- (bufs[child].ia_uid < bufs[smallest].ia_uid)) {
- smallest = child;
- }
- }
- return smallest;
-}
+ afr_private_t *priv = NULL;
+ int i = 0;
+ dict_t *xdata = NULL;
+ int idx = -1;
-static int
-afr_sh_mark_lowest_uid_as_source (struct iatt *bufs, int32_t *success_children,
- int child_count, int32_t *sources)
-{
- int nsources = 0;
- int smallest = 0;
-
- smallest = afr_get_child_with_lowest_uid (bufs, success_children,
- child_count);
- if (smallest < 0) {
- nsources = -1;
- goto out;
- }
- nsources = afr_mark_child_as_source_by_uid (sources, bufs,
- success_children, child_count,
- bufs[smallest].ia_uid);
-out:
- return nsources;
-}
-
-int
-afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children,
- struct iatt *bufs)
-{
- afr_private_t *priv = NULL;
- int i = 0;
- int child = -1;
- int read_child = -1;
-
- priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- child = success_children[i];
- if (child < 0)
- break;
- if (read_child < 0)
- read_child = child;
- else if (bufs[read_child].ia_size < bufs[child].ia_size)
- read_child = child;
- }
- return read_child;
-}
+ idx = afr_index_for_transaction_type (type);
-int
-afr_sh_mark_zero_size_file_as_sink (struct iatt *bufs, int32_t *success_children,
- int child_count, int32_t *sources)
-{
- int nsources = 0;
- int i = 0;
- int child = 0;
- gf_boolean_t sink_exists = _gf_false;
- gf_boolean_t source_exists = _gf_false;
- int source = -1;
-
- for (i = 0; i < child_count; i++) {
- child = success_children[i];
- if (child < 0)
- break;
- if (!bufs[child].ia_size) {
- sink_exists = _gf_true;
- continue;
- }
- if (!source_exists) {
- source_exists = _gf_true;
- source = child;
- continue;
- }
- if (bufs[source].ia_size != bufs[child].ia_size) {
- nsources = -1;
- goto out;
- }
- }
- if (!source_exists && !sink_exists) {
- nsources = -1;
- goto out;
- }
-
- if (!source_exists || !sink_exists)
- goto out;
-
- for (i = 0; i < child_count; i++) {
- child = success_children[i];
- if (child < 0)
- break;
- if (bufs[child].ia_size) {
- sources[child] = 1;
- nsources++;
- }
- }
-out:
- return nsources;
-}
+ priv = this->private;
-char *
-afr_get_character_str (afr_node_type type)
-{
- char *character = NULL;
-
- switch (type) {
- case AFR_NODE_INNOCENT:
- character = "innocent";
- break;
- case AFR_NODE_FOOL:
- character = "fool";
- break;
- case AFR_NODE_WISE:
- character = "wise";
- break;
- default:
- character = "invalid";
- break;
- }
- return character;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].xdata)
+ continue;
-afr_node_type
-afr_find_child_character_type (int32_t *pending_row, int32_t child,
- unsigned int child_count)
-{
- afr_node_type type = AFR_NODE_INVALID;
+ xdata = replies[i].xdata;
- GF_ASSERT ((child >= 0) && (child < child_count));
+ afr_selfheal_fill_dirty (this, dirty, i, idx, xdata);
+ afr_selfheal_fill_matrix (this, matrix, i, idx, xdata);
+ }
- if (afr_sh_is_innocent (pending_row, child_count))
- type = AFR_NODE_INNOCENT;
- else if (afr_sh_is_fool (pending_row, child, child_count))
- type = AFR_NODE_FOOL;
- else if (afr_sh_is_wise (pending_row, child, child_count))
- type = AFR_NODE_WISE;
- return type;
+ return 0;
}
-int
-afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs,
- int32_t **pending_matrix, int32_t *sources,
- int32_t *success_children, afr_transaction_type type,
- int32_t *subvol_status, gf_boolean_t ignore_ignorant)
-{
- afr_private_t *priv = NULL;
- afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID;
- int nsources = -1;
- unsigned char *ignorant_subvols = NULL;
- unsigned int child_count = 0;
-
- priv = this->private;
- child_count = priv->child_count;
-
- if (afr_get_children_count (success_children, priv->child_count) == 0)
- goto out;
-
- if (!ignore_ignorant) {
- ignorant_subvols = GF_CALLOC (sizeof (*ignorant_subvols),
- child_count, gf_afr_mt_char);
- if (NULL == ignorant_subvols)
- goto out;
- }
-
- afr_build_pending_matrix (priv->pending_key, pending_matrix,
- ignorant_subvols, xattr, type,
- priv->child_count);
-
- if (!ignore_ignorant)
- afr_mark_ignorant_subvols_as_pending (pending_matrix,
- ignorant_subvols,
- priv->child_count);
- sh_type = afr_self_heal_type_for_transaction (type);
- if (AFR_SELF_HEAL_INVALID == sh_type)
- goto out;
-
- afr_sh_print_pending_matrix (pending_matrix, this);
-
- nsources = afr_mark_sources (this, sources, pending_matrix, bufs,
- sh_type, success_children, subvol_status);
-out:
- GF_FREE (ignorant_subvols);
- return nsources;
-}
-void
-afr_find_character_types (afr_node_character *characters,
- int32_t **pending_matrix, int32_t *success_children,
- unsigned int child_count)
-{
- afr_node_type type = AFR_NODE_INVALID;
- int child = 0;
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- child = success_children[i];
- if (child == -1)
- break;
- type = afr_find_child_character_type (pending_matrix[child],
- child, child_count);
- characters[child].type = type;
- }
-}
-void
-afr_mark_success_children_sources (int32_t *sources, int32_t *success_children,
- unsigned int child_count)
-{
- int i = 0;
- for (i = 0; i < child_count; i++) {
- if (success_children[i] == -1)
- break;
- sources[success_children[i]] = 1;
- }
-}
-/**
- * mark_sources: Mark all 'source' nodes and return number of source
- * nodes found
+/*
+ * This function determines if a self-heal is required for a given inode,
+ * and if needed, in what direction.
*
- * A node (a row in the pending matrix) belongs to one of
- * three categories:
+ * locked_on[] is the array representing servers which have been locked and
+ * from which xattrs have been fetched for analysis.
*
- * M is the pending matrix.
+ * The output of the function is by filling the arrays sources[] and sinks[].
*
- * 'innocent' - M[i] is all zeroes
- * 'fool' - M[i] has i'th element = 1 (self-reference)
- * 'wise' - M[i] has i'th element = 0, others are 1 or 0.
+ * sources[i] is set if i'th server is an eligible source for a selfheal.
*
- * All 'innocent' nodes are sinks. If all nodes are innocent, no self-heal is
- * needed.
+ * sinks[i] is set if i'th server needs to be healed.
*
- * A 'wise' node can be a source. If two 'wise' nodes conflict, it is
- * a split-brain. If one wise node refers to the other but the other doesn't
- * refer back, the referrer is a source.
+ * if sources[0..N] are all set, there is no need for a selfheal.
+ *
+ * if sinks[0..N] are all set, the inode is in split brain.
*
- * All fools are sinks, unless there are no 'wise' nodes. In that case,
- * one of the fools is made a source.
*/
int
-afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix,
- struct iatt *bufs, afr_self_heal_type type,
- int32_t *success_children, int32_t *subvol_status)
-{
- /* stores the 'characters' (innocent, fool, wise) of the nodes */
- afr_node_character *characters = NULL;
- int nsources = -1;
- unsigned int child_count = 0;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- child_count = priv->child_count;
- characters = GF_CALLOC (sizeof (afr_node_character),
- child_count, gf_afr_mt_afr_node_character);
- if (!characters)
- goto out;
-
- this = THIS;
-
- /* start clean */
- memset (sources, 0, sizeof (*sources) * child_count);
- nsources = 0;
- afr_find_character_types (characters, pending_matrix, success_children,
- child_count);
- if (afr_sh_all_nodes_innocent (characters, child_count)) {
- switch (type) {
- case AFR_SELF_HEAL_METADATA:
- nsources = afr_sh_mark_lowest_uid_as_source (bufs,
- success_children,
- child_count,
- sources);
- break;
- case AFR_SELF_HEAL_DATA:
- nsources = afr_sh_mark_zero_size_file_as_sink (bufs,
- success_children,
- child_count,
- sources);
- if ((nsources < 0) && subvol_status)
- *subvol_status |= SPLIT_BRAIN;
- break;
- default:
- break;
- }
- goto out;
- }
-
- if (afr_sh_wise_nodes_exist (characters, child_count)) {
- afr_sh_compute_wisdom (pending_matrix, characters, child_count);
-
- if (afr_sh_wise_nodes_conflict (characters, child_count)) {
- if (subvol_status)
- *subvol_status |= SPLIT_BRAIN;
- nsources = -1;
- } else {
- nsources = afr_sh_mark_wisest_as_sources (sources,
- characters,
- child_count);
- }
- } else {
- if (subvol_status)
- *subvol_status |= ALL_FOOLS;
- nsources = afr_mark_biggest_of_fools_as_source (sources,
- pending_matrix,
- characters,
- child_count);
- }
-
-out:
- if (nsources == 0)
- afr_mark_success_children_sources (sources, success_children,
- child_count);
- GF_FREE (characters);
-
- gf_log (this->name, GF_LOG_DEBUG, "Number of sources: %d", nsources);
- return nsources;
-}
-
-void
-afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
- int32_t *delta_matrix[], unsigned char success[],
- int child_count, afr_transaction_type type)
-{
- int i = 0;
- int j = 0;
-
- afr_build_pending_matrix (priv->pending_key, delta_matrix, NULL,
- xattr, type, priv->child_count);
- for (i = 0; i < priv->child_count; i++)
- for (j = 0; j < priv->child_count; j++)
- delta_matrix[i][j] = -delta_matrix[i][j];
+afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ afr_transaction_type type, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int j = 0;
+ int *dirty = NULL;
+ int **matrix = NULL;
+ char *accused = NULL;
+
+ priv = this->private;
+
+ dirty = alloca0 (priv->child_count * sizeof (int));
+ accused = alloca0 (priv->child_count);
+ matrix = ALLOC_MATRIX(priv->child_count, int);
+
+ /* First construct the pending matrix for further analysis */
+ afr_selfheal_extract_xattr (this, replies, type, dirty, matrix);
+
+ /* Next short list all accused to exclude them from being sources */
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (matrix[i][j])
+ accused[j] = 1;
+ }
+ }
+
+ /* Short list all non-accused as sources */
+ memset (sources, 0, priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!accused[i] && locked_on[i])
+ sources[i] = 1;
+ }
+
+ /* Everyone accused by sources are sinks */
+ memset (sinks, 0, priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ for (j = 0; j < priv->child_count; j++) {
+ if (matrix[i][j])
+ sinks[j] = 1;
+ }
+ }
+
+ /* If any source has 'dirty' bit, pick first
+ 'dirty' source and make everybody else sinks */
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i] && dirty[i]) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (j != i) {
+ sources[j] = 0;
+ sinks[j] = 1;
+ }
+ }
+ break;
+ }
+ }
+
+ /* If no sources, all locked nodes are sinks - split brain */
+ if (AFR_COUNT (sources, priv->child_count) == 0) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i])
+ sinks[i] = 1;
+ }
+ }
+
+ return 0;
}
int
-afr_sh_delta_to_xattr (xlator_t *this,
- int32_t *delta_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type)
+afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *parbuf)
{
- int i = 0;
- int j = 0;
- int k = 0;
- int ret = 0;
- int32_t *pending = NULL;
- int32_t *local_pending = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- for (i = 0; i < child_count; i++) {
- if (!xattr[i])
- continue;
-
- local_pending = NULL;
- for (j = 0; j < child_count; j++) {
- pending = GF_CALLOC (sizeof (int32_t), 3,
- gf_afr_mt_int32_t);
-
- if (!pending)
- continue;
- /* 3 = data+metadata+entry */
-
- k = afr_index_for_transaction_type (type);
-
- pending[k] = hton32 (delta_matrix[i][j]);
-
- if (j == i) {
- local_pending = pending;
- continue;
- }
- ret = dict_set_bin (xattr[i], priv->pending_key[j],
- pending,
- AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value.");
- GF_FREE (pending);
- }
- }
- if (local_pending) {
- ret = dict_set_bin (xattr[i], priv->pending_key[i],
- local_pending,
- AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value.");
- GF_FREE (local_pending);
- }
- }
- }
- return 0;
-}
-
+ afr_local_t *local = NULL;
+ int i = -1;
-int
-afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this)
-{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- afr_private_t *priv = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
+ local = frame->local;
+ i = (long) cookie;
- priv = this->private;
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (buf)
+ local->replies[i].poststat = *buf;
+ if (parbuf)
+ local->replies[i].postparent = *parbuf;
+ if (xdata)
+ local->replies[i].xdata = dict_ref (xdata);
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &pending_raw);
+ syncbarrier_wake (&local->barrier);
- if (ret != 0)
- return 0;
+ return 0;
+}
- memcpy (pending, pending_raw, sizeof(pending));
- j = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION);
- if (pending[j])
- return 1;
- }
+inode_t *
+afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
+ const char *name, struct afr_reply *replies,
+ unsigned char *lookup_on)
+{
+ loc_t loc = {0, };
+ dict_t *xattr_req = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
- return 0;
-}
+ local = frame->local;
+ priv = frame->this->private;
+ xattr_req = dict_new ();
+ if (!xattr_req)
+ return NULL;
-int
-afr_sh_has_data_pending (dict_t *xattr, xlator_t *this)
-{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- afr_private_t *priv = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
+ if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) {
+ dict_destroy (xattr_req);
+ return NULL;
+ }
- priv = this->private;
+ inode = inode_new (parent->table);
+ if (!inode) {
+ dict_destroy (xattr_req);
+ return NULL;
+ }
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &pending_raw);
+ loc.parent = inode_ref (parent);
+ uuid_copy (loc.pargfid, parent->gfid);
+ loc.name = name;
+ loc.inode = inode_ref (inode);
- if (ret != 0)
- return 0;
+ AFR_ONLIST (lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xattr_req);
- memcpy (pending, pending_raw, sizeof(pending));
- j = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
+ afr_replies_copy (replies, local->replies, priv->child_count);
- if (pending[j])
- return 1;
- }
+ loc_wipe (&loc);
+ dict_unref (xattr_req);
- return 0;
+ return inode;
}
int
-afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this)
+afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies,
+ unsigned char *discover_on)
{
- /* Indexable by result of afr_index_for_transaction_type(): 0 -- 2. */
- int32_t pending[3] = {0,};
- void *pending_raw = NULL;
- afr_private_t *priv = NULL;
- int ret = -1;
- int i = 0;
- int j = 0;
+ loc_t loc = {0, };
+ dict_t *xattr_req = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = frame->this->private;
- priv = this->private;
+ xattr_req = dict_new ();
+ if (!xattr_req)
+ return -ENOMEM;
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xattr, priv->pending_key[i],
- &pending_raw);
+ if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) {
+ dict_destroy (xattr_req);
+ return -ENOMEM;
+ }
- if (ret != 0)
- return 0;
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, gfid);
- memcpy (pending, pending_raw, sizeof(pending));
- j = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION);
+ AFR_ONLIST (discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xattr_req);
- if (pending[j])
- return 1;
- }
+ afr_replies_copy (replies, local->replies, priv->child_count);
- return 0;
+ loc_wipe (&loc);
+ dict_unref (xattr_req);
+
+ return 0;
}
int
-afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this)
+afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- afr_sh_reset (frame, this);
-
- if (local->govinda_gOvinda) {
- gf_log (this->name, GF_LOG_DEBUG,
- "split brain found, aborting selfheal of %s",
- local->loc.path);
- sh->op_failed = 1;
- }
-
- if (sh->op_failed) {
- sh->completion_cbk (frame, this);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to metadata check on %s",
- local->loc.path);
- afr_self_heal_metadata (frame, this);
- }
-
- return 0;
+ afr_private_t *priv = NULL;
+
+ priv = frame->this->private;
+
+ return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies,
+ priv->child_up);
}
-static int
-afr_sh_missing_entries_finish (call_frame_t *frame, xlator_t *this)
+int
+afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ i = (long) cookie;
- int_lock->lock_cbk = afr_sh_missing_entries_done;
- afr_unlock (frame, this);
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
- return 0;
+ syncbarrier_wake (&local->barrier);
+
+ return 0;
}
+
int
-afr_sh_common_create (afr_self_heal_t *sh, unsigned int child_count)
+afr_selfheal_locked_fill (call_frame_t *frame, xlator_t *this,
+ unsigned char *locked_on)
{
- int ret = -ENOMEM;
- sh->buf = GF_CALLOC (child_count, sizeof (*sh->buf),
- gf_afr_mt_iatt);
- if (!sh->buf)
- goto out;
- sh->parentbufs = GF_CALLOC (child_count, sizeof (*sh->parentbufs),
- gf_afr_mt_iatt);
- if (!sh->parentbufs)
- goto out;
- sh->child_errno = GF_CALLOC (child_count, sizeof (*sh->child_errno),
- gf_afr_mt_int);
- if (!sh->child_errno)
- goto out;
- sh->success_children = afr_children_create (child_count);
- if (!sh->success_children)
- goto out;
- sh->fresh_children = afr_children_create (child_count);
- if (!sh->fresh_children)
- goto out;
- sh->xattr = GF_CALLOC (child_count, sizeof (*sh->xattr),
- gf_afr_mt_dict_t);
- if (!sh->xattr)
- goto out;
- ret = 0;
-out:
- return ret;
-}
+ int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int count = 0;
-void
-afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- dict_t *xattr, struct iatt *postparent,
- loc_t *loc)
-{
- int child_index = 0;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == 0) {
- sh->buf[child_index] = *buf;
- sh->parentbufs[child_index] = *postparent;
- sh->success_children[sh->success_count] = child_index;
- sh->success_count++;
- sh->xattr[child_index] = dict_ref (xattr);
- } else {
- gf_log (this->name, GF_LOG_DEBUG, "path %s on subvolume"
- " %s => -1 (%s)", loc->path,
- priv->children[child_index]->name,
- strerror (op_errno));
- local->self_heal.child_errno[child_index] = op_errno;
- }
- }
- UNLOCK (&frame->lock);
- return;
-}
+ local = frame->local;
+ priv = this->private;
-gf_boolean_t
-afr_valid_ia_type (ia_type_t ia_type)
-{
- switch (ia_type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- case IA_IFLNK:
- case IA_IFDIR:
- return _gf_true;
- default:
- return _gf_false;
- }
- return _gf_false;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].op_ret == 0) {
+ locked_on[i] = 1;
+ count++;
+ } else {
+ locked_on[i] = 0;
+ }
+ }
-int
-afr_impunge_frame_create (call_frame_t *frame, xlator_t *this,
- int active_source, call_frame_t **impunge_frame)
-{
- afr_local_t *local = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int32_t op_errno = 0;
- afr_private_t *priv = NULL;
- int ret = 0;
- call_frame_t *new_frame = NULL;
-
- op_errno = ENOMEM;
- priv = this->private;
- new_frame = copy_frame (frame);
- if (!new_frame) {
- goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (impunge_local, out);
-
- local = frame->local;
- new_frame->local = impunge_local;
- impunge_sh = &impunge_local->self_heal;
- impunge_sh->sh_frame = frame;
- impunge_sh->active_source = active_source;
- impunge_local->child_up = memdup (local->child_up,
- sizeof (*local->child_up) *
- priv->child_count);
- if (!impunge_local->child_up)
- goto out;
-
- impunge_local->pending = afr_matrix_create (priv->child_count,
- AFR_NUM_CHANGE_LOGS);
- if (!impunge_local->pending)
- goto out;
-
- ret = afr_sh_common_create (impunge_sh, priv->child_count);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
- op_errno = 0;
- *impunge_frame = new_frame;
-out:
- if (op_errno && new_frame)
- AFR_STACK_DESTROY (new_frame);
- return -op_errno;
+ return count;
}
-void
-afr_sh_missing_entry_call_impunge_recreate (call_frame_t *frame, xlator_t *this,
- struct iatt *buf,
- struct iatt *postparent,
- afr_impunge_done_cbk_t impunge_done)
-{
- call_frame_t *impunge_frame = NULL;
- afr_local_t *local = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int ret = 0;
- unsigned int enoent_count = 0;
- afr_private_t *priv = NULL;
- int i = 0;
- int32_t op_errno = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- enoent_count = afr_errno_count (NULL, sh->child_errno,
- priv->child_count, ENOENT);
- if (!enoent_count) {
- gf_log (this->name, GF_LOG_INFO,
- "no missing files - %s. proceeding to metadata check",
- local->loc.path);
- goto out;
- }
- sh->impunge_done = impunge_done;
- ret = afr_impunge_frame_create (frame, this, sh->source, &impunge_frame);
- if (ret)
- goto out;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- loc_copy (&impunge_local->loc, &local->loc);
- ret = afr_build_parent_loc (&impunge_sh->parent_loc,
- &impunge_local->loc, &op_errno);
- if (ret) {
- ret = -op_errno;
- goto out;
- }
- impunge_local->call_count = enoent_count;
- impunge_sh->entrybuf = sh->buf[sh->source];
- impunge_sh->parentbuf = sh->parentbufs[sh->source];
- for (i = 0; i < priv->child_count; i++) {
- if (!impunge_local->child_up[i]) {
- impunge_sh->child_errno[i] = ENOTCONN;
- continue;
- }
- if (sh->child_errno[i] != ENOENT) {
- impunge_sh->child_errno[i] = EEXIST;
- continue;
- }
- }
- for (i = 0; i < priv->child_count; i++) {
- if (sh->child_errno[i] != ENOENT)
- continue;
- afr_sh_entry_impunge_create (impunge_frame, this, i);
- enoent_count--;
- }
- GF_ASSERT (!enoent_count);
- return;
-out:
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "impunge of %s failed, "
- "reason: %s", local->loc.path, strerror (-ret));
- sh->op_failed = 1;
- }
- afr_sh_missing_entries_finish (frame, this);
-}
int
-afr_sh_create_entry_cbk (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- if (op_ret < 0)
- sh->op_failed = 1;
- afr_sh_missing_entries_finish (frame, this);
- return 0;
-}
+ loc_t loc = {0,};
+ struct gf_flock flock = {0, };
-static int
-sh_missing_entries_create (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int type = 0;
- struct iatt *buf = NULL;
- struct iatt *postparent = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- buf = &sh->buf[sh->source];
- postparent = &sh->parentbufs[sh->source];
-
- type = buf->ia_type;
- if (!afr_valid_ia_type (type)) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: unknown file type: 0%o", local->loc.path, type);
- local->govinda_gOvinda = 1;
- afr_sh_missing_entries_finish (frame, this);
- goto out;
- }
-
- afr_sh_missing_entry_call_impunge_recreate (frame, this,
- buf, postparent,
- afr_sh_create_entry_cbk);
-out:
- return 0;
-}
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
-void
-afr_sh_missing_entries_lookup_done (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- ia_type_t ia_type = IA_INVAL;
- int32_t nsources = 0;
- loc_t *loc = NULL;
- int32_t subvol_status = 0;
- afr_transaction_type txn_type = AFR_DATA_TRANSACTION;
- gf_boolean_t split_brain = _gf_false;
- int read_child = -1;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
- loc = &local->loc;
-
- if (op_ret < 0) {
- if (op_errno == EIO)
- local->govinda_gOvinda = 1;
- // EIO can happen if finding the fresh parent dir failed
- goto out;
- }
-
- //now No chance for the ia_type to conflict
- ia_type = sh->buf[sh->success_children[0]].ia_type;
- txn_type = afr_transaction_type_get (ia_type);
- nsources = afr_build_sources (this, sh->xattr, sh->buf,
- sh->pending_matrix, sh->sources,
- sh->success_children, txn_type,
- &subvol_status, _gf_false);
- if (nsources < 0) {
- gf_log (this->name, GF_LOG_INFO, "No sources for dir of %s,"
- " in missing entry self-heal, continuing with the rest"
- " of the self-heals", local->loc.path);
- if (subvol_status & SPLIT_BRAIN) {
- split_brain = _gf_true;
- switch (txn_type) {
- case AFR_DATA_TRANSACTION:
- nsources = 1;
- sh->sources[sh->success_children[0]] = 1;
- break;
- case AFR_ENTRY_TRANSACTION:
- read_child = afr_get_no_xattr_dir_read_child
- (this,
- sh->success_children,
- sh->buf);
- sh->sources[read_child] = 1;
- nsources = 1;
- break;
- default:
- op_errno = EIO;
- goto out;
- }
- } else {
- op_errno = EIO;
- goto out;
- }
- }
-
- afr_get_fresh_children (sh->success_children, sh->sources,
- sh->fresh_children, priv->child_count);
- sh->source = sh->fresh_children[0];
- if (sh->source == -1) {
- gf_log (this->name, GF_LOG_DEBUG, "No active sources found.");
- op_errno = EIO;
- goto out;
- }
-
- if (sh->gfid_sh_success_cbk)
- sh->gfid_sh_success_cbk (frame, this);
- sh->type = sh->buf[sh->source].ia_type;
- if (uuid_is_null (loc->inode->gfid))
- uuid_copy (loc->gfid, sh->buf[sh->source].ia_gfid);
- if (split_brain) {
- afr_sh_missing_entries_finish (frame, this);
- } else {
- sh_missing_entries_create (frame, this);
- }
- return;
-out:
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- afr_sh_missing_entries_finish (frame, this);
- return;
-}
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
-static int
-afr_sh_common_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
-{
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_common_lookup_resp_handler (frame, cookie, this, op_ret,
- op_errno, inode, buf, xattr,
- postparent, &sh->lookup_loc);
- call_count = afr_frame_return (frame);
-
- if (call_count)
- goto out;
- op_ret = -1;
- if (!sh->success_count) {
- op_errno = afr_resultant_errno_get (NULL, sh->child_errno,
- priv->child_count);
- gf_log (this->name, GF_LOG_ERROR, "Failed to lookup %s, "
- "reason %s", sh->lookup_loc.path,
- strerror (op_errno));
- goto done;
- }
-
- if ((sh->lookup_flags & AFR_LOOKUP_FAIL_CONFLICTS) &&
- (afr_conflicting_iattrs (sh->buf, sh->success_children,
- priv->child_count,
- sh->lookup_loc.path, this->name))) {
- op_errno = EIO;
- gf_log (this->name, GF_LOG_ERROR, "Conflicting entries "
- "for %s", sh->lookup_loc.path);
- goto done;
- }
-
- if ((sh->lookup_flags & AFR_LOOKUP_FAIL_MISSING_GFIDS) &&
- (afr_gfid_missing_count (this->name, sh->success_children,
- sh->buf, priv->child_count,
- sh->lookup_loc.path))) {
- op_errno = ENODATA;
- gf_log (this->name, GF_LOG_ERROR, "Missing Gfids "
- "for %s", sh->lookup_loc.path);
- goto done;
- }
- op_ret = 0;
-
-done:
- sh->lookup_done (frame, this, op_ret, op_errno);
-out:
- return 0;
-}
+ AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom,
+ &loc, F_SETLK, &flock, NULL);
-int
-afr_sh_remove_entry_cbk (call_frame_t *frame, xlator_t *this, int child,
- int32_t op_ret, int32_t op_errno)
-{
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- GF_ASSERT (sh->post_remove_call);
- if ((op_ret == -1) && (op_errno != ENOENT)) {
- gf_log (this->name, GF_LOG_ERROR,
- "purge entry %s failed, on child %d reason, %s",
- local->loc.path, child, strerror (op_errno));
- LOCK (&frame->lock);
- {
- afr_sh_set_error (sh, EIO);
- sh->op_failed = 1;
- }
- UNLOCK (&frame->lock);
- }
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- sh->post_remove_call (frame, this);
- return 0;
-}
+ loc_wipe (&loc);
-void
-afr_sh_call_entry_expunge_remove (call_frame_t *frame, xlator_t *this,
- int child_index, struct iatt *buf,
- struct iatt *parentbuf,
- afr_expunge_done_cbk_t expunge_done)
-{
- call_frame_t *expunge_frame = NULL;
- afr_local_t *local = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int32_t op_errno = 0;
- int ret = 0;
-
- expunge_frame = copy_frame (frame);
- if (!expunge_frame) {
- goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out);
-
- local = frame->local;
- sh = &local->self_heal;
- expunge_frame->local = expunge_local;
- expunge_sh = &expunge_local->self_heal;
- expunge_sh->sh_frame = frame;
- loc_copy (&expunge_local->loc, &local->loc);
- ret = afr_build_parent_loc (&expunge_sh->parent_loc,
- &expunge_local->loc, &op_errno);
- if (ret) {
- ret = -op_errno;
- goto out;
- }
- sh->expunge_done = expunge_done;
- afr_sh_entry_expunge_remove (expunge_frame, this, child_index, buf,
- parentbuf);
- return;
-out:
- gf_log (this->name, GF_LOG_ERROR, "Expunge of %s failed, reason: %s",
- local->loc.path, strerror (op_errno));
- expunge_done (frame, this, child_index, -1, op_errno);
+ return afr_selfheal_locked_fill (frame, this, locked_on);
}
-void
-afr_sh_remove_stale_lookup_info (afr_self_heal_t *sh, int32_t *success_children,
- int32_t *fresh_children,
- unsigned int child_count)
-{
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- if (afr_is_child_present (success_children, child_count, i) &&
- !afr_is_child_present (fresh_children, child_count, i)) {
- sh->child_errno[i] = ENOENT;
- GF_ASSERT (sh->xattr[i]);
- dict_unref (sh->xattr[i]);
- sh->xattr[i] = NULL;
- }
- }
-}
int
-afr_sh_purge_stale_entries_done (call_frame_t *frame, xlator_t *this)
+afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (sh->op_failed) {
- afr_sh_missing_entries_finish (frame, this);
- } else {
- if (afr_gfid_missing_count (this->name, sh->fresh_children,
- sh->buf, priv->child_count,
- local->loc.path)) {
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_missing_entries_lookup_done,
- sh->sh_gfid_req,
- AFR_LOOKUP_FAIL_CONFLICTS|
- AFR_LOOKUP_FAIL_MISSING_GFIDS,
- NULL);
- } else {
- //No need to set gfid so goto missing entries lookup done
- //Behave as if you have done the lookup
- afr_sh_remove_stale_lookup_info (sh,
- sh->success_children,
- sh->fresh_children,
- priv->child_count);
- afr_children_copy (sh->success_children,
- sh->fresh_children,
- priv->child_count);
- afr_sh_missing_entries_lookup_done (frame, this, 0, 0);
- }
- }
- return 0;
-}
+ loc_t loc = {0,};
+ struct gf_flock flock = {0, };
+ afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
-gf_boolean_t
-afr_sh_purge_entry_condition (afr_local_t *local, afr_private_t *priv,
- int child)
-{
- afr_self_heal_t *sh = NULL;
+ priv = this->private;
+ local = frame->local;
- sh = &local->self_heal;
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- if (local->child_up[child] &&
- (!afr_is_child_present (sh->fresh_parent_dirs, priv->child_count,
- child))
- && (sh->child_errno[child] != ENOENT))
- return _gf_true;
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- return _gf_false;
-}
+ AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom,
+ &loc, F_SETLK, &flock, NULL);
-gf_boolean_t
-afr_sh_purge_stale_entry_condition (afr_local_t *local, afr_private_t *priv,
- int child)
-{
- afr_self_heal_t *sh = NULL;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == -1 &&
+ local->replies[i].op_errno == EAGAIN) {
+ afr_selfheal_locked_fill (frame, this, locked_on);
+ afr_selfheal_uninodelk (frame, this, inode, dom, off,
+ size, locked_on);
- sh = &local->self_heal;
+ AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, dom,
+ &loc, F_SETLKW, &flock, NULL);
+ break;
+ }
+ }
- if (local->child_up[child] &&
- (!afr_is_child_present (sh->fresh_children, priv->child_count,
- child))
- && (sh->child_errno[child] != ENOENT))
- return _gf_true;
+ loc_wipe (&loc);
- return _gf_false;
+ return afr_selfheal_locked_fill (frame, this, locked_on);
}
-void
-afr_sh_purge_entry_common (call_frame_t *frame, xlator_t *this,
- gf_boolean_t purge_condition (afr_local_t *local,
- afr_private_t *priv,
- int child))
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (purge_condition (local, priv, i))
- call_count++;
- }
-
- if (call_count == 0) {
- sh->post_remove_call (frame, this);
- goto out;
- }
-
- local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!purge_condition (local, priv, i))
- continue;
- gf_log (this->name, GF_LOG_INFO, "purging the stale entry %s "
- "on %d", local->loc.path, i);
- afr_sh_call_entry_expunge_remove (frame, this,
- (long) i, &sh->buf[i],
- &sh->parentbufs[i],
- afr_sh_remove_entry_cbk);
- }
-out:
- return;
-}
-void
-afr_sh_purge_entry (call_frame_t *frame, xlator_t *this)
+int
+afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ const unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
+ loc_t loc = {0,};
+ struct gf_flock flock = {0, };
- local = frame->local;
- sh = &local->self_heal;
- sh->post_remove_call = afr_sh_missing_entries_finish;
- afr_sh_purge_entry_common (frame, this, afr_sh_purge_entry_condition);
-}
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
-void
-afr_sh_purge_stale_entry (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
+ flock.l_type = F_UNLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, inodelk,
+ dom, &loc, F_SETLK, &flock, NULL);
- sh->post_remove_call = afr_sh_purge_stale_entries_done;
+ loc_wipe (&loc);
- for (i = 0; i < priv->child_count; i++) {
- if (afr_is_child_present (sh->fresh_children,
- priv->child_count, i))
- continue;
+ return 0;
+}
- if ((!local->child_up[i]) || sh->child_errno[i] != 0)
- continue;
- GF_ASSERT (!uuid_is_null (sh->entrybuf.ia_gfid) ||
- uuid_is_null (sh->buf[i].ia_gfid));
+int
+afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
+{
+ loc_t loc = {0,};
- if ((sh->entrybuf.ia_type != sh->buf[i].ia_type) ||
- (uuid_compare (sh->buf[i].ia_gfid,
- sh->entrybuf.ia_gfid)))
- continue;
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- afr_children_add_child (sh->fresh_children, i,
- priv->child_count);
+ AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom,
+ &loc, name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
- }
- afr_sh_purge_entry_common (frame, this,
- afr_sh_purge_stale_entry_condition);
-}
+ loc_wipe (&loc);
-void
-afr_sh_save_child_iatts_from_policy (int32_t *children, struct iatt *bufs,
- struct iatt *save,
- unsigned int child_count)
-{
- int i = 0;
- int child = 0;
- gf_boolean_t saved = _gf_false;
-
- GF_ASSERT (save);
- //if iatt buf with gfid exists sets it
- for (i = 0; i < child_count; i++) {
- child = children[i];
- if (child == -1)
- break;
- *save = bufs[child];
- saved = _gf_true;
- if (!uuid_is_null (save->ia_gfid))
- break;
- }
- GF_ASSERT (saved);
+ return afr_selfheal_locked_fill (frame, this, locked_on);
}
-void
-afr_get_children_of_fresh_parent_dirs (afr_self_heal_t *sh,
- unsigned int child_count)
-{
- afr_children_intersection_get (sh->success_children,
- sh->fresh_parent_dirs,
- sh->sources, child_count);
- afr_get_fresh_children (sh->success_children, sh->sources,
- sh->fresh_children, child_count);
- memset (sh->sources, 0, sizeof (*sh->sources) * child_count);
-}
-void
-afr_sh_children_lookup_done (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+int
+afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int32_t fresh_child_enoents = 0;
- int32_t fresh_parent_count = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (op_ret < 0)
- goto fail;
- afr_get_children_of_fresh_parent_dirs (sh, priv->child_count);
- fresh_parent_count = afr_get_children_count (sh->fresh_parent_dirs,
- priv->child_count);
- //we need the enoent count of the subvols present in fresh_parent_dirs
- fresh_child_enoents = afr_errno_count (sh->fresh_parent_dirs,
- sh->child_errno,
- priv->child_count, ENOENT);
- if (fresh_child_enoents == fresh_parent_count) {
- gf_log (this->name, GF_LOG_INFO, "Deleting stale file %s",
- local->loc.path);
- afr_sh_set_error (sh, ENOENT);
- sh->op_failed = 1;
- afr_sh_purge_entry (frame, this);
- } else if (!afr_conflicting_iattrs (sh->buf, sh->fresh_children,
- priv->child_count, local->loc.path,
- this->name)) {
- afr_sh_save_child_iatts_from_policy (sh->fresh_children,
- sh->buf, &sh->entrybuf,
- priv->child_count);
- afr_update_gfid_from_iatts (sh->sh_gfid_req, sh->buf,
- sh->fresh_children,
- priv->child_count);
- afr_sh_purge_stale_entry (frame, this);
- } else {
- op_errno = EIO;
- local->govinda_gOvinda = 1;
- goto fail;
- }
-
- return;
-
-fail:
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- afr_sh_missing_entries_finish (frame, this);
- return;
-}
+ loc_t loc = {0,};
+ afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
-static void
-afr_sh_find_fresh_parents (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int enoent_count = 0;
- int nsources = 0;
- int source = -1;
- int32_t subvol_status = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (op_ret < 0)
- goto out;
- enoent_count = afr_errno_count (NULL, sh->child_errno,
- priv->child_count, ENOENT);
- if (enoent_count > 0) {
- gf_log (this->name, GF_LOG_INFO, "Parent dir missing for %s,"
- " in missing entry self-heal, aborting missing-entry "
- "self-heal",
- local->loc.path);
- afr_sh_missing_entries_finish (frame, this);
- return;
- }
-
- nsources = afr_build_sources (this, sh->xattr, sh->buf,
- sh->pending_matrix, sh->sources,
- sh->success_children,
- AFR_ENTRY_TRANSACTION, &subvol_status,
- _gf_true);
- if ((subvol_status & ALL_FOOLS) ||
- (subvol_status & SPLIT_BRAIN)) {
- gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative "
- "merge", sh->parent_loc.path);
- afr_mark_success_children_sources (sh->sources,
- sh->success_children,
- priv->child_count);
- } else if (nsources < 0) {
- gf_log (this->name, GF_LOG_ERROR, "No sources for dir "
- "of %s, in missing entry self-heal, aborting "
- "self-heal", local->loc.path);
- op_errno = EIO;
- goto out;
- }
-
- source = afr_sh_select_source (sh->sources, priv->child_count);
- if (source == -1) {
- GF_ASSERT (0);
- gf_log (this->name, GF_LOG_DEBUG, "No active sources found.");
- op_errno = EIO;
- goto out;
- }
- afr_get_fresh_children (sh->success_children, sh->sources,
- sh->fresh_parent_dirs, priv->child_count);
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_children_lookup_done, NULL, 0,
- NULL);
- return;
+ priv = this->private;
+ local = frame->local;
-out:
- afr_sh_set_error (sh, op_errno);
- sh->op_failed = 1;
- afr_sh_missing_entries_finish (frame, this);
- return;
-}
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
-void
-afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count)
-{
- int i = 0;
-
- for (i = 0; i < child_count; i++) {
- memset (&sh->buf[i], 0, sizeof (sh->buf[i]));
- memset (&sh->parentbufs[i], 0, sizeof (sh->parentbufs[i]));
- sh->child_errno[i] = 0;
- }
- memset (&sh->parentbuf, 0, sizeof (sh->parentbuf));
- sh->success_count = 0;
- afr_reset_children (sh->success_children, child_count);
- afr_reset_children (sh->fresh_children, child_count);
- afr_reset_xattr (sh->xattr, child_count);
- loc_wipe (&sh->lookup_loc);
-}
+ AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, &loc,
+ name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
-/* afr self-heal state will be lost if this call is made
- * please check the afr_sh_common_reset that is called in this function
- */
-int
-afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
- afr_lookup_done_cbk_t lookup_done , uuid_t gfid,
- int32_t flags, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- int i = 0;
- int call_count = 0;
- afr_private_t *priv = NULL;
- dict_t *xattr_req = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
-
- call_count = afr_up_children_count (local->child_up, priv->child_count);
-
- local->call_count = call_count;
-
- xattr_req = dict_new();
-
- if (xattr_req) {
- afr_xattr_req_prepare (this, xattr_req, loc->path);
- if (gfid) {
- gf_log (this->name, GF_LOG_DEBUG,
- "looking up %s with gfid: %s",
- loc->path, uuid_utoa (gfid));
- GF_ASSERT (!uuid_is_null (gfid));
- afr_set_dict_gfid (xattr_req, gfid);
- }
- }
-
- afr_sh_common_reset (sh, priv->child_count);
- sh->lookup_done = lookup_done;
- loc_copy (&sh->lookup_loc, loc);
- sh->lookup_flags = flags;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- gf_log (this->name, GF_LOG_DEBUG,
- "looking up %s on subvolume %s",
- loc->path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame,
- afr_sh_common_lookup_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- loc, xattr_req);
-
- if (!--call_count)
- break;
- }
- }
-
- if (xattr_req)
- dict_unref (xattr_req);
-
- return 0;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == -1 &&
+ local->replies[i].op_errno == EAGAIN) {
+ afr_selfheal_locked_fill (frame, this, locked_on);
+ afr_selfheal_unentrylk (frame, this, inode, dom, name,
+ locked_on);
+ AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom,
+ &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+ break;
+ }
+ }
+ loc_wipe (&loc);
-int
-afr_sh_post_nb_entrylk_conflicting_sh_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "Non blocking entrylks failed.");
- sh->op_failed = -1;
- afr_sh_missing_entries_done (frame, this);
- } else {
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking entrylks done. Proceeding to FOP");
- afr_sh_common_lookup (frame, this, &sh->parent_loc,
- afr_sh_find_fresh_parents,
- NULL, AFR_LOOKUP_FAIL_CONFLICTS,
- NULL);
- }
-
- return 0;
+ return afr_selfheal_locked_fill (frame, this, locked_on);
}
+
int
-afr_sh_post_nb_entrylk_gfid_sh_cbk (call_frame_t *frame, xlator_t *this)
+afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- int_lock = &local->internal_lock;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "Non blocking entrylks failed.");
- afr_sh_missing_entries_done (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "Non blocking entrylks done. Proceeding to FOP");
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_missing_entries_lookup_done,
- sh->sh_gfid_req, AFR_LOOKUP_FAIL_CONFLICTS|
- AFR_LOOKUP_FAIL_MISSING_GFIDS,
- NULL);
- }
-
- return 0;
+ loc_t loc = {0,};
+
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
+
+ AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, entrylk,
+ dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
+
+ loc_wipe (&loc);
+
+ return 0;
}
-int
-afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc,
- char *base_name, afr_lock_cbk_t lock_cbk)
+
+gf_boolean_t
+afr_is_pending_set (xlator_t *this, dict_t *xdata, int type)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ int idx = -1;
+ afr_private_t *priv = NULL;
+ void *pending_raw = NULL;
+ int *pending_int = NULL;
+ int i = 0;
- local = frame->local;
- int_lock = &local->internal_lock;
+ priv = this->private;
+ idx = afr_index_for_transaction_type (type);
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_ENTRY_SELF_HEAL_LK;
+ if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) {
+ if (pending_raw) {
+ pending_int = pending_raw;
- afr_set_lock_number (frame, this);
+ if (ntoh32 (pending_int[idx]))
+ return _gf_true;
+ }
+ }
- int_lock->lk_basename = base_name;
- int_lock->lk_loc = loc;
- int_lock->lock_cbk = lock_cbk;
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr (xdata, priv->pending_key[i],
+ &pending_raw))
+ continue;
+ if (!pending_raw)
+ continue;
+ pending_int = pending_raw;
- afr_nonblocking_entrylk (frame, this);
+ if (ntoh32 (pending_int[idx]))
+ return _gf_true;
+ }
- return 0;
+ return _gf_false;
}
-static int
-afr_self_heal_parent_entrylk (call_frame_t *frame, xlator_t *this,
- afr_lock_cbk_t lock_cbk)
+
+gf_boolean_t
+afr_is_data_set (xlator_t *this, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_internal_lock_t *int_lock = NULL;
- int ret = -1;
- int32_t op_errno = 0;
-
- local = frame->local;
- sh = &local->self_heal;
-
- gf_log (this->name, GF_LOG_TRACE,
- "attempting to recreate missing entries for path=%s",
- local->loc.path);
-
- ret = afr_build_parent_loc (&sh->parent_loc, &local->loc, &op_errno);
- if (ret)
- goto out;
-
- afr_sh_entrylk (frame, this, &sh->parent_loc, NULL,
- lock_cbk);
- return 0;
-out:
- int_lock = &local->internal_lock;
- int_lock->lock_op_ret = -1;
- lock_cbk (frame, this);
- return 0;
+ return afr_is_pending_set (this, xdata, AFR_DATA_TRANSACTION);
}
-static int
-afr_self_heal_conflicting_entries (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_is_metadata_set (xlator_t *this, dict_t *xdata)
{
- afr_self_heal_parent_entrylk (frame, this,
- afr_sh_post_nb_entrylk_conflicting_sh_cbk);
- return 0;
+ return afr_is_pending_set (this, xdata, AFR_METADATA_TRANSACTION);
}
-static int
-afr_self_heal_gfids (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_is_entry_set (xlator_t *this, dict_t *xdata)
{
- afr_self_heal_parent_entrylk (frame, this,
- afr_sh_post_nb_entrylk_gfid_sh_cbk);
- return 0;
+ return afr_is_pending_set (this, xdata, AFR_ENTRY_TRANSACTION);
}
-afr_local_t *afr_local_copy (afr_local_t *l, xlator_t *this)
+
+void
+afr_inode_link (inode_t *inode, struct iatt *iatt)
{
- afr_private_t *priv = NULL;
- afr_local_t *lc = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *shc = NULL;
-
- priv = this->private;
-
- sh = &l->self_heal;
-
- lc = mem_get0 (this->local_pool);
- if (!lc)
- goto out;
-
- shc = &lc->self_heal;
-
- shc->unwind = sh->unwind;
- shc->gfid_sh_success_cbk = sh->gfid_sh_success_cbk;
- shc->do_missing_entry_self_heal = sh->do_missing_entry_self_heal;
- shc->do_gfid_self_heal = sh->do_gfid_self_heal;
- shc->do_data_self_heal = sh->do_data_self_heal;
- shc->do_metadata_self_heal = sh->do_metadata_self_heal;
- shc->do_entry_self_heal = sh->do_entry_self_heal;
- shc->force_confirm_spb = sh->force_confirm_spb;
- shc->forced_merge = sh->forced_merge;
- shc->background = sh->background;
- shc->type = sh->type;
-
- uuid_copy (shc->sh_gfid_req, sh->sh_gfid_req);
- if (l->loc.path)
- loc_copy (&lc->loc, &l->loc);
-
- lc->child_up = memdup (l->child_up,
- sizeof (*lc->child_up) * priv->child_count);
- if (l->xattr_req)
- lc->xattr_req = dict_ref (l->xattr_req);
-
- if (l->cont.lookup.inode)
- lc->cont.lookup.inode = inode_ref (l->cont.lookup.inode);
- if (l->cont.lookup.xattr)
- lc->cont.lookup.xattr = dict_ref (l->cont.lookup.xattr);
- if (l->internal_lock.inode_locked_nodes)
- lc->internal_lock.inode_locked_nodes =
- memdup (l->internal_lock.inode_locked_nodes,
- sizeof (*lc->internal_lock.inode_locked_nodes) * priv->child_count);
- else
- lc->internal_lock.inode_locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.inode_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
- if (l->internal_lock.entry_locked_nodes)
- lc->internal_lock.entry_locked_nodes =
- memdup (l->internal_lock.entry_locked_nodes,
- sizeof (*lc->internal_lock.entry_locked_nodes) * priv->child_count);
- else
- lc->internal_lock.entry_locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.entry_locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
- if (l->internal_lock.locked_nodes)
- lc->internal_lock.locked_nodes =
- memdup (l->internal_lock.locked_nodes,
- sizeof (*lc->internal_lock.locked_nodes) * priv->child_count);
- else
- lc->internal_lock.locked_nodes =
- GF_CALLOC (sizeof (*l->internal_lock.locked_nodes),
- priv->child_count,
- gf_afr_mt_char);
-
- lc->internal_lock.inodelk_lock_count =
- l->internal_lock.inodelk_lock_count;
- lc->internal_lock.entrylk_lock_count =
- l->internal_lock.entrylk_lock_count;
+ inode_t *linked_inode = NULL;
-out:
- return lc;
+ linked_inode = inode_link (inode, NULL, NULL, iatt);
+
+ uuid_copy (inode->gfid, iatt->ia_gfid);
+ inode->ia_type = iatt->ia_type;
+
+ if (linked_inode) {
+ inode_lookup (linked_inode);
+ inode_unref (linked_inode);
+ }
}
+
+/*
+ * This function inspects the looked up replies (in an unlocked manner)
+ * and decides whether a locked verification and possible healing is
+ * required or not. It updates the three booleans for each type
+ * of healing. If the boolean flag gets set to FALSE, then we are sure
+ * no healing is required. If the boolean flag gets set to TRUE then
+ * we have to proceed with locked reinspection.
+ */
+
int
-afr_self_heal_completion_cbk (call_frame_t *bgsh_frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
- char sh_type_str[256] = {0,};
- gf_boolean_t split_brain = _gf_false;
+afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
+ inode_t *inode, uuid_t gfid,
+ gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal,
+ gf_boolean_t *entry_selfheal)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int valid_cnt = 0;
+ struct iatt first = {0, };
+ struct afr_reply *replies = NULL;
+ int ret = -1;
+
+ priv = this->private;
+
+ replies = alloca0 (sizeof (*replies) * priv->child_count);
+
+ ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret == -1)
+ continue;
+
+ if (afr_is_data_set (this, replies[i].xdata))
+ *data_selfheal = _gf_true;
+
+ if (afr_is_metadata_set (this, replies[i].xdata))
+ *metadata_selfheal = _gf_true;
- priv = this->private;
- local = bgsh_frame->local;
- sh = &local->self_heal;
+ if (afr_is_entry_set (this, replies[i].xdata))
+ *entry_selfheal = _gf_true;
- if (local->govinda_gOvinda || sh->mdata_spb || sh->data_spb) {
- split_brain = _gf_true;
- sh->op_failed = 1;
- }
+ valid_cnt ++;
+ if (valid_cnt == 1) {
+ first = replies[i].poststat;
+ continue;
+ }
- afr_set_split_brain (this, sh->inode, split_brain);
+ if (!IA_EQUAL (first, replies[i].poststat, type)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "TYPE mismatch %d vs %d on %s for gfid:%s",
+ (int) first.ia_type,
+ (int) replies[i].poststat.ia_type,
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
+ return -EIO;
+ }
- afr_self_heal_type_str_get (sh, sh_type_str,
- sizeof(sh_type_str));
- if (sh->op_failed) {
- gf_loglevel_t loglevel = GF_LOG_ERROR;
- if (priv->shd.iamshd)
- loglevel = GF_LOG_DEBUG;
+ if (!IA_EQUAL (first, replies[i].poststat, uid)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "UID mismatch %d vs %d on %s for gfid:%s",
+ (int) first.ia_uid,
+ (int) replies[i].poststat.ia_uid,
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
- gf_log (this->name, loglevel, "background %s self-heal "
- "failed on %s", sh_type_str, local->loc.path);
+ *metadata_selfheal = _gf_true;
+ }
- } else {
- gf_log (this->name, GF_LOG_DEBUG, "background %s self-heal "
- "completed on %s", sh_type_str, local->loc.path);
+ if (!IA_EQUAL (first, replies[i].poststat, gid)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "GID mismatch %d vs %d on %s for gfid:%s",
+ (int) first.ia_uid,
+ (int) replies[i].poststat.ia_uid,
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
- }
+ *metadata_selfheal = _gf_true;
+ }
- FRAME_SU_UNDO (bgsh_frame, afr_local_t);
+ if (!IA_EQUAL (first, replies[i].poststat, prot)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "MODE mismatch %d vs %d on %s for gfid:%s",
+ (int) st_mode_from_ia (first.ia_prot, 0),
+ (int) st_mode_from_ia (replies[i].poststat.ia_prot, 0),
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
- if (!sh->unwound && sh->unwind) {
- sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno,
- sh->op_failed);
- }
+ *metadata_selfheal = _gf_true;
+ }
- if (sh->background) {
- LOCK (&priv->lock);
- {
- priv->background_self_heals_started--;
- }
- UNLOCK (&priv->lock);
- }
+ if (IA_ISREG(first.ia_type) &&
+ !IA_EQUAL (first, replies[i].poststat, size)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "SIZE mismatch %lld vs %lld on %s for gfid:%s",
+ (long long) first.ia_size,
+ (long long) replies[i].poststat.ia_size,
+ priv->children[i]->name,
+ uuid_utoa (replies[i].poststat.ia_gfid));
- AFR_STACK_DESTROY (bgsh_frame);
+ *data_selfheal = _gf_true;
+ }
+ }
- return 0;
-}
+ if (valid_cnt > 0)
+ afr_inode_link (inode, &first);
-int
-afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int32_t op_errno = 0;
- int ret = 0;
- afr_self_heal_t *orig_sh = NULL;
- call_frame_t *sh_frame = NULL;
- afr_local_t *sh_local = NULL;
- loc_t *loc = NULL;
-
- local = frame->local;
- orig_sh = &local->self_heal;
- priv = this->private;
-
- GF_ASSERT (local->loc.path);
-
- gf_log (this->name, GF_LOG_TRACE,
- "performing self heal on %s (metadata=%d data=%d entry=%d)",
- local->loc.path,
- local->self_heal.do_metadata_self_heal,
- local->self_heal.do_data_self_heal,
- local->self_heal.do_entry_self_heal);
-
- op_errno = ENOMEM;
- sh_frame = copy_frame (frame);
- if (!sh_frame)
- goto out;
- afr_set_lk_owner (sh_frame, this, sh_frame->root);
- afr_set_low_priority (sh_frame);
-
- sh_local = afr_local_copy (local, this);
- if (!sh_local)
- goto out;
- sh_frame->local = sh_local;
- sh = &sh_local->self_heal;
-
- sh->inode = inode_ref (inode);
- sh->orig_frame = frame;
-
- sh->completion_cbk = afr_self_heal_completion_cbk;
-
- sh->success = GF_CALLOC (priv->child_count, sizeof (*sh->success),
- gf_afr_mt_char);
- if (!sh->success)
- goto out;
- sh->sources = GF_CALLOC (sizeof (*sh->sources), priv->child_count,
- gf_afr_mt_int);
- if (!sh->sources)
- goto out;
- sh->locked_nodes = GF_CALLOC (sizeof (*sh->locked_nodes),
- priv->child_count,
- gf_afr_mt_int);
- if (!sh->locked_nodes)
- goto out;
-
- sh->pending_matrix = afr_matrix_create (priv->child_count,
- priv->child_count);
- if (!sh->pending_matrix)
- goto out;
-
- sh->delta_matrix = afr_matrix_create (priv->child_count,
- priv->child_count);
- if (!sh->delta_matrix)
- goto out;
-
- sh->fresh_parent_dirs = afr_children_create (priv->child_count);
- if (!sh->fresh_parent_dirs)
- goto out;
- ret = afr_sh_common_create (sh, priv->child_count);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
-
- if (local->self_heal.background) {
- LOCK (&priv->lock);
- {
- if (priv->background_self_heals_started
- < priv->background_self_heal_count) {
- priv->background_self_heals_started++;
-
-
- } else {
- local->self_heal.background = _gf_false;
- sh->background = _gf_false;
- }
- }
- UNLOCK (&priv->lock);
- }
-
- if (!local->loc.parent) {
- sh->do_missing_entry_self_heal = _gf_false;
- sh->do_gfid_self_heal = _gf_false;
- }
-
- FRAME_SU_DO (sh_frame, afr_local_t);
- if (sh->do_missing_entry_self_heal) {
- afr_self_heal_conflicting_entries (sh_frame, this);
- } else if (sh->do_gfid_self_heal) {
- GF_ASSERT (!uuid_is_null (sh->sh_gfid_req));
- afr_self_heal_gfids (sh_frame, this);
- } else {
- loc = &sh_local->loc;
- if (uuid_is_null (loc->inode->gfid) && uuid_is_null (loc->gfid)) {
- if (!uuid_is_null (inode->gfid))
- GF_ASSERT (!uuid_compare (inode->gfid,
- sh->sh_gfid_req));
- uuid_copy (loc->gfid, sh->sh_gfid_req);
- }
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to metadata check on %s",
- local->loc.path);
-
- afr_sh_missing_entries_done (sh_frame, this);
- }
- op_errno = 0;
+ if (valid_cnt < 2)
+ return -ENOTCONN;
-out:
- if (op_errno) {
- orig_sh->unwind (frame, this, -1, op_errno, 1);
- if (sh_frame)
- AFR_STACK_DESTROY (sh_frame);
- }
- return 0;
+ return 0;
}
-void
-afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str,
- size_t size)
+
+inode_t *
+afr_inode_find (xlator_t *this, uuid_t gfid)
{
- GF_ASSERT (str && (size > strlen (" missing-entry gfid "
- "meta-data data entry")));
+ inode_table_t *table = NULL;
+ inode_t *inode = NULL;
- if (self_heal_p->do_metadata_self_heal) {
- snprintf (str, size, " meta-data");
- }
+ table = this->itable;
+ if (!table)
+ return NULL;
- if (self_heal_p->do_data_self_heal) {
- snprintf (str + strlen(str), size - strlen(str), " data");
- }
+ inode = inode_find (table, gfid);
+ if (inode)
+ return inode;
- if (self_heal_p->do_entry_self_heal) {
- snprintf (str + strlen(str), size - strlen(str), " entry");
- }
+ inode = inode_new (table);
+ if (!inode)
+ return NULL;
- if (self_heal_p->do_missing_entry_self_heal) {
- snprintf (str + strlen(str), size - strlen(str),
- " missing-entry");
- }
+ uuid_copy (inode->gfid, gfid);
- if (self_heal_p->do_gfid_self_heal) {
- snprintf (str + strlen(str), size - strlen(str), " gfid");
- }
+ return inode;
}
-afr_self_heal_type
-afr_self_heal_type_for_transaction (afr_transaction_type type)
-{
- afr_self_heal_type sh_type = AFR_SELF_HEAL_INVALID;
-
- switch (type) {
- case AFR_DATA_TRANSACTION:
- sh_type = AFR_SELF_HEAL_DATA;
- break;
- case AFR_METADATA_TRANSACTION:
- sh_type = AFR_SELF_HEAL_METADATA;
- break;
- case AFR_ENTRY_TRANSACTION:
- sh_type = AFR_SELF_HEAL_ENTRY;
- break;
- case AFR_ENTRY_RENAME_TRANSACTION:
- GF_ASSERT (0);
- break;
- }
- return sh_type;
-}
-int
-afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
+call_frame_t *
+afr_frame_create (xlator_t *this)
{
- int ret = -1;
- uuid_t pargfid = {0};
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int op_errno = 0;
+ pid_t pid = -1;
- if (!child)
- goto out;
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame)
+ return NULL;
- if (!uuid_is_null (parent->inode->gfid))
- uuid_copy (pargfid, parent->inode->gfid);
- else if (!uuid_is_null (parent->gfid))
- uuid_copy (pargfid, parent->gfid);
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local) {
+ STACK_DESTROY (frame->root);
+ return NULL;
+ }
- if (uuid_is_null (pargfid))
- goto out;
+ syncopctx_setfspid (&pid);
- if (strcmp (parent->path, "/") == 0)
- ret = gf_asprintf ((char **)&child->path, "/%s", name);
- else
- ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path,
- name);
+ frame->root->pid = pid;
- if (-1 == ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "asprintf failed while setting child path");
- }
+ afr_set_lk_owner (frame, this, frame->root);
- child->name = strrchr (child->path, '/');
- if (child->name)
- child->name++;
-
- child->parent = inode_ref (parent->inode);
- child->inode = inode_new (parent->inode->table);
- uuid_copy (child->pargfid, pargfid);
-
- if (!child->inode) {
- ret = -1;
- goto out;
- }
+ return frame;
+}
- ret = 0;
-out:
- if ((ret == -1) && child)
- loc_wipe (child);
- return ret;
-}
+/*
+ * This is the entry point for healing a given GFID
+ */
int
-afr_sh_erase_pending (call_frame_t *frame, xlator_t *this,
- afr_transaction_type type, afr_fxattrop_cbk_t cbk,
- int (*finish)(call_frame_t *frame, xlator_t *this))
+afr_selfheal (xlator_t *this, uuid_t gfid)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- dict_t **erase_xattr = NULL;
- int ret = -1;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- afr_sh_pending_to_delta (priv, sh->xattr, sh->delta_matrix,
- sh->success, priv->child_count, type);
-
- erase_xattr = GF_CALLOC (sizeof (*erase_xattr), priv->child_count,
- gf_afr_mt_dict_t);
- if (!erase_xattr)
- goto out;
-
- for (i = 0; i < priv->child_count; i++) {
- if (sh->xattr[i]) {
- call_count++;
- erase_xattr[i] = dict_new ();
- if (!erase_xattr[i])
- goto out;
- }
- }
-
- afr_sh_delta_to_xattr (this, sh->delta_matrix, erase_xattr,
- priv->child_count, type);
-
- gf_log (this->name, GF_LOG_DEBUG, "Delta matrix for: %s",
- lkowner_utoa (&frame->root->lk_owner));
- afr_sh_print_pending_matrix (sh->delta_matrix, this);
- local->call_count = call_count;
- if (call_count == 0) {
- ret = 0;
- finish (frame, this);
- goto out;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (!erase_xattr[i])
- continue;
-
- if (sh->healing_fd) {//true for ENTRY, reg file DATA transaction
- STACK_WIND_COOKIE (frame, cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- sh->healing_fd,
- GF_XATTROP_ADD_ARRAY, erase_xattr[i],
- NULL);
- } else {
- STACK_WIND_COOKIE (frame, cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, erase_xattr[i],
- NULL);
- }
- }
-
- ret = 0;
+ inode_t *inode = NULL;
+ call_frame_t *frame = NULL;
+ int ret = -1;
+ gf_boolean_t data_selfheal = _gf_false;
+ gf_boolean_t metadata_selfheal = _gf_false;
+ gf_boolean_t entry_selfheal = _gf_false;
+
+ inode = afr_inode_find (this, gfid);
+ if (!inode)
+ goto out;
+
+ frame = afr_frame_create (this);
+ if (!frame)
+ goto out;
+
+ ret = afr_selfheal_unlocked_inspect (frame, this, inode, gfid,
+ &data_selfheal,
+ &metadata_selfheal,
+ &entry_selfheal);
+ if (ret)
+ goto out;
+
+ if (data_selfheal)
+ afr_selfheal_data (frame, this, inode);
+
+ if (metadata_selfheal)
+ afr_selfheal_metadata (frame, this, inode);
+
+ if (entry_selfheal)
+ afr_selfheal_entry (frame, this, inode);
+
+ inode_forget (inode, 1);
out:
- if (erase_xattr) {
- for (i = 0; i < priv->child_count; i++) {
- if (erase_xattr[i]) {
- dict_unref (erase_xattr[i]);
- }
- }
- }
-
- GF_FREE (erase_xattr);
-
- if (ret < 0) {
- sh->op_failed = _gf_true;
- finish (frame, this);
- }
-
- return 0;
+ if (inode)
+ inode_unref (inode);
+ if (frame)
+ AFR_STACK_DESTROY (frame);
+
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h
deleted file mode 100644
index 6eabd1766..000000000
--- a/xlators/cluster/afr/src/afr-self-heal-common.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef __AFR_SELF_HEAL_COMMON_H__
-#define __AFR_SELF_HEAL_COMMON_H__
-
-#define FILE_HAS_HOLES(buf) (((buf)->ia_size) > ((buf)->ia_blocks * 512))
-#define AFR_SH_MIN_PARTICIPANTS 2
-
-typedef enum {
- AFR_SELF_HEAL_ENTRY,
- AFR_SELF_HEAL_METADATA,
- AFR_SELF_HEAL_DATA,
- AFR_SELF_HEAL_INVALID = -1,
-} afr_self_heal_type;
-
-typedef enum {
- AFR_LOOKUP_FAIL_CONFLICTS = 1,
- AFR_LOOKUP_FAIL_MISSING_GFIDS = 2,
-} afr_lookup_flags_t;
-
-int
-afr_sh_select_source (int sources[], int child_count);
-
-int
-afr_sh_source_count (int sources[], int child_count);
-
-void
-afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this);
-
-int
-afr_build_pending_matrix (char **pending_key, int32_t **pending_matrix,
- unsigned char *ignorant_subvols,
- dict_t *xattr[], afr_transaction_type type,
- size_t child_count);
-
-void
-afr_sh_pending_to_delta (afr_private_t *priv, dict_t **xattr,
- int32_t *delta_matrix[], unsigned char success[],
- int child_count, afr_transaction_type type);
-
-int
-afr_mark_sources (xlator_t *this, int32_t *sources, int32_t **pending_matrix,
- struct iatt *bufs, afr_self_heal_type type,
- int32_t *success_children, int32_t *subvol_status);
-
-int
-afr_sh_delta_to_xattr (xlator_t *this,
- int32_t *delta_matrix[], dict_t *xattr[],
- int child_count, afr_transaction_type type);
-
-void
-afr_self_heal_type_str_get (afr_self_heal_t *self_heal_p, char *str,
- size_t size);
-
-afr_self_heal_type
-afr_self_heal_type_for_transaction (afr_transaction_type type);
-
-int
-afr_build_sources (xlator_t *this, dict_t **xattr, struct iatt *bufs,
- int32_t **pending_matrix, int32_t *sources,
- int32_t *success_children, afr_transaction_type type,
- int32_t *subvol_status, gf_boolean_t ignore_ignorant);
-void
-afr_sh_common_reset (afr_self_heal_t *sh, unsigned int child_count);
-
-void
-afr_sh_common_lookup_resp_handler (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf,
- dict_t *xattr, struct iatt *postparent,
- loc_t *loc);
-
-int
-afr_sh_common_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
- afr_lookup_done_cbk_t lookup_cbk, uuid_t uuid,
- int32_t flags, dict_t *xdata);
-int
-afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this,
- int active_src, struct iatt *buf,
- struct iatt *parentbuf);
-int
-afr_sh_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc,
- char *base_name, afr_lock_cbk_t lock_cbk);
-int
-afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this,
- int child_index);
-int
-afr_sh_data_unlock (call_frame_t *frame, xlator_t *this,
- afr_lock_cbk_t lock_cbk);
-afr_local_t *
-afr_local_copy (afr_local_t *l, xlator_t *this);
-int
-afr_sh_data_lock (call_frame_t *frame, xlator_t *this,
- off_t start, off_t len,
- afr_lock_cbk_t success_handler,
- afr_lock_cbk_t failure_handler);
-void
-afr_sh_set_error (afr_self_heal_t *sh, int32_t op_errno);
-void
-afr_sh_mark_source_sinks (call_frame_t *frame, xlator_t *this);
-typedef int
-(*afr_fxattrop_cbk_t) (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr, dict_t *xdata);
-int
-afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name);
-int
-afr_impunge_frame_create (call_frame_t *frame, xlator_t *this,
- int active_source, call_frame_t **impunge_frame);
-void
-afr_sh_reset (call_frame_t *frame, xlator_t *this);
-
-void
-afr_children_intersection_get (int32_t *set1, int32_t *set2,
- int *intersection, unsigned int child_count);
-int
-afr_get_no_xattr_dir_read_child (xlator_t *this, int32_t *success_children,
- struct iatt *bufs);
-int
-afr_sh_erase_pending (call_frame_t *frame, xlator_t *this,
- afr_transaction_type type, afr_fxattrop_cbk_t cbk,
- int (*finish)(call_frame_t *frame, xlator_t *this));
-#endif /* __AFR_SELF_HEAL_COMMON_H__ */
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 6501e596c..c0548d995 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,1387 +8,628 @@
cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
-#include "glusterfs.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-#include "afr-self-heal-algorithm.h"
-
-int
-afr_sh_data_fail (call_frame_t *frame, xlator_t *this);
-
-static inline gf_boolean_t
-afr_sh_data_proceed (unsigned int success_count)
-{
- return (success_count >= AFR_SH_MIN_PARTICIPANTS);
-}
-
-extern int
-sh_loop_finish (call_frame_t *loop_frame, xlator_t *this);
-
-int
-afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this);
-
-int
-afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_data_finish (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_data_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh->completion_cbk (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "flush failed on %s on subvolume %s: %s",
- local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_data_done (frame, this);
- }
-
- return 0;
-}
-
-int
-afr_sh_data_close (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (!sh->healing_fd) {
- //This happens when file is non-reg
- afr_sh_data_done (frame, this);
- return 0;
- }
- call_count = afr_set_elem_count_get (sh->success,
- priv->child_count);
- local->call_count = call_count;
-
- if (call_count == 0) {
- afr_sh_data_done (frame, this);
- return 0;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (!sh->success[i])
- continue;
- gf_log (this->name, GF_LOG_DEBUG,
- "closing fd of %s on %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->flush,
- sh->healing_fd, NULL);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-}
-
-int
-afr_sh_data_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *statpre,
- struct iatt *statpost, dict_t *xdata)
-{
-
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "setattr failed on %s on subvolume %s: %s",
- local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- afr_sh_data_finish (frame, this);
- }
-
- return 0;
-}
-
-int
-afr_sh_data_setattr (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- int i = 0;
- int call_count = 0;
- int source = 0;
- int32_t valid = 0;
- struct iatt stbuf = {0,};
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- valid |= (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME);
-
- stbuf.ia_atime = sh->buf[source].ia_atime;
- stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec;
- stbuf.ia_mtime = sh->buf[source].ia_mtime;
- stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec;
-
- call_count = afr_set_elem_count_get (sh->success,
- priv->child_count);
- local->call_count = call_count;
-
- if (call_count == 0) {
- GF_ASSERT (0);
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (!sh->success[i])
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_setattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setattr,
- &local->loc, &stbuf, valid, NULL);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-}
-
-int
-afr_sh_data_setattr_fstat_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iatt *buf, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int child_index = (long) cookie;
-
- local = frame->local;
- sh = &local->self_heal;
-
- GF_ASSERT (sh->source == child_index);
- if (op_ret != -1) {
- sh->buf[child_index] = *buf;
- afr_sh_data_setattr (frame, this);
- } else {
- gf_log (this->name, GF_LOG_ERROR, "%s: Failed to set "
- "time-stamps after self-heal", local->loc.path);
- afr_sh_data_fail (frame, this);
- }
-
- return 0;
-}
-
-/*
- * If there are any writes after the self-heal is triggered then the
- * stbuf stored in local->self_heal.buf[] will be invalid so we do one more
- * stat on the source and then set the [am]times
- */
-int
-afr_sh_set_timestamps (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_setattr_fstat_cbk,
- (void *) (long) sh->source,
- priv->children[sh->source],
- priv->children[sh->source]->fops->fstat,
- sh->healing_fd, NULL);
- return 0;
-}
-
-//Fun fact, lock_cbk is being used for both lock & unlock
-int
-afr_sh_data_unlock (call_frame_t *frame, xlator_t *this,
- afr_lock_cbk_t lock_cbk)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
-
- GF_ASSERT (sh->data_lock_held);
-
- sh->data_lock_held = _gf_false;
- int_lock->lock_cbk = lock_cbk;
- afr_unlock (frame, this);
-
- return 0;
-}
-
-int
-afr_sh_data_finish (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "finishing data selfheal of %s", local->loc.path);
-
- if (sh->data_lock_held)
- afr_sh_data_unlock (frame, this, afr_sh_data_close);
- else
- afr_sh_data_close (frame, this);
-
- return 0;
-}
-
-int
-afr_sh_data_fail (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "finishing failed data selfheal of %s", local->loc.path);
-
- sh->op_failed = 1;
- if (sh->data_lock_held)
- afr_sh_data_unlock (frame, this, afr_sh_data_close);
- else
- afr_sh_data_close (frame, this);
- return 0;
-}
-
-int
-afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr, dict_t *xdata)
-{
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int32_t child_index = (long) cookie;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Erasing of pending change "
- "log failed on %s for subvol %s, reason: %s",
- local->loc.path, priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- }
-
- call_count = afr_frame_return (frame);
+#include "byte-order.h"
- if (call_count == 0) {
- if (sh->op_failed) {
- if (sh->old_loop_frame)
- sh_loop_finish (sh->old_loop_frame, this);
- sh->old_loop_frame = NULL;
- afr_sh_data_fail (frame, this);
- goto out;
- }
- if (!IA_ISREG (sh->type)) {
- afr_sh_data_finish (frame, this);
- goto out;
- }
- GF_ASSERT (sh->old_loop_frame);
- afr_sh_data_lock (frame, this, 0, 0,
- afr_post_sh_big_lock_success,
- afr_post_sh_big_lock_failure);
- }
-out:
- return 0;
-}
+enum {
+ AFR_SELFHEAL_DATA_FULL = 0,
+ AFR_SELFHEAL_DATA_DIFF,
+};
-int
-afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this)
-{
- afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION,
- afr_sh_data_erase_pending_cbk,
- afr_sh_data_finish);
- return 0;
-}
-
-static struct afr_sh_algorithm *
-sh_algo_from_name (xlator_t *this, char *name)
+#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size))
+static int
+__checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, uint32_t weak, uint8_t *strong,
+ dict_t *xdata)
{
- int i = 0;
+ afr_local_t *local = NULL;
+ int i = (long) cookie;
- while (afr_self_heal_algorithms[i].name) {
- if (!strcmp (name, afr_self_heal_algorithms[i].name)) {
- return &afr_self_heal_algorithms[i];
- }
+ local = frame->local;
- i++;
- }
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (strong)
+ memcpy (local->replies[i].checksum, strong, MD5_DIGEST_LENGTH);
- return NULL;
+ syncbarrier_wake (&local->barrier);
+ return 0;
}
static int
-sh_zero_byte_files_exist (afr_local_t *local, int child_count)
+attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre, struct iatt *post,
+ dict_t *xdata)
{
- int i = 0;
- int ret = 0;
- afr_self_heal_t *sh = NULL;
-
- sh = &local->self_heal;
- for (i = 0; i < child_count; i++) {
- if (!local->child_up[i] || sh->child_errno[i])
- continue;
- if (sh->buf[i].ia_size == 0) {
- ret = 1;
- break;
- }
- }
+ int i = (long) cookie;
+ afr_local_t *local = NULL;
- return ret;
-}
+ local = frame->local;
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (pre)
+ local->replies[i].prestat = *pre;
+ if (post)
+ local->replies[i].poststat = *post;
+ if (xdata)
+ local->replies[i].xdata = dict_ref (xdata);
-struct afr_sh_algorithm *
-afr_sh_data_pick_algo (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- struct afr_sh_algorithm * algo = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- algo = sh_algo_from_name (this, priv->data_self_heal_algorithm);
-
- if (algo == NULL) {
- /* option not set, so fall back on heuristics */
-
- if (sh_zero_byte_files_exist (local, priv->child_count)
- || (sh->file_size <= (priv->data_self_heal_window_size *
- this->ctx->page_size))) {
-
- /*
- * If the file does not exist on one of the subvolumes,
- * or a zero-byte file exists (created by entry self-heal)
- * the entire content has to be copied anyway, so there
- * is no benefit from using the "diff" algorithm.
- *
- * If the file size is about the same as page size,
- * the entire file can be read and written with a few
- * (pipelined) STACK_WINDs, which will be faster
- * than "diff" which has to read checksums and then
- * read and write.
- */
-
- algo = sh_algo_from_name (this, "full");
-
- } else {
- algo = sh_algo_from_name (this, "diff");
- }
- }
+ syncbarrier_wake (&local->barrier);
- return algo;
+ return 0;
}
-int
-afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+__afr_selfheal_data_checksums_match (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, int source,
+ unsigned char *healed_sinks,
+ off_t offset, size_t size)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- struct afr_sh_algorithm *sh_algo = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ unsigned char *wind_subvols = NULL;
+ int i = 0;
- local = frame->local;
- sh = &local->self_heal;
+ priv = this->private;
+ local = frame->local;
- sh->algo_completion_cbk = afr_sh_data_erase_pending;
- sh->algo_abort_cbk = afr_sh_data_fail;
+ wind_subvols = alloca0 (priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || healed_sinks[i])
+ wind_subvols[i] = 1;
+ }
- sh_algo = afr_sh_data_pick_algo (frame, this);
+ AFR_ONLIST (wind_subvols, frame, __checksum_cbk, rchecksum, fd,
+ offset, size, NULL);
- sh->algo = sh_algo;
- sh_algo->fn (frame, this);
+ if (!local->replies[source].valid || local->replies[source].op_ret != 0)
+ return _gf_false;
- return 0;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source)
+ continue;
+ if (memcmp (local->replies[source].checksum,
+ local->replies[i].checksum,
+ MD5_DIGEST_LENGTH))
+ return _gf_false;
+ }
-int
-afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- int call_count = 0;
- int child_index = 0;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "ftruncate of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "ftruncate of %s on subvolume %s completed",
- local->loc.path,
- priv->children[child_index]->name);
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (sh->op_failed)
- afr_sh_data_fail (frame, this);
- else
- afr_sh_data_sync_prepare (frame, this);
- }
-
- return 0;
+ return _gf_true;
}
-int
-afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- afr_self_heal_t *sh = NULL;
- int *sources = NULL;
- int call_count = 0;
- int i = 0;
-
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sources = sh->sources;
- call_count = sh->active_sinks;
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i] || !local->child_up[i])
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->ftruncate,
- sh->healing_fd, sh->file_size,
- NULL);
-
- if (!--call_count)
- break;
- }
-
- return 0;
+static int
+__afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ off_t offset, size_t size,
+ struct afr_reply *replies)
+{
+ struct iovec *iovec = NULL;
+ int count = 0;
+ struct iobref *iobref = NULL;
+ int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ ret = syncop_readv (priv->children[source], fd, size, offset, 0,
+ &iovec, &count, &iobref);
+ if (ret <= 0)
+ return ret;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+
+ /*
+ * TODO: Use fiemap() and discard() to heal holes
+ * in the future.
+ *
+ * For now,
+ *
+ * - if the source had any holes at all,
+ * AND
+ * - if we are writing past the original file size
+ * of the sink
+ * AND
+ * - is NOT the last block of the source file. if
+ * the block contains EOF, it has to be written
+ * in order to set the file size even if the
+ * last block is 0-filled.
+ * AND
+ * - if the read buffer is filled with only 0's
+ *
+ * then, skip writing to this source. We don't depend
+ * on the write to happen to update the size as we
+ * have performed an ftruncate() upfront anyways.
+ */
+#define is_last_block(o,b,s) ((s >= o) && (s <= (o + b)))
+ if (HAS_HOLES ((&replies[source].poststat)) &&
+ offset >= replies[i].poststat.ia_size &&
+ !is_last_block (offset, size,
+ replies[source].poststat.ia_size) &&
+ (iov_0filled (iovec, count) == 0))
+ continue;
+
+ ret = syncop_writev (priv->children[i], fd, iovec, count,
+ offset, iobref, 0);
+ if (ret != iov_length (iovec, count)) {
+ /* write() failed on this sink. unset the corresponding
+ member in sinks[] (which is healed_sinks[] in the
+ caller) so that this server does NOT get considered
+ as successfully healed.
+ */
+ healed_sinks[i] = 0;
+ }
+ }
+ if (iobref)
+ iobref_unref (iobref);
+
+ return ret;
}
-int
-afr_sh_inode_set_read_ctx (afr_self_heal_t *sh, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- int ret = 0;
- int i = 0;
-
- priv = this->private;
- sh->source = afr_sh_select_source (sh->sources, priv->child_count);
- if (sh->source < 0) {
- ret = -1;
- goto out;
- }
-
- /* detect changes not visible through pending flags -- JIC */
- for (i = 0; i < priv->child_count; i++) {
- if (i == sh->source || sh->child_errno[i])
- continue;
-
- if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[sh->source]))
- sh->sources[i] = 0;
- }
- afr_reset_children (sh->fresh_children, priv->child_count);
- afr_get_fresh_children (sh->success_children, sh->sources,
- sh->fresh_children, priv->child_count);
- afr_inode_set_read_ctx (this, sh->inode, sh->source,
- sh->fresh_children);
-out:
- return ret;
+static int
+afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks, off_t offset,
+ size_t size, int type, struct afr_reply *replies)
+{
+ int ret = -1;
+ int sink_count = 0;
+ afr_private_t *priv = NULL;
+ unsigned char *data_lock = NULL;
+
+ priv = this->private;
+ sink_count = AFR_COUNT (healed_sinks, priv->child_count);
+ data_lock = alloca0 (priv->child_count);
+
+ ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name,
+ offset, size, data_lock);
+ {
+ if (ret < sink_count) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ if (type == AFR_SELFHEAL_DATA_DIFF &&
+ __afr_selfheal_data_checksums_match (frame, this, fd, source,
+ healed_sinks, offset, size)) {
+ ret = 0;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_data_read_write (frame, this, fd, source,
+ healed_sinks, offset, size,
+ replies);
+ }
+unlock:
+ afr_selfheal_uninodelk (frame, this, fd->inode, this->name,
+ offset, size, data_lock);
+ return ret;
}
-void
-afr_sh_data_fix (call_frame_t *frame, xlator_t *this)
-{
- int source = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
- sh->block_size = this->ctx->page_size;
- sh->file_size = sh->buf[source].ia_size;
-
- if (FILE_HAS_HOLES (&sh->buf[source]))
- sh->file_has_holes = 1;
-
- if (sh->background && sh->unwind && !sh->unwound) {
- sh->unwind (sh->orig_frame, this, sh->op_ret, sh->op_errno,
- sh->op_failed);
- sh->unwound = _gf_true;
- }
- afr_sh_mark_source_sinks (frame, this);
- if (sh->active_sinks == 0) {
- gf_log (this->name, GF_LOG_INFO,
- "no active sinks for performing self-heal on file %s",
- local->loc.path);
- afr_sh_data_finish (frame, this);
- return;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "self-healing file %s from subvolume %s to %d other",
- local->loc.path, priv->children[sh->source]->name,
- sh->active_sinks);
- afr_sh_data_trim_sinks (frame, this);
-}
-int
-afr_sh_data_fxattrop_fstat_done (call_frame_t *frame, xlator_t *this)
+static int
+afr_selfheal_data_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *healed_sinks)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int nsources = 0;
- int ret = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- gf_log (this->name, GF_LOG_DEBUG, "Pending matrix for: %s",
- lkowner_utoa (&frame->root->lk_owner));
-
- nsources = afr_build_sources (this, sh->xattr, sh->buf, sh->pending_matrix,
- sh->sources, sh->success_children,
- AFR_DATA_TRANSACTION, NULL, _gf_true);
- if ((nsources == -1)
- && (priv->favorite_child != -1)
- && (sh->child_errno[priv->favorite_child] == 0)) {
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Picking favorite child %s as authentic source to "
- "resolve conflicting data of %s",
- priv->children[priv->favorite_child]->name,
- local->loc.path);
-
- sh->sources[priv->favorite_child] = 1;
-
- nsources = afr_sh_source_count (sh->sources,
- priv->child_count);
- }
-
- if (nsources == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Unable to self-heal contents of '%s' (possible "
- "split-brain). Please delete the file from all but "
- "the preferred subvolume.", local->loc.path);
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
- sh->data_spb = _gf_true;
- afr_sh_data_fail (frame, this);
- return 0;
- }
+ local = frame->local;
+ priv = this->private;
- sh->data_spb = _gf_false;
- ret = afr_sh_inode_set_read_ctx (sh, this);
- if (ret) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No active sources found.");
+ AFR_ONLIST (healed_sinks, frame, attr_cbk, fsync, fd, 0, NULL);
- afr_sh_data_fail (frame, this);
- return 0;
- }
-
- if (sh->sync_done) {
- afr_sh_data_setattr (frame, this);
- } else {
- if (nsources == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No self-heal needed for %s",
- local->loc.path);
-
- afr_sh_data_finish (frame, this);
- return 0;
- }
-
- if (sh->do_data_self_heal &&
- afr_data_self_heal_enabled (priv->data_self_heal))
- afr_sh_data_fix (frame, this);
- else
- afr_sh_data_finish (frame, this);
- }
- return 0;
+ for (i = 0; i < priv->child_count; i++)
+ if (healed_sinks[i] && local->replies[i].op_ret != 0)
+ /* fsync() failed. Do NOT consider this server
+ as successfully healed. Mark it so.
+ */
+ healed_sinks[i] = 0;
+ return 0;
}
-int
-afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,
- dict_t **xattr,
- afr_transaction_type txn_type,
- uuid_t gfid)
-{
- afr_private_t *priv = NULL;
- int read_child = -1;
- int32_t **pending_matrix = NULL;
- int32_t *sources = NULL;
- int32_t *success_children = NULL;
- struct iatt *bufs = NULL;
- int32_t nsources = 0;
- int32_t prev_read_child = -1;
- int32_t config_read_child = -1;
- int32_t subvol_status = 0;
-
- priv = this->private;
- bufs = local->cont.lookup.bufs;
- success_children = local->cont.lookup.success_children;
-
- pending_matrix = local->cont.lookup.pending_matrix;
- sources = local->cont.lookup.sources;
- memset (sources, 0, sizeof (*sources) * priv->child_count);
-
- nsources = afr_build_sources (this, xattr, bufs, pending_matrix,
- sources, success_children, txn_type,
- &subvol_status, _gf_false);
- if (subvol_status & SPLIT_BRAIN) {
- gf_log (this->name, GF_LOG_DEBUG, "%s: Possible split-brain",
- local->loc.path);
- switch (txn_type) {
- case AFR_DATA_TRANSACTION:
- local->cont.lookup.possible_spb = _gf_true;
- nsources = 1;
- sources[success_children[0]] = 1;
- break;
- case AFR_ENTRY_TRANSACTION:
- read_child = afr_get_no_xattr_dir_read_child (this,
- success_children,
- bufs);
- sources[read_child] = 1;
- nsources = 1;
- break;
- default:
- break;
- }
- }
- if (nsources < 0)
- goto out;
-
- prev_read_child = local->read_child_index;
- config_read_child = priv->read_child;
- read_child = afr_select_read_child_from_policy (success_children,
- priv->child_count,
- prev_read_child,
- config_read_child,
- sources,
- priv->hash_mode, gfid);
-out:
- gf_log (this->name, GF_LOG_DEBUG, "returning read_child: %d",
- read_child);
- return read_child;
-}
-int
-afr_sh_data_fstat_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iatt *buf, dict_t *xdata)
+static int
+afr_selfheal_data_restore_time (call_frame_t *frame, xlator_t *this,
+ inode_t *inode, int source,
+ unsigned char *healed_sinks,
+ struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "fstat of %s on %s succeeded",
- local->loc.path,
- priv->children[child_index]->name);
-
- sh->buf[child_index] = *buf;
- sh->success_children[sh->success_count] = child_index;
- sh->success_count++;
- } else {
- gf_log (this->name, GF_LOG_ERROR, "%s: fstat failed "
- "on %s, reason %s", local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->child_errno[child_index] = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- /* Previous versions of glusterfs might have set
- * the pending data xattrs which need to be erased
- */
- if (!afr_sh_data_proceed (sh->success_count)) {
- gf_log (this->name, GF_LOG_ERROR, "inspecting metadata "
- "succeeded on < %d children, aborting "
- "self-heal for %s", AFR_SH_MIN_PARTICIPANTS,
- local->loc.path);
- afr_sh_data_fail (frame, this);
- goto out;
- }
- afr_sh_data_fxattrop_fstat_done (frame, this);
- }
-out:
- return 0;
-}
+ loc_t loc = {0, };
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
-int
-afr_sh_data_fstat (call_frame_t *frame, xlator_t *this)
-{
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- int child = 0;
- int32_t *fstat_children = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- fstat_children = memdup (sh->success_children,
- sizeof (*fstat_children) * priv->child_count);
- if (!fstat_children) {
- afr_sh_data_fail (frame, this);
- goto out;
- }
- call_count = sh->success_count;
- local->call_count = call_count;
-
- memset (sh->buf, 0, sizeof (*sh->buf) * priv->child_count);
- afr_reset_children (sh->success_children, priv->child_count);
- sh->success_count = 0;
- for (i = 0; i < priv->child_count; i++) {
- child = fstat_children[i];
- if (child == -1)
- break;
- STACK_WIND_COOKIE (frame, afr_sh_data_fstat_cbk,
- (void *) (long) child,
- priv->children[child],
- priv->children[child]->fops->fstat,
- sh->healing_fd, NULL);
- --call_count;
- }
- GF_ASSERT (!call_count);
-out:
- GF_FREE (fstat_children);
- return 0;
-}
+ AFR_ONLIST (healed_sinks, frame, attr_cbk, setattr, &loc,
+ &replies[source].poststat,
+ (GF_SET_ATTR_ATIME|GF_SET_ATTR_MTIME), NULL);
-void
-afr_sh_common_fxattrop_resp_handler (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int child_index = (long) cookie;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "fxattrop of %s on %s succeeded",
- local->loc.path,
- priv->children[child_index]->name);
-
- sh->xattr[child_index] = dict_ref (xattr);
- sh->success_children[sh->success_count] = child_index;
- sh->success_count++;
- } else {
- gf_log (this->name, GF_LOG_ERROR, "fxattrop of %s "
- "failed on %s, reason %s", local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->child_errno[child_index] = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-}
+ loc_wipe (&loc);
-int
-afr_sh_data_fxattrop_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr, dict_t *xdata)
-{
- int call_count = -1;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- afr_sh_common_fxattrop_resp_handler (frame, cookie, this, op_ret,
- op_errno, xattr);
-
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- if (!afr_sh_data_proceed (sh->success_count)) {
- gf_log (this->name, GF_LOG_ERROR, "%s, inspecting "
- "change log succeeded on < %d children",
- local->loc.path, AFR_SH_MIN_PARTICIPANTS);
- afr_sh_data_fail (frame, this);
- goto out;
- }
- afr_sh_data_fstat (frame, this);
- }
-out:
- return 0;
+ return 0;
}
-
-int
-afr_sh_data_fxattrop (call_frame_t *frame, xlator_t *this)
+static int
+afr_data_self_heal_type_get (afr_private_t *priv, unsigned char *healed_sinks,
+ int source, struct afr_reply *replies)
{
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- dict_t *xattr_req = NULL;
- int32_t *zero_pending = NULL;
- int call_count = 0;
+ int type = AFR_SELFHEAL_DATA_FULL;
int i = 0;
- int ret = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- call_count = afr_up_children_count (local->child_up,
- priv->child_count);
-
- local->call_count = call_count;
-
- xattr_req = dict_new();
- if (!xattr_req) {
- ret = -1;
- goto out;
- }
- for (i = 0; i < priv->child_count; i++) {
- zero_pending = GF_CALLOC (3, sizeof (*zero_pending),
- gf_afr_mt_int32_t);
- if (!zero_pending) {
- ret = -1;
- goto out;
- }
- ret = dict_set_dynptr (xattr_req, priv->pending_key[i],
- zero_pending,
- 3 * sizeof (*zero_pending));
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "Unable to set dict value");
- goto out;
- } else {
- zero_pending = NULL;
- }
- }
-
- afr_reset_xattr (sh->xattr, priv->child_count);
- afr_reset_children (sh->success_children, priv->child_count);
- memset (sh->child_errno, 0,
- sizeof (*sh->child_errno) * priv->child_count);
- sh->success_count = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_sh_data_fxattrop_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- sh->healing_fd, GF_XATTROP_ADD_ARRAY,
- xattr_req, NULL);
-
- if (!--call_count)
+ if (priv->data_self_heal_algorithm == NULL) {
+ type = AFR_SELFHEAL_DATA_FULL;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i] && i != source)
+ continue;
+ if (replies[i].poststat.ia_size) {
+ type = AFR_SELFHEAL_DATA_DIFF;
break;
+ }
}
+ } else if (strcmp (priv->data_self_heal_algorithm, "full") == 0) {
+ type = AFR_SELFHEAL_DATA_FULL;
+ } else if (strcmp (priv->data_self_heal_algorithm, "diff") == 0) {
+ type = AFR_SELFHEAL_DATA_DIFF;
}
-
-out:
- if (xattr_req)
- dict_unref (xattr_req);
-
- if (ret) {
- GF_FREE (zero_pending);
- afr_sh_data_fail (frame, this);
- }
-
- return 0;
+ return type;
}
-int
-afr_sh_data_big_lock_success (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
+static int
+afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *replies)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ off_t off = 0;
+ size_t block = 128 * 1024;
+ int type = AFR_SELFHEAL_DATA_FULL;
+ int ret = -1;
+ call_frame_t *iter_frame = NULL;
+ char *sinks_str = NULL;
+ char *p = NULL;
+
+ priv = this->private;
+
+ sinks_str = alloca0 (priv->child_count * 8);
+ p = sinks_str;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+ p += sprintf (p, "%d ", i);
+ }
+
+ gf_log (this->name, GF_LOG_INFO, "performing data selfheal on %s. "
+ "source=%d sinks=%s",
+ uuid_utoa (fd->inode->gfid), source, sinks_str);
+
+ type = afr_data_self_heal_type_get (priv, healed_sinks, source,
+ replies);
+
+ iter_frame = afr_copy_frame (frame);
+ if (!iter_frame)
+ return -ENOMEM;
+
+ for (off = 0; off < replies[source].poststat.ia_size; off += block) {
+ ret = afr_selfheal_data_block (iter_frame, this, fd, source,
+ healed_sinks, off, block, type,
+ replies);
+ if (ret < 0)
+ goto out;
+
+ AFR_STACK_RESET (iter_frame);
+ }
+
+ afr_selfheal_data_restore_time (frame, this, fd->inode, source,
+ healed_sinks, replies);
+
+ ret = afr_selfheal_data_fsync (frame, this, fd, healed_sinks);
- sh->data_lock_held = _gf_true;
- afr_sh_data_fxattrop (frame, this);
- return 0;
+out:
+ if (iter_frame)
+ AFR_STACK_DESTROY (iter_frame);
+ return ret;
}
-int
-afr_sh_data_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Blocking data inodelks "
- "failed for %s. by %s",
- local->loc.path, lkowner_utoa (&frame->root->lk_owner));
+static int
+__afr_selfheal_truncate_sinks (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, unsigned char *healed_sinks,
+ struct afr_reply *replies, uint64_t size)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *larger_sinks = 0;
+ int i = 0;
- sh->data_lock_failure_handler (frame, this);
- } else {
+ local = frame->local;
+ priv = this->private;
- gf_log (this->name, GF_LOG_DEBUG, "Blocking data inodelks "
- "done for %s by %s. Proceding to self-heal",
- local->loc.path, lkowner_utoa (&frame->root->lk_owner));
+ larger_sinks = alloca0 (priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (healed_sinks[i] && replies[i].poststat.ia_size > size)
+ larger_sinks[i] = 1;
+ }
- sh->data_lock_success_handler (frame, this);
- }
+ AFR_ONLIST (larger_sinks, frame, attr_cbk, ftruncate, fd, size, NULL);
- return 0;
+ for (i = 0; i < priv->child_count; i++)
+ if (healed_sinks[i] && local->replies[i].op_ret == -1)
+ /* truncate() failed. Do NOT consider this server
+ as successfully healed. Mark it so.
+ */
+ healed_sinks[i] = 0;
+ return 0;
}
-int
-afr_sh_data_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks "
- "failed for %s. by %s",
- local->loc.path, lkowner_utoa (&frame->root->lk_owner));
-
- int_lock->lock_cbk = afr_sh_data_post_blocking_inodelk_cbk;
- afr_blocking_lock (frame, this);
- } else {
-
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking data inodelks "
- "done for %s by %s. Proceeding to self-heal",
- local->loc.path, lkowner_utoa (&frame->root->lk_owner));
- sh->data_lock_success_handler (frame, this);
- }
-
- return 0;
+/*
+ * If by chance there are multiple sources with differing sizes, select
+ * the largest file as the source.
+ *
+ * This can only happen if data was directly modified in the backend.
+ */
+static int
+__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t size = 0;
+ int source = -1;
+ int locked_count = 0;
+ int sources_count = 0;
+ int healed_sinks_count = 0;
+
+ priv = this->private;
+
+ locked_count = AFR_COUNT (locked_on, priv->child_count);
+ sources_count = AFR_COUNT (sources, priv->child_count);
+ healed_sinks_count = AFR_COUNT (healed_sinks, priv->child_count);
+
+ if (locked_count == healed_sinks_count || !sources_count) {
+ /* split brain */
+ return -EIO;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (size <= replies[i].poststat.ia_size) {
+ size = replies[i].poststat.ia_size;
+ source = i;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (replies[i].poststat.ia_size < size) {
+ sources[i] = 0;
+ sinks[i] = 1;
+ }
+ }
+
+ return source;
}
-int
-afr_sh_data_lock_rec (call_frame_t *frame, xlator_t *this, off_t start, off_t len)
+/*
+ * __afr_selfheal_data_prepare:
+ *
+ * This function inspects the on-disk xattrs and determines which subvols
+ * are sources and sinks.
+ *
+ * The return value is the index of the subvolume to be used as the source
+ * for self-healing, or -1 if no healing is necessary/split brain.
+ */
+static int
+__afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ struct afr_reply *replies)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
- local = frame->local;
- int_lock = &local->internal_lock;
+ priv = this->private;
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_DATA_SELF_HEAL_LK;
+ ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid,
+ replies);
+ if (ret)
+ return ret;
- afr_set_lock_number (frame, this);
+ ret = afr_selfheal_find_direction (frame, this, replies,
+ AFR_DATA_TRANSACTION,
+ locked_on, sources, sinks);
+ if (ret)
+ return ret;
- int_lock->lk_flock.l_start = start;
- int_lock->lk_flock.l_len = len;
- int_lock->lk_flock.l_type = F_WRLCK;
- int_lock->lock_cbk = afr_sh_data_post_nonblocking_inodelk_cbk;
+ source = __afr_selfheal_data_finalize_source (this, sources, sinks,
+ healed_sinks, locked_on,
+ replies);
+ if (source < 0)
+ return -EIO;
- afr_nonblocking_inodelk (frame, this);
+ for (i = 0; i < priv->child_count; i++)
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
- return 0;
-}
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ healed_sinks[i] = sinks[i] && locked_on[i];
-int
-afr_post_sh_big_lock_success (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- GF_ASSERT (sh->old_loop_frame);
- sh_loop_finish (sh->old_loop_frame, this);
- sh->old_loop_frame = NULL;
- sh->data_lock_held = _gf_true;
- sh->sync_done = _gf_true;
- afr_sh_data_fxattrop (frame, this);
- return 0;
+ return source;
}
-int
-afr_post_sh_big_lock_failure (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- GF_ASSERT (sh->old_loop_frame);
- sh_loop_finish (sh->old_loop_frame, this);
- sh->old_loop_frame = NULL;
- afr_sh_set_timestamps (frame, this);
- return 0;
+static int
+__afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int source = -1;
+ gf_boolean_t compat = _gf_false;
+ unsigned char *compat_lock = NULL;
+
+ priv = this->private;
+
+ sources = alloca0 (priv->child_count);
+ sinks = alloca0 (priv->child_count);
+ healed_sinks = alloca0 (priv->child_count);
+ data_lock = alloca0 (priv->child_count);
+ compat_lock = alloca0 (priv->child_count);
+
+ locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0,
+ data_lock);
+ {
+ if (ret < 2) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_data_prepare (frame, this, fd, data_lock,
+ sources, sinks, healed_sinks,
+ locked_replies);
+ if (ret < 0)
+ goto unlock;
+
+ source = ret;
+
+ ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks,
+ locked_replies,
+ locked_replies[source].poststat.ia_size);
+ if (ret < 0)
+ goto unlock;
+
+ ret = 0;
+
+ /* Locking from (LLONG_MAX - 2) to (LLONG_MAX - 1) is for
+ compatibility with older self-heal clients which do not
+ hold a lock in the @priv->sh_domain domain to guard
+ against concurrent ongoing self-heals
+ */
+ afr_selfheal_inodelk (frame, this, fd->inode, this->name,
+ LLONG_MAX - 2, 1, compat_lock);
+ compat = _gf_true;
+ }
+unlock:
+ afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0,
+ data_lock);
+ if (ret < 0)
+ goto out;
+
+ ret = afr_selfheal_data_do (frame, this, fd, source, healed_sinks,
+ locked_replies);
+ if (ret)
+ goto out;
+
+ ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks,
+ healed_sinks, AFR_DATA_TRANSACTION,
+ locked_replies, data_lock);
+out:
+ if (compat)
+ afr_selfheal_uninodelk (frame, this, fd->inode, this->name,
+ LLONG_MAX - 2, 1, compat_lock);
+ return ret;
}
-int
-afr_sh_data_lock (call_frame_t *frame, xlator_t *this,
- off_t start, off_t len,
- afr_lock_cbk_t success_handler,
- afr_lock_cbk_t failure_handler)
+static fd_t *
+afr_selfheal_data_open (xlator_t *this, inode_t *inode)
{
- afr_local_t * local = NULL;
- afr_self_heal_t * sh = NULL;
+ loc_t loc = {0,};
+ int ret = 0;
+ fd_t *fd = NULL;
- local = frame->local;
- sh = &local->self_heal;
+ fd = fd_create (inode, 0);
+ if (!fd)
+ return NULL;
- sh->data_lock_success_handler = success_handler;
- sh->data_lock_failure_handler = failure_handler;
- return afr_sh_data_lock_rec (frame, this, start, len);
-}
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
-int
-afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- /* TODO: some of the open's might fail.
- In that case, modify cleanup fn to send flush on those
- fd's which are already open */
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "open of %s failed on child %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "open of %s succeeded on child %s",
- local->loc.path,
- priv->children[child_index]->name);
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (sh->op_failed) {
- afr_sh_data_fail (frame, this);
- return 0;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "fd for %s opened, commencing sync",
- local->loc.path);
+ ret = syncop_open (this, &loc, O_RDWR|O_LARGEFILE, fd);
+ if (ret) {
+ fd_unref (fd);
+ fd = NULL;
+ } else {
+ fd_bind (fd);
+ }
- afr_sh_data_lock (frame, this, 0, 0,
- afr_sh_data_big_lock_success,
- afr_sh_data_fail);
- }
+ loc_wipe (&loc);
- return 0;
+ return fd;
}
int
-afr_sh_data_open (call_frame_t *frame, xlator_t *this)
-{
- int i = 0;
- int call_count = 0;
- fd_t *fd = NULL;
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- call_count = afr_up_children_count (local->child_up, priv->child_count);
- local->call_count = call_count;
-
- fd = fd_create (local->loc.inode, frame->root->pid);
- sh->healing_fd = fd;
-
- /* open sinks */
- for (i = 0; i < priv->child_count; i++) {
- if(!local->child_up[i])
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- &local->loc,
- O_RDWR|O_LARGEFILE, fd, NULL);
-
- if (!--call_count)
- break;
- }
-
- return 0;
-}
-
-void
-afr_sh_non_reg_fix (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- afr_private_t *priv = NULL;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- int i = 0;
-
- if (op_ret < 0) {
- afr_sh_data_fail (frame, this);
- return;
- }
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ int ret = 0;
+ fd_t *fd = NULL;
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
+ priv = this->private;
- for (i = 0; i < priv->child_count ; i++) {
- if (1 == local->child_up[i])
- sh->success[i] = 1;
- }
+ fd = afr_selfheal_data_open (this, inode);
+ if (!fd)
+ return -EIO;
- afr_sh_erase_pending (frame, this, AFR_DATA_TRANSACTION,
- afr_sh_data_erase_pending_cbk,
- afr_sh_data_finish);
-}
+ locked_on = alloca0 (priv->child_count);
-int
-afr_sh_non_reg_lock_success (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- sh->data_lock_held = _gf_true;
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_non_reg_fix, NULL,
- AFR_LOOKUP_FAIL_CONFLICTS |
- AFR_LOOKUP_FAIL_MISSING_GFIDS,
- NULL);
- return 0;
-}
+ ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0,
+ locked_on);
+ {
+ if (ret < 2) {
+ /* Either less than two subvols available, or another
+ selfheal (from another server) is in progress. Skip
+ for now in any case there isn't anything to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
+ }
-gf_boolean_t
-afr_can_start_data_self_heal (afr_self_heal_t *sh, afr_private_t *priv)
-{
- if (sh->force_confirm_spb)
- return _gf_true;
- if (sh->do_data_self_heal &&
- afr_data_self_heal_enabled (priv->data_self_heal))
- return _gf_true;
- return _gf_false;
-}
+ ret = __afr_selfheal_data (frame, this, fd, locked_on);
+ }
+unlock:
+ afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on);
-int
-afr_self_heal_data (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = this->private;
-
- local = frame->local;
- sh = &local->self_heal;
-
- /* Self-heal completion cbk changes inode split-brain status based on
- * govinda_gOvinda, mdata_spb, data_spb value.
- * Initialize data_spb with current split-brain status.
- * If for some reason self-heal fails(locking phase etc), it makes sure
- * we retain the split-brain status before this self-heal started.
- */
- sh->data_spb = afr_is_split_brain (this, sh->inode);
- if (afr_can_start_data_self_heal (sh, priv)) {
- if (IA_ISREG (sh->type)) {
- afr_sh_data_open (frame, this);
- } else {
- afr_sh_data_lock (frame, this, 0, 0,
- afr_sh_non_reg_lock_success,
- afr_sh_data_fail);
- }
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "not doing data self heal on %s",
- local->loc.path);
- afr_sh_data_done (frame, this);
- }
+ if (fd)
+ fd_unref (fd);
- return 0;
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index 60c140c1a..9e714b026 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,2352 +8,622 @@
cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
-#include "glusterfs.h"
-#include "inode.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
+#include "afr-self-heal.h"
#include "byte-order.h"
-
#include "afr-transaction.h"
-#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-#define AFR_INIT_SH_FRAME_VALS(_frame, _local, _sh, _sh_frame, _sh_local, _sh_sh)\
- do {\
- _local = _frame->local;\
- _sh = &_local->self_heal;\
- _sh_frame = _sh->sh_frame;\
- _sh_local = _sh_frame->local;\
- _sh_sh = &_sh_local->self_heal;\
- } while (0);
-
-int
-afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this,
- int child_index);
-int
-afr_sh_entry_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- sh->completion_cbk (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->lock_cbk = afr_sh_entry_done;
- afr_unlock (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_finish (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- gf_log (this->name, GF_LOG_TRACE,
- "finishing entry selfheal of %s", local->loc.path);
-
- afr_sh_entry_unlock (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr, dict_t *xdata)
-{
- long i = 0;
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_local_t *orig_local = NULL;
- call_frame_t *orig_frame = NULL;
- afr_private_t *priv = NULL;
- int32_t read_child = -1;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
- i = (long)cookie;
-
-
- afr_children_add_child (sh->fresh_children, i, priv->child_count);
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "%s: failed to erase pending xattrs on %s (%s)",
- local->loc.path, priv->children[i]->name,
- strerror (op_errno));
- }
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (sh->source == -1) {
- //this happens if the forced merge option is set
- read_child = sh->fresh_children[0];
- } else {
- read_child = sh->source;
- }
- afr_inode_set_read_ctx (this, sh->inode, read_child,
- sh->fresh_children);
- orig_frame = sh->orig_frame;
- orig_local = orig_frame->local;
-
- if (sh->source != -1) {
- orig_local->cont.lookup.buf.ia_nlink = sh->buf[sh->source].ia_nlink;
- }
-
- afr_sh_entry_finish (frame, this);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- if (sh->entries_skipped) {
- sh->op_failed = _gf_true;
- goto out;
- }
- afr_sh_erase_pending (frame, this, AFR_ENTRY_TRANSACTION,
- afr_sh_entry_erase_pending_cbk,
- afr_sh_entry_finish);
- return 0;
-out:
- afr_sh_entry_finish (frame, this);
- return 0;
-}
-
static int
-next_active_source (call_frame_t *frame, xlator_t *this,
- int current_active_source)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int source = -1;
- int next_active_source = -1;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- source = sh->source;
-
- if (source != -1) {
- if (current_active_source != source)
- next_active_source = source;
- goto out;
- }
-
- /*
- the next active sink becomes the source for the
- 'conservative decision' of merging all entries
- */
-
- for (i = 0; i < priv->child_count; i++) {
- if ((sh->sources[i] == 0)
- && (local->child_up[i] == 1)
- && (i > current_active_source)) {
-
- next_active_source = i;
- break;
- }
- }
-out:
- return next_active_source;
-}
-
-
-
-static int
-next_active_sink (call_frame_t *frame, xlator_t *this,
- int current_active_sink)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int next_active_sink = -1;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- /*
- the next active sink becomes the source for the
- 'conservative decision' of merging all entries
- */
-
- for (i = 0; i < priv->child_count; i++) {
- if ((sh->sources[i] == 0)
- && (local->child_up[i] == 1)
- && (i > current_active_sink)) {
-
- next_active_sink = i;
- break;
- }
- }
-
- return next_active_sink;
-}
-
-int
-afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this);
-
-int
-afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src);
-
-int
-afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this,
- int active_src, int32_t op_ret,
- int32_t op_errno)
-{
- int call_count = 0;
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_entry_expunge_subvol (frame, this, active_src);
-
- return 0;
-}
-
-int
-afr_sh_entry_expunge_parent_setattr_cbk (call_frame_t *expunge_frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop,
- dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- call_frame_t *frame = NULL;
- int active_src = (long) cookie;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- local = frame->local;
- sh = &local->self_heal;
-
- if (op_ret != 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "setattr on parent directory of %s on subvolume %s failed: %s",
- expunge_local->loc.path,
- priv->children[active_src]->name, strerror (op_errno));
- }
-
- AFR_STACK_DESTROY (expunge_frame);
- sh->expunge_done (frame, this, active_src, op_ret, op_errno);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int active_src = 0;
- int32_t valid = 0;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
-
- active_src = (long) cookie;
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "removed %s on %s",
- expunge_local->loc.path,
- priv->children[active_src]->name);
- } else {
- gf_log (this->name, GF_LOG_INFO,
- "removing %s on %s failed (%s)",
- expunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- }
-
- valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_parent_setattr_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->setattr,
- &expunge_sh->parent_loc,
- &expunge_sh->parentbuf,
- valid, NULL);
-
- return 0;
-}
+afr_selfheal_entry_delete (call_frame_t *frame, xlator_t *this, inode_t *dir,
+ const char *name, inode_t *inode, int child,
+ struct afr_reply *replies)
+{
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ int ret = 0;
+ loc_t loc = {0, };
+ char g[64];
+
+ priv = this->private;
+
+ subvol = priv->children[child];
+
+ loc.parent = inode_ref (dir);
+ uuid_copy (loc.pargfid, dir->gfid);
+ loc.name = name;
+ loc.inode = inode_ref (inode);
+
+ if (replies[child].valid && replies[child].op_ret == 0) {
+ switch (replies[child].poststat.ia_type) {
+ case IA_IFDIR:
+ gf_log (this->name, GF_LOG_WARNING,
+ "expunging dir %s/%s (%s) on %s",
+ uuid_utoa (dir->gfid), name,
+ uuid_utoa_r (replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_rmdir (subvol, &loc, 1);
+ break;
+ default:
+ gf_log (this->name, GF_LOG_WARNING,
+ "expunging file %s/%s (%s) on %s",
+ uuid_utoa (dir->gfid), name,
+ uuid_utoa_r (replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_unlink (subvol, &loc);
+ break;
+ }
+ }
+
+ loc_wipe (&loc);
+
+ return ret;
+}
+
+
+int
+afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst,
+ int source, inode_t *dir, const char *name,
+ inode_t *inode, struct afr_reply *replies)
+{
+ int ret = 0;
+ loc_t loc = {0,};
+ loc_t srcloc = {0,};
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ struct iatt *iatt = NULL;
+ char *linkname = NULL;
+ mode_t mode = 0;
+ struct iatt newent = {0,};
+
+ priv = this->private;
+
+ xdata = dict_new();
+ if (!xdata)
+ return -ENOMEM;
+
+ loc.parent = inode_ref (dir);
+ uuid_copy (loc.pargfid, dir->gfid);
+ loc.name = name;
+ loc.inode = inode_ref (inode);
+
+ ret = afr_selfheal_entry_delete (frame, this, dir, name, inode, dst,
+ replies);
+ if (ret)
+ goto out;
+
+ ret = dict_set_static_bin (xdata, "gfid-req",
+ replies[source].poststat.ia_gfid, 16);
+ if (ret)
+ goto out;
+
+ iatt = &replies[source].poststat;
+
+ srcloc.inode = inode_ref (inode);
+ uuid_copy (srcloc.gfid, iatt->ia_gfid);
+
+ mode = st_mode_from_ia (iatt->ia_prot, iatt->ia_type);
+
+ switch (iatt->ia_type) {
+ case IA_IFDIR:
+ ret = syncop_mkdir (priv->children[dst], &loc, mode, xdata, 0);
+ break;
+ case IA_IFLNK:
+ ret = syncop_lookup (priv->children[dst], &srcloc, 0, 0, 0, 0);
+ if (ret == 0) {
+ ret = syncop_link (priv->children[dst], &srcloc, &loc);
+ } else {
+ ret = syncop_readlink (priv->children[source], &srcloc,
+ &linkname, 4096);
+ if (ret <= 0)
+ goto out;
+ ret = syncop_symlink (priv->children[dst], &loc, linkname,
+ xdata, NULL);
+ }
+ break;
+ default:
+ ret = dict_set_int32 (xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+ if (ret)
+ goto out;
+ ret = syncop_mknod (priv->children[dst], &loc, mode,
+ iatt->ia_rdev, xdata, &newent);
+ if (ret == 0 && iatt->ia_size && !newent.ia_size) {
+ /* New entry created. Mark @dst pending on all sources */
+ ret = 1;
+ }
+ break;
+ }
-
-int
-afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
-
- gf_log (this->name, GF_LOG_TRACE,
- "expunging file %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->unlink,
- &expunge_local->loc, 0, NULL);
-
- return 0;
-}
-
-
-
-int
-afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "expunging directory %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->rmdir,
- &expunge_local->loc, 1, NULL);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this,
- int active_src, struct iatt *buf,
- struct iatt *parentbuf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- call_frame_t *frame = NULL;
- int type = 0;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
- loc_t *loc = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- local = frame->local;
- sh = &local->self_heal;
- loc = &expunge_local->loc;
-
- type = buf->ia_type;
- if (loc->parent && uuid_is_null (loc->parent->gfid))
- uuid_copy (loc->pargfid, parentbuf->ia_gfid);
-
- switch (type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- case IA_IFLNK:
- afr_sh_entry_expunge_unlink (expunge_frame, this, active_src);
- break;
- case IA_IFDIR:
- afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src);
- break;
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "%s has unknown file type on %s: 0%o",
- expunge_local->loc.path,
- priv->children[active_src]->name, type);
- goto out;
- break;
- }
-
- return 0;
-out:
- AFR_STACK_DESTROY (expunge_frame);
- sh->expunge_done (frame, this, active_src, -1, EINVAL);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *x,
- struct iatt *postparent)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- call_frame_t *frame = NULL;
- int active_src = 0;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- active_src = (long) cookie;
- local = frame->local;
- sh = &local->self_heal;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "lookup of %s on %s failed (%s)",
- expunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- goto out;
- }
-
- afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf,
- postparent);
-
- return 0;
out:
- AFR_STACK_DESTROY (expunge_frame);
- sh->expunge_done (frame, this, active_src, op_ret, op_errno);
-
- return 0;
+ if (xdata)
+ dict_unref (xdata);
+ loc_wipe (&loc);
+ loc_wipe (&srcloc);
+ return ret;
}
-int
-afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
-
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s on %s",
- expunge_local->loc.path, priv->children[active_src]->name);
-
- STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->lookup,
- &expunge_local->loc, NULL);
-
- return 0;
-}
-
-int
-afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *x,
- struct iatt *postparent)
-{
- afr_private_t *priv = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int source = 0;
- call_frame_t *frame = NULL;
- int active_src = 0;
- int need_expunge = 0;
- afr_self_heal_t *sh = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- expunge_local = expunge_frame->local;
- expunge_sh = &expunge_local->self_heal;
- frame = expunge_sh->sh_frame;
- active_src = expunge_sh->active_source;
- source = (long) cookie;
- local = frame->local;
- sh = &local->self_heal;
-
- if (op_ret == -1 && op_errno == ENOENT)
- need_expunge = 1;
- else if (op_ret == -1)
- goto out;
-
- if (!uuid_is_null (expunge_sh->entrybuf.ia_gfid) &&
- !uuid_is_null (buf->ia_gfid) &&
- (uuid_compare (expunge_sh->entrybuf.ia_gfid, buf->ia_gfid) != 0)) {
- char uuidbuf1[64];
- char uuidbuf2[64];
- gf_log (this->name, GF_LOG_DEBUG,
- "entry %s found on %s with mismatching gfid (%s/%s)",
- expunge_local->loc.path,
- priv->children[source]->name,
- uuid_utoa_r (expunge_sh->entrybuf.ia_gfid, uuidbuf1),
- uuid_utoa_r (buf->ia_gfid, uuidbuf2));
- need_expunge = 1;
- }
-
- if (need_expunge) {
- gf_log (this->name, GF_LOG_INFO,
- "Entry %s is missing on %s and deleting from "
- "replica's other bricks",
- expunge_local->loc.path,
- priv->children[source]->name);
-
- if (postparent)
- expunge_sh->parentbuf = *postparent;
-
- afr_sh_entry_expunge_purge (expunge_frame, this, active_src);
-
- return 0;
- }
-
-out:
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "%s exists under %s",
- expunge_local->loc.path,
- priv->children[source]->name);
- } else {
- gf_log (this->name, GF_LOG_INFO,
- "looking up %s under %s failed (%s)",
- expunge_local->loc.path,
- priv->children[source]->name,
- strerror (op_errno));
- }
-
- AFR_STACK_DESTROY (expunge_frame);
- sh->expunge_done (frame, this, active_src, op_ret, op_errno);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this,
- gf_dirent_t *entry)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int ret = -1;
- call_frame_t *expunge_frame = NULL;
- afr_local_t *expunge_local = NULL;
- afr_self_heal_t *expunge_sh = NULL;
- int active_src = 0;
- int source = 0;
- int op_errno = 0;
- char *name = NULL;
- int op_ret = -1;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
- source = sh->source;
- sh->expunge_done = afr_sh_entry_expunge_entry_done;
-
- name = entry->d_name;
-
- if ((strcmp (name, ".") == 0)
- || (strcmp (name, "..") == 0)) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "skipping inspection of %s under %s",
- name, local->loc.path);
- op_ret = 0;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "inspecting existence of %s under %s",
- name, local->loc.path);
-
- expunge_frame = copy_frame (frame);
- if (!expunge_frame) {
- op_errno = ENOMEM;
- goto out;
- }
-
- AFR_LOCAL_ALLOC_OR_GOTO (expunge_local, out);
-
- expunge_frame->local = expunge_local;
- expunge_sh = &expunge_local->self_heal;
- expunge_sh->sh_frame = frame;
- expunge_sh->active_source = active_src;
- expunge_sh->entrybuf = entry->d_stat;
- loc_copy (&expunge_sh->parent_loc, &local->loc);
-
- ret = afr_build_child_loc (this, &expunge_local->loc, &local->loc,
- name);
- if (ret != 0) {
- op_errno = EINVAL;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "looking up %s on %s", expunge_local->loc.path,
- priv->children[source]->name);
-
- STACK_WIND_COOKIE (expunge_frame,
- afr_sh_entry_expunge_entry_cbk,
- (void *) (long) source,
- priv->children[source],
- priv->children[source]->fops->lookup,
- &expunge_local->loc, NULL);
-
- ret = 0;
-out:
- if (ret == -1)
- sh->expunge_done (frame, this, active_src, op_ret, op_errno);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- gf_dirent_t *entry = NULL;
- off_t last_offset = 0;
- int active_src = 0;
- int entry_count = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
-
- if (op_ret <= 0) {
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "readdir of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "readdir of %s on subvolume %s complete",
- local->loc.path,
- priv->children[active_src]->name);
- }
-
- afr_sh_entry_expunge_all (frame, this);
- return 0;
- }
-
- list_for_each_entry (entry, &entries->list, list) {
- last_offset = entry->d_off;
- entry_count++;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "readdir'ed %d entries from %s",
- entry_count, priv->children[active_src]->name);
-
- sh->offset = last_offset;
- local->call_count = entry_count;
-
- list_for_each_entry (entry, &entries->list, list) {
- afr_sh_entry_expunge_entry (frame, this, entry);
- }
-
- return 0;
-}
-
-int
-afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this,
- int active_src)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk,
- priv->children[active_src],
- priv->children[active_src]->fops->readdirp,
- sh->healing_fd, sh->block_size, sh->offset, NULL);
-
- return 0;
-}
-
-
-int
-afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this)
+static int
+afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, struct afr_reply *replies,
+ unsigned char *sources, unsigned char *newentry)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int active_src = -1;
+ int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int **changelog = NULL;
+ int idx = 0;
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
+ priv = this->private;
- sh->offset = 0;
+ idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
- if (sh->source == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sources for %s to expunge entries",
- local->loc.path);
- goto out;
- }
+ uuid_copy (inode->gfid, replies[source].poststat.ia_gfid);
- active_src = next_active_sink (frame, this, sh->active_source);
- sh->active_source = active_src;
+ changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS);
- if (sh->op_failed) {
- goto out;
- }
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
- if (active_src == -1) {
- /* completed creating missing files on all subvolumes */
- goto out;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!newentry[i])
+ continue;
+ changelog[i][idx] = hton32(1);
+ }
- gf_log (this->name, GF_LOG_TRACE,
- "expunging entries of %s on %s to other sinks",
- local->loc.path, priv->children[active_src]->name);
+ afr_set_pending_dict (priv, xattr, changelog);
- afr_sh_entry_expunge_subvol (frame, this, active_src);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ afr_selfheal_post_op (frame, this, inode, i, xattr);
+ }
- return 0;
-out:
- afr_sh_entry_impunge_all (frame, this);
- return 0;
-
-}
-
-
-int
-afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- int call_count = 0;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- if (op_ret < 0)
- sh->entries_skipped = _gf_true;
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- afr_sh_entry_impunge_subvol (frame, this);
-
- return 0;
+ dict_unref (xattr);
+ return ret;
}
-void
-afr_sh_entry_call_impunge_done (call_frame_t *impunge_frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *impunge_local = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *frame = NULL;
-
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
- AFR_STACK_DESTROY (impunge_frame);
- sh->impunge_done (frame, this, op_ret, op_errno);
+static int
+__afr_selfheal_heal_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, int source,
+ unsigned char *sources, unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
+{
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ unsigned char *newentry = NULL;
+
+ priv = this->private;
+ newentry = alloca0 (priv->child_count);
+
+ if (!replies[source].valid)
+ return -EIO;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+ if (replies[source].op_ret == -1 &&
+ replies[source].op_errno == ENOENT) {
+ ret = afr_selfheal_entry_delete (frame, this, fd->inode,
+ name, inode, i, replies);
+ } else {
+ if (!uuid_compare (replies[i].poststat.ia_gfid,
+ replies[source].poststat.ia_gfid))
+ continue;
+
+ ret = afr_selfheal_recreate_entry (frame, this, i, source,
+ fd->inode, name, inode,
+ replies);
+ if (ret > 0) {
+ newentry[i] = 1;
+ ret = 0;
+ }
+ }
+ if (ret < 0)
+ break;
+ }
+
+ if (AFR_COUNT (newentry, priv->child_count))
+ afr_selfheal_newentry_mark (frame, this, inode, source, replies,
+ sources, newentry);
+ return ret;
}
-int
-afr_sh_entry_impunge_setattr_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop,
- dict_t *xdata)
-{
- int call_count = 0;
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- int child_index = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- child_index = (long) cookie;
-
- if (op_ret == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "setattr done for %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
- } else {
- gf_log (this->name, GF_LOG_INFO,
- "setattr (%s) on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- }
-
- call_count = afr_frame_return (impunge_frame);
- if (call_count == 0) {
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- 0, op_errno);
- }
-
- return 0;
-}
-int
-afr_sh_entry_impunge_parent_setattr_cbk (call_frame_t *setattr_frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop,
- dict_t *xdata)
+static int
+__afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, unsigned char *sources,
+ unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies)
{
- int call_count = 0;
- afr_local_t *setattr_local = NULL;
-
- setattr_local = setattr_frame->local;
- if (op_ret != 0) {
- gf_log (this->name, GF_LOG_INFO,
- "setattr on parent directory (%s) failed: %s",
- setattr_local->loc.path, strerror (op_errno));
- }
-
- call_count = afr_frame_return (setattr_frame);
- if (call_count == 0)
- AFR_STACK_DESTROY (setattr_frame);
- return 0;
-}
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int source = -1;
-int
-afr_sh_entry_impunge_setattr (call_frame_t *impunge_frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_local_t *setattr_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *setattr_frame = NULL;
- int32_t valid = 0;
- int32_t op_errno = 0;
- int child_index = 0;
- int call_count = 0;
- int i = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "setting ownership of %s on %s to %d/%d",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- impunge_sh->entrybuf.ia_uid,
- impunge_sh->entrybuf.ia_gid);
-
- setattr_frame = copy_frame (impunge_frame);
- if (!setattr_frame) {
- op_errno = ENOMEM;
- goto out;
- }
- AFR_LOCAL_ALLOC_OR_GOTO (setattr_frame->local, out);
- setattr_local = setattr_frame->local;
- call_count = afr_errno_count (NULL, impunge_sh->child_errno,
- priv->child_count, 0);
- loc_copy (&setattr_local->loc, &impunge_sh->parent_loc);
- impunge_local->call_count = call_count;
- setattr_local->call_count = call_count;
- for (i = 0; i < priv->child_count; i++) {
- if (impunge_sh->child_errno[i])
- continue;
- valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
- STACK_WIND_COOKIE (setattr_frame,
- afr_sh_entry_impunge_parent_setattr_cbk,
- (void *) (long) i, priv->children[i],
- priv->children[i]->fops->setattr,
- &setattr_local->loc,
- &impunge_sh->parentbuf, valid, NULL);
-
- valid = GF_SET_ATTR_UID | GF_SET_ATTR_GID |
- GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
- STACK_WIND_COOKIE (impunge_frame,
- afr_sh_entry_impunge_setattr_cbk,
- (void *) (long) i, priv->children[i],
- priv->children[i]->fops->setattr,
- &impunge_local->loc,
- &impunge_sh->entrybuf, valid, NULL);
- call_count--;
- }
- GF_ASSERT (!call_count);
- return 0;
-out:
- if (setattr_frame)
- AFR_STACK_DESTROY (setattr_frame);
- afr_sh_entry_call_impunge_done (impunge_frame, this, 0, op_errno);
- return 0;
-}
+ priv = this->private;
-int
-afr_sh_entry_impunge_xattrop_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- dict_t *xattr, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- int child_index = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "%s: failed to perform xattrop on %s (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- goto out;
- }
-
- afr_sh_entry_impunge_setattr (impunge_frame, this);
- return 0;
-out:
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- -1, op_errno);
- return 0;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].op_ret == 0) {
+ source = i;
+ break;
+ }
+ }
-void
-afr_sh_prepare_new_entry_pending_matrix (int32_t **pending,
- int *child_errno,
- struct iatt *buf,
- unsigned int child_count)
-{
- int midx = 0;
- int idx = 0;
- int i = 0;
-
- midx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION);
- if (IA_ISDIR (buf->ia_type))
- idx = afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION);
- else if (IA_ISREG (buf->ia_type))
- idx = afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
- else
- idx = -1;
- for (i = 0; i < child_count; i++) {
- if (child_errno[i])
- continue;
- pending[i][midx] = hton32 (1);
- if (idx == -1)
- continue;
- pending[i][idx] = hton32 (1);
- }
-}
+ if (source == -1) {
+ /* entry got deleted in the mean time? */
+ return 0;
+ }
-int
-afr_sh_entry_impunge_perform_xattrop (call_frame_t *impunge_frame,
- xlator_t *this)
-{
- int active_src = 0;
- dict_t *xattr = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int32_t op_errno = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- active_src = impunge_sh->active_source;
-
- afr_sh_prepare_new_entry_pending_matrix (impunge_local->pending,
- impunge_sh->child_errno,
- &impunge_sh->entrybuf,
- priv->child_count);
- xattr = dict_new ();
- if (!xattr) {
- op_errno = ENOMEM;
- goto out;
- }
-
- afr_set_pending_dict (priv, xattr, impunge_local->pending, active_src,
- LOCAL_LAST);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_xattrop_cbk,
- (void *) (long) active_src,
- priv->children[active_src],
- priv->children[active_src]->fops->xattrop,
- &impunge_local->loc, GF_XATTROP_ADD_ARRAY, xattr, NULL);
-
- if (xattr)
- dict_unref (xattr);
- return 0;
-out:
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- -1, op_errno);
- return 0;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || !healed_sinks[i])
+ continue;
-int
-afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- int call_count = 0;
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- impunge_sh->child_errno[child_index] = op_errno;
- gf_log (this->name, GF_LOG_ERROR,
- "creation of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- } else {
- impunge_sh->child_errno[child_index] = 0;
- }
-
- call_count = afr_frame_return (impunge_frame);
- if (call_count == 0) {
- if (!afr_errno_count (NULL, impunge_sh->child_errno,
- priv->child_count, 0)) {
- // new_file creation failed every where
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- -1, op_errno);
- goto out;
- }
- afr_sh_entry_impunge_perform_xattrop (impunge_frame, this);
- }
-out:
- return 0;
-}
+ if (replies[i].op_errno != ENOENT)
+ continue;
-int
-afr_sh_entry_impunge_hardlink_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- int call_count = 0;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
-
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- if (IA_IFLNK == impunge_sh->entrybuf.ia_type) {
- //For symlinks impunge is attempted un-conditionally
- //So the file can already exist.
- if ((op_ret < 0) && (op_errno == EEXIST))
- op_ret = 0;
- }
-
- call_count = afr_frame_return (impunge_frame);
- if (call_count == 0)
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
-
- return 0;
-}
+ ret = afr_selfheal_recreate_entry (frame, this, i, source,
+ fd->inode, name, inode,
+ replies);
+ }
-int
-afr_sh_entry_impunge_hardlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- loc_t *loc = NULL;
- struct iatt *buf = NULL;
- loc_t oldloc = {0};
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- loc = &impunge_local->loc;
- buf = &impunge_sh->entrybuf;
-
- oldloc.inode = inode_ref (loc->inode);
- uuid_copy (oldloc.gfid, buf->ia_gfid);
- gf_log (this->name, GF_LOG_DEBUG, "linking missing file %s on %s",
- loc->path, priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_hardlink_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->link,
- &oldloc, loc, NULL);
- loc_wipe (&oldloc);
-
- return 0;
+ return ret;
}
-int
-afr_sh_nameless_lookup_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
-{
- if (op_ret < 0) {
- afr_sh_entry_impunge_create_file (impunge_frame, this,
- (long)cookie);
- } else {
- afr_sh_entry_impunge_hardlink (impunge_frame, this,
- (long)cookie);
- }
- return 0;
-}
-int
-afr_sh_entry_impunge_check_hardlink (call_frame_t *impunge_frame,
- xlator_t *this,
- int child_index, struct iatt *stbuf)
+static int
+__afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, int source,
+ unsigned char *sources, unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- call_frame_t *frame = NULL;
- afr_local_t *impunge_local = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- afr_self_heal_t *sh = NULL;
- loc_t *loc = NULL;
- dict_t *xattr_req = NULL;
- loc_t oldloc = {0};
- int ret = -1;
-
- priv = this->private;
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
- loc = &impunge_local->loc;
-
- xattr_req = dict_new ();
- if (!xattr_req)
- goto out;
- oldloc.inode = inode_ref (loc->inode);
- uuid_copy (oldloc.gfid, stbuf->ia_gfid);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_nameless_lookup_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->lookup,
- &oldloc, xattr_req);
- ret = 0;
-out:
- if (xattr_req)
- dict_unref (xattr_req);
- loc_wipe (&oldloc);
- if (ret)
- sh->impunge_done (frame, this, -1, ENOMEM);
- return 0;
-}
+ int ret = -1;
-int
-afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct iatt *stbuf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- dict_t *dict = NULL;
- int ret = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "creating missing file %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
-
- dict = dict_new ();
- if (!dict)
- gf_log (this->name, GF_LOG_ERROR, "Out of memory");
-
- GF_ASSERT (!uuid_is_null (stbuf->ia_gfid));
- ret = afr_set_dict_gfid (dict, stbuf->ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed",
- impunge_local->loc.path);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->mknod,
- &impunge_local->loc,
- st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type),
- makedev (ia_major (stbuf->ia_rdev),
- ia_minor (stbuf->ia_rdev)), 0, dict);
-
- if (dict)
- dict_unref (dict);
-
- return 0;
+ if (source < 0)
+ ret = __afr_selfheal_merge_dirent (frame, this, fd, name, inode,
+ sources, healed_sinks,
+ locked_on, replies);
+ else
+ ret = __afr_selfheal_heal_dirent (frame, this, fd, name, inode,
+ source, sources, healed_sinks,
+ locked_on, replies);
+ return ret;
}
-
-int
-afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct iatt *stbuf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- dict_t *dict = NULL;
-
- int ret = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- dict = dict_new ();
- if (!dict) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
- return 0;
- }
-
- GF_ASSERT (!uuid_is_null (stbuf->ia_gfid));
- ret = afr_set_dict_gfid (dict, stbuf->ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_INFO, "%s: gfid set failed",
- impunge_local->loc.path);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "creating missing directory %s on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->mkdir,
- &impunge_local->loc,
- st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type),
- 0, dict);
-
- if (dict)
- dict_unref (dict);
-
- return 0;
+static int
+afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *sources,
+ unsigned char *healed_sinks, char *name)
+{
+ afr_private_t *priv = NULL;
+ int ret = 0;
+ unsigned char *locked_on = NULL;
+ struct afr_reply *replies = NULL;
+ inode_t *inode = NULL;
+
+ priv = this->private;
+
+ locked_on = alloca0 (priv->child_count);
+
+ replies = alloca0 (priv->child_count * sizeof(*replies));
+
+ ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name,
+ name, locked_on);
+ {
+ if (ret < 2) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ inode = afr_selfheal_unlocked_lookup_on (frame, fd->inode, name,
+ replies, locked_on);
+ if (!inode) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_entry_dirent (frame, this, fd, name, inode,
+ source, sources, healed_sinks,
+ locked_on, replies);
+ }
+unlock:
+ afr_selfheal_unentrylk (frame, this, fd->inode, this->name, name,
+ locked_on);
+ if (inode)
+ inode_unref (inode);
+ return ret;
}
-int
-afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, const char *linkname)
+static int
+afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int child, int source, unsigned char *sources,
+ unsigned char *healed_sinks)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- dict_t *dict = NULL;
- struct iatt *buf = NULL;
- int ret = 0;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- buf = &impunge_local->cont.dir_fop.buf;
-
- dict = dict_new ();
- if (!dict) {
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- -1, ENOMEM);
- goto out;
- }
-
- GF_ASSERT (!uuid_is_null (buf->ia_gfid));
- ret = afr_set_dict_gfid (dict, buf->ia_gfid);
- if (ret)
- gf_log (this->name, GF_LOG_INFO,
- "%s: dict set gfid failed",
- impunge_local->loc.path);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "creating missing symlink %s -> %s on %s",
- impunge_local->loc.path, linkname,
- priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->symlink,
- linkname, &impunge_local->loc, 0, dict);
-
- if (dict)
- dict_unref (dict);
-out:
- return 0;
-}
+ int ret = 0;
+ gf_dirent_t entries;
+ gf_dirent_t *entry = NULL;
+ off_t offset = 0;
+ call_frame_t *iter_frame = NULL;
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ priv = this->private;
+ subvol = priv->children[child];
-int
-afr_sh_entry_impunge_symlink_unlink_cbk (call_frame_t *impunge_frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = -1;
- int call_count = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "unlink of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- goto out;
- }
-
- afr_sh_entry_impunge_symlink (impunge_frame, this, child_index,
- impunge_sh->linkname);
-
- return 0;
-out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
+ INIT_LIST_HEAD (&entries.list);
- if (call_count == 0)
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
+ iter_frame = afr_copy_frame (frame);
+ if (!iter_frame)
+ return -ENOMEM;
- return 0;
-}
+ while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) {
+ if (ret > 0)
+ ret = 0;
+ list_for_each_entry (entry, &entries.list, list) {
+ offset = entry->d_off;
+ if (!strcmp (entry->d_name, ".") ||
+ !strcmp (entry->d_name, ".."))
+ continue;
-int
-afr_sh_entry_impunge_symlink_unlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
+ if (__is_root_gfid (fd->inode->gfid) &&
+ !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR))
+ continue;
- priv = this->private;
- impunge_local = impunge_frame->local;
+ ret = afr_selfheal_entry_dirent (iter_frame, this, fd,
+ source, sources,
+ healed_sinks,
+ entry->d_name);
+ AFR_STACK_RESET (iter_frame);
- gf_log (this->name, GF_LOG_DEBUG,
- "unlinking symlink %s with wrong target on %s",
- impunge_local->loc.path,
- priv->children[child_index]->name);
+ if (ret)
+ break;
+ }
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_symlink_unlink_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->unlink,
- &impunge_local->loc, 0, NULL);
+ gf_dirent_free (&entries);
+ if (ret)
+ break;
+ }
- return 0;
+ AFR_STACK_DESTROY (iter_frame);
+ return ret;
}
-
-int
-afr_sh_entry_impunge_readlink_sink_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- const char *linkname, struct iatt *sbuf, dict_t *xdata)
+static int
+afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *sources,
+ unsigned char *healed_sinks,
+ struct afr_reply *locked_replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = -1;
- int call_count = -1;
- int active_src = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- active_src = impunge_sh->active_source;
-
- child_index = (long) cookie;
-
- if ((op_ret == -1) && (op_errno != ENOENT)) {
- gf_log (this->name, GF_LOG_INFO,
- "readlink of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- goto out;
- }
-
- /* symlink doesn't exist on the sink */
-
- if ((op_ret == -1) && (op_errno == ENOENT)) {
- afr_sh_entry_impunge_symlink (impunge_frame, this,
- child_index, impunge_sh->linkname);
- return 0;
- }
-
-
- /* symlink exists on the sink, so check if targets match */
-
- if (strcmp (linkname, impunge_sh->linkname) == 0) {
- /* targets match, nothing to do */
-
- goto out;
- } else {
- /*
- * Hah! Sneaky wolf in sheep's clothing!
- */
- afr_sh_entry_impunge_symlink_unlink (impunge_frame, this,
- child_index);
- return 0;
- }
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
-out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
+ priv = this->private;
- if (call_count == 0)
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
+ gf_log (this->name, GF_LOG_INFO, "performing entry selfheal on %s",
+ uuid_utoa (fd->inode->gfid));
- return 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (i != source && !healed_sinks[i])
+ continue;
+ ret = afr_selfheal_entry_do_subvol (frame, this, fd, i, source,
+ sources, healed_sinks);
+ if (ret)
+ break;
+ }
+ return ret;
}
-int
-afr_sh_entry_impunge_readlink_sink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
+static int
+__afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "checking symlink target of %s on %s",
- impunge_local->loc.path, priv->children[child_index]->name);
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_sink_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->readlink,
- &impunge_local->loc, 4096, NULL);
-
- return 0;
-}
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int source = -1;
+ int locked_count = 0;
+ int sources_count = 0;
+ int sinks_count = 0;
+ priv = this->private;
-int
-afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- const char *linkname, struct iatt *sbuf, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int child_index = -1;
- int call_count = -1;
- int active_src = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- active_src = impunge_sh->active_source;
-
- child_index = (long) cookie;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "readlink of %s on %s failed (%s)",
- impunge_local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- goto out;
- }
-
- impunge_sh->linkname = gf_strdup (linkname);
- afr_sh_entry_impunge_readlink_sink (impunge_frame, this, child_index);
-
- return 0;
+ locked_count = AFR_COUNT (locked_on, priv->child_count);
+ sources_count = AFR_COUNT (sources, priv->child_count);
+ sinks_count = AFR_COUNT (sinks, priv->child_count);
-out:
- LOCK (&impunge_frame->lock);
- {
- call_count = --impunge_local->call_count;
- }
- UNLOCK (&impunge_frame->lock);
+ if (locked_count == sinks_count || !sources_count) {
+ return -1;
+ }
- if (call_count == 0)
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source = i;
+ break;
+ }
+ }
- return 0;
+ return source;
}
-int
-afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this,
- int child_index, struct iatt *stbuf)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int active_src = -1;
-
- priv = this->private;
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- active_src = impunge_sh->active_source;
- impunge_local->cont.dir_fop.buf = *stbuf;
-
- STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk,
- (void *) (long) child_index,
- priv->children[active_src],
- priv->children[active_src]->fops->readlink,
- &impunge_local->loc, 4096, NULL);
-
- return 0;
-}
-
-int
-afr_sh_entry_impunge_create (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
-{
- call_frame_t *frame = NULL;
- afr_local_t *impunge_local = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- ia_type_t type = IA_INVAL;
- int active_src = 0;
- struct iatt *buf = NULL;
-
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
- active_src = impunge_sh->active_source;
- afr_update_loc_gfids (&impunge_local->loc, &impunge_sh->entrybuf,
- &impunge_sh->parentbuf);
-
- buf = &impunge_sh->entrybuf;
- type = buf->ia_type;
-
- switch (type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- case IA_IFLNK:
- afr_sh_entry_impunge_check_hardlink (impunge_frame, this,
- child_index, buf);
- break;
- case IA_IFDIR:
- afr_sh_entry_impunge_mkdir (impunge_frame, this,
- child_index, buf);
- break;
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "%s has unknown file type on %s: 0%o",
- impunge_local->loc.path,
- priv->children[active_src]->name, type);
- sh->impunge_done (frame, this, -1, EINVAL);
- break;
- }
-
- return 0;
-}
-
-int
-afr_sh_entry_impunge_create_file (call_frame_t *impunge_frame, xlator_t *this,
- int child_index)
+static int
+__afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ struct afr_reply *replies, int *source_p)
{
- call_frame_t *frame = NULL;
- afr_local_t *impunge_local = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- ia_type_t type = IA_INVAL;
- int active_src = 0;
- struct iatt *buf = NULL;
-
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
- active_src = impunge_sh->active_source;
- buf = &impunge_sh->entrybuf;
- type = buf->ia_type;
-
- switch (type) {
- case IA_IFSOCK:
- case IA_IFREG:
- case IA_IFBLK:
- case IA_IFCHR:
- case IA_IFIFO:
- afr_sh_entry_impunge_mknod (impunge_frame, this,
- child_index, buf);
- break;
- case IA_IFLNK:
- afr_sh_entry_impunge_readlink (impunge_frame, this,
- child_index, buf);
- break;
- default:
- gf_log (this->name, GF_LOG_ERROR,
- "%s has unknown file type on %s: 0%o",
- impunge_local->loc.path,
- priv->children[active_src]->name, type);
- sh->impunge_done (frame, this, -1, EINVAL);
- break;
- }
-
- return 0;
-}
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
-gf_boolean_t
-afr_sh_need_recreate (afr_self_heal_t *impunge_sh, unsigned int child,
- unsigned int child_count)
-{
- gf_boolean_t recreate = _gf_false;
+ priv = this->private;
- GF_ASSERT (impunge_sh->child_errno);
+ ret = afr_selfheal_unlocked_discover (frame, fd->inode, fd->inode->gfid,
+ replies);
+ if (ret)
+ return ret;
- if (child == impunge_sh->active_source)
- goto out;
+ ret = afr_selfheal_find_direction (frame, this, replies,
+ AFR_ENTRY_TRANSACTION,
+ locked_on, sources, sinks);
+ if (ret)
+ return ret;
- if (IA_IFLNK == impunge_sh->entrybuf.ia_type) {
- recreate = _gf_true;
- goto out;
- }
+ source = __afr_selfheal_entry_finalize_source (this, sources, sinks,
+ locked_on, replies);
+ if (source < 0) {
+ /* If source is < 0 (typically split-brain), we perform a
+ conservative merge of entries rather than erroring out */
+ }
+ *source_p = source;
- if (impunge_sh->child_errno[child] == ENOENT)
- recreate = _gf_true;
-out:
- return recreate;
-}
+ for (i = 0; i < priv->child_count; i++)
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
-unsigned int
-afr_sh_recreate_count (afr_self_heal_t *impunge_sh, int *sources,
- unsigned int child_count)
-{
- int count = 0;
- int i = 0;
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ healed_sinks[i] = sinks[i] && locked_on[i];
- for (i = 0; i < child_count; i++) {
- if (afr_sh_need_recreate (impunge_sh, i, child_count))
- count++;
- }
-
- return count;
+ return ret;
}
-int
-afr_sh_entry_call_impunge_recreate (call_frame_t *impunge_frame,
- xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *frame = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- unsigned int recreate_count = 0;
- int i = 0;
- int active_src = 0;
-
- priv = this->private;
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
- active_src = impunge_sh->active_source;
- impunge_sh->entrybuf = impunge_sh->buf[active_src];
- impunge_sh->parentbuf = impunge_sh->parentbufs[active_src];
- recreate_count = afr_sh_recreate_count (impunge_sh, sh->sources,
- priv->child_count);
- if (!recreate_count) {
- afr_sh_entry_call_impunge_done (impunge_frame, this, 0, 0);
- goto out;
- }
- impunge_local->call_count = recreate_count;
- for (i = 0; i < priv->child_count; i++) {
- if (!impunge_local->child_up[i]) {
- impunge_sh->child_errno[i] = ENOTCONN;
- continue;
- }
- if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count)) {
- impunge_sh->child_errno[i] = EEXIST;
- continue;
- }
- }
- for (i = 0; i < priv->child_count; i++) {
- if (!afr_sh_need_recreate (impunge_sh, i, priv->child_count))
- continue;
- (void)afr_sh_entry_impunge_create (impunge_frame, this, i);
- recreate_count--;
- }
- GF_ASSERT (!recreate_count);
-out:
- return 0;
-}
-void
-afr_sh_entry_common_lookup_done (call_frame_t *impunge_frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_private_t *priv = NULL;
- afr_local_t *impunge_local = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- call_frame_t *frame = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- unsigned int gfid_miss_count = 0;
- unsigned int children_up_count = 0;
- uuid_t gfid = {0};
- int active_src = 0;
-
- priv = this->private;
- AFR_INIT_SH_FRAME_VALS (impunge_frame, impunge_local, impunge_sh,
- frame, local, sh);
- active_src = impunge_sh->active_source;
-
- if (op_ret < 0)
- goto done;
- if (impunge_sh->child_errno[active_src]) {
- op_ret = -1;
- op_errno = impunge_sh->child_errno[active_src];
- goto done;
- }
-
- gfid_miss_count = afr_gfid_missing_count (this->name,
- impunge_sh->success_children,
- impunge_sh->buf, priv->child_count,
- impunge_local->loc.path);
- children_up_count = afr_up_children_count (impunge_local->child_up,
- priv->child_count);
- if ((gfid_miss_count == children_up_count) &&
- (children_up_count < priv->child_count)) {
- op_ret = -1;
- op_errno = ENODATA;
- gf_log (this->name, GF_LOG_ERROR, "Not all children are up, "
- "gfid should not be assigned in this state for %s",
- impunge_local->loc.path);
- goto done;
- }
-
- if (gfid_miss_count) {
- afr_update_gfid_from_iatts (gfid, impunge_sh->buf,
- impunge_sh->success_children,
- priv->child_count);
- if (uuid_is_null (gfid)) {
- sh->entries_skipped = _gf_true;
- gf_log (this->name, GF_LOG_INFO, "%s: Skipping entry "
- "self-heal because of gfid absence",
- impunge_local->loc.path);
- goto done;
- }
- afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc,
- afr_sh_entry_common_lookup_done, gfid,
- AFR_LOOKUP_FAIL_CONFLICTS |
- AFR_LOOKUP_FAIL_MISSING_GFIDS,
- NULL);
- } else {
- afr_sh_entry_call_impunge_recreate (impunge_frame, this);
- }
- return;
-done:
- afr_sh_entry_call_impunge_done (impunge_frame, this,
- op_ret, op_errno);
- return;
-}
-
-int
-afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this,
- gf_dirent_t *entry)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_self_heal_t *impunge_sh = NULL;
- int ret = -1;
- call_frame_t *impunge_frame = NULL;
- afr_local_t *impunge_local = NULL;
- int active_src = 0;
- int op_errno = 0;
- int op_ret = -1;
-
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
- sh->impunge_done = afr_sh_entry_impunge_entry_done;
-
- if ((strcmp (entry->d_name, ".") == 0)
- || (strcmp (entry->d_name, "..") == 0)) {
-
- gf_log (this->name, GF_LOG_TRACE,
- "skipping inspection of %s under %s",
- entry->d_name, local->loc.path);
- op_ret = 0;
- goto out;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "inspecting existence of %s under %s",
- entry->d_name, local->loc.path);
-
- ret = afr_impunge_frame_create (frame, this, active_src,
- &impunge_frame);
- if (ret) {
- op_errno = -ret;
- goto out;
- }
-
- impunge_local = impunge_frame->local;
- impunge_sh = &impunge_local->self_heal;
- ret = afr_build_child_loc (this, &impunge_local->loc, &local->loc,
- entry->d_name);
- loc_copy (&impunge_sh->parent_loc, &local->loc);
- if (ret != 0) {
- op_errno = ENOMEM;
- goto out;
- }
-
- afr_sh_common_lookup (impunge_frame, this, &impunge_local->loc,
- afr_sh_entry_common_lookup_done, NULL,
- AFR_LOOKUP_FAIL_CONFLICTS, NULL);
-
- op_ret = 0;
+static int
+__afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int source = -1;
+
+ priv = this->private;
+
+ sources = alloca0 (priv->child_count);
+ sinks = alloca0 (priv->child_count);
+ healed_sinks = alloca0 (priv->child_count);
+ data_lock = alloca0 (priv->child_count);
+
+ locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, NULL,
+ data_lock);
+ {
+ if (ret < 2) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_entry_prepare (frame, this, fd, data_lock,
+ sources, sinks, healed_sinks,
+ locked_replies, &source);
+ }
+unlock:
+ afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL,
+ data_lock);
+ if (ret < 0)
+ goto out;
+
+ ret = afr_selfheal_entry_do (frame, this, fd, source, sources,
+ healed_sinks, locked_replies);
+ if (ret)
+ goto out;
+
+ ret = afr_selfheal_undo_pending (frame, this, fd->inode, sources, sinks,
+ healed_sinks, AFR_ENTRY_TRANSACTION,
+ locked_replies, data_lock);
out:
- if (ret) {
- if (impunge_frame)
- AFR_STACK_DESTROY (impunge_frame);
- sh->impunge_done (frame, this, op_ret, op_errno);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- gf_dirent_t *entries, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- gf_dirent_t *entry = NULL;
- off_t last_offset = 0;
- int active_src = 0;
- int entry_count = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- active_src = sh->active_source;
-
- if (op_ret <= 0) {
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "readdir of %s on subvolume %s failed (%s)",
- local->loc.path,
- priv->children[active_src]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "readdir of %s on subvolume %s complete",
- local->loc.path,
- priv->children[active_src]->name);
- }
-
- afr_sh_entry_impunge_all (frame, this);
- return 0;
- }
-
- list_for_each_entry (entry, &entries->list, list) {
- last_offset = entry->d_off;
- entry_count++;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "readdir'ed %d entries from %s",
- entry_count, priv->children[active_src]->name);
-
- sh->offset = last_offset;
- local->call_count = entry_count;
-
- list_for_each_entry (entry, &entries->list, list) {
- afr_sh_entry_impunge_entry (frame, this, entry);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int32_t active_src = 0;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
- active_src = sh->active_source;
- gf_log (this->name, GF_LOG_DEBUG, "%s: readdir from offset %zd",
- local->loc.path, sh->offset);
-
- STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk,
- priv->children[active_src],
- priv->children[active_src]->fops->readdirp,
- sh->healing_fd, sh->block_size, sh->offset, NULL);
-
- return 0;
+ return ret;
}
-int
-afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this)
+static fd_t *
+afr_selfheal_data_opendir (xlator_t *this, inode_t *inode)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- int active_src = -1;
-
- priv = this->private;
- local = frame->local;
- sh = &local->self_heal;
-
- sh->offset = 0;
-
- active_src = next_active_source (frame, this, sh->active_source);
- sh->active_source = active_src;
-
- if (sh->op_failed) {
- afr_sh_entry_finish (frame, this);
- return 0;
- }
+ loc_t loc = {0,};
+ int ret = 0;
+ fd_t *fd = NULL;
- if (active_src == -1) {
- /* completed creating missing files on all subvolumes */
- afr_sh_entry_erase_pending (frame, this);
- return 0;
- }
+ fd = fd_create (inode, 0);
+ if (!fd)
+ return NULL;
- gf_log (this->name, GF_LOG_TRACE,
- "impunging entries of %s on %s to other sinks",
- local->loc.path, priv->children[active_src]->name);
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
- afr_sh_entry_impunge_subvol (frame, this);
+ ret = syncop_opendir (this, &loc, fd);
+ if (ret) {
+ fd_unref (fd);
+ fd = NULL;
+ } else {
+ fd_bind (fd);
+ }
- return 0;
+ loc_wipe (&loc);
+ return fd;
}
int
-afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- /* TODO: some of the open's might fail.
- In that case, modify cleanup fn to send flush on those
- fd's which are already open */
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "opendir of %s failed on child %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
- sh->op_failed = 1;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (sh->op_failed) {
- afr_sh_entry_finish (frame, this);
- return 0;
- }
- gf_log (this->name, GF_LOG_TRACE,
- "fd for %s opened, commencing sync",
- local->loc.path);
-
- sh->active_source = -1;
- afr_sh_entry_expunge_all (frame, this);
- }
-
- return 0;
-}
-
-
-int
-afr_sh_entry_open (call_frame_t *frame, xlator_t *this)
-{
- int i = 0;
- int call_count = 0;
-
- int source = -1;
- int *sources = NULL;
-
- fd_t *fd = NULL;
-
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = local->self_heal.source;
- sources = local->self_heal.sources;
-
- sh->block_size = priv->sh_readdir_size;
- sh->offset = 0;
-
- call_count = sh->active_sinks;
- if (source != -1)
- call_count++;
-
- local->call_count = call_count;
-
- fd = fd_create (local->loc.inode, frame->root->pid);
- sh->healing_fd = fd;
-
- if (source != -1) {
- gf_log (this->name, GF_LOG_TRACE,
- "opening directory %s on subvolume %s (source)",
- local->loc.path, priv->children[source]->name);
-
- /* open source */
- STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
- (void *) (long) source,
- priv->children[source],
- priv->children[source]->fops->opendir,
- &local->loc, fd, NULL);
- call_count--;
- }
-
- /* open sinks */
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i] || !local->child_up[i])
- continue;
-
- gf_log (this->name, GF_LOG_TRACE,
- "opening directory %s on subvolume %s (sink)",
- local->loc.path, priv->children[i]->name);
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ fd_t *fd = NULL;
+ int ret = 0;
- STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->opendir,
- &local->loc, fd, NULL);
+ priv = this->private;
- if (!--call_count)
- break;
- }
+ fd = afr_selfheal_data_opendir (this, inode);
+ if (!fd)
+ return -EIO;
- return 0;
-}
-
-
-int
-afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- afr_sh_mark_source_sinks (frame, this);
- if (source != -1)
- sh->success[source] = 1;
-
- if (sh->active_sinks == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "no active sinks for self-heal on dir %s",
- local->loc.path);
- afr_sh_entry_finish (frame, this);
- return 0;
- }
- if (source == -1 && sh->active_sinks < 2) {
- gf_log (this->name, GF_LOG_TRACE,
- "cannot sync with 0 sources and 1 sink on dir %s",
- local->loc.path);
- afr_sh_entry_finish (frame, this);
- return 0;
- }
-
- if (source != -1)
- gf_log (this->name, GF_LOG_DEBUG,
- "self-healing directory %s from subvolume %s to "
- "%d other",
- local->loc.path, priv->children[source]->name,
- sh->active_sinks);
- else
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sources for %s found. "
- "merging all entries as a conservative decision",
- local->loc.path);
-
- afr_sh_entry_open (frame, this);
-
- return 0;
-}
-
-
-void
-afr_sh_entry_fix (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
- int nsources = 0;
- int32_t subvol_status = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (op_ret < 0) {
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- afr_sh_entry_finish (frame, this);
- goto out;
- }
-
- if (sh->forced_merge) {
- sh->source = -1;
- goto heal;
- }
-
- nsources = afr_build_sources (this, sh->xattr, sh->buf,
- sh->pending_matrix, sh->sources,
- sh->success_children,
- AFR_ENTRY_TRANSACTION, &subvol_status,
- _gf_true);
- if ((subvol_status & ALL_FOOLS) ||
- (subvol_status & SPLIT_BRAIN)) {
- gf_log (this->name, GF_LOG_INFO, "%s: Performing conservative "
- "merge", local->loc.path);
- source = -1;
- memset (sh->sources, 0,
- sizeof (*sh->sources) * priv->child_count);
- } else if (nsources == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "No self-heal needed for %s",
- local->loc.path);
-
- afr_sh_entry_finish (frame, this);
- return;
- } else {
- source = afr_sh_select_source (sh->sources, priv->child_count);
- }
-
- sh->source = source;
-
- afr_reset_children (sh->fresh_children, priv->child_count);
- afr_get_fresh_children (sh->success_children, sh->sources,
- sh->fresh_children, priv->child_count);
- if (sh->source >= 0)
- afr_inode_set_read_ctx (this, sh->inode, sh->source,
- sh->fresh_children);
-
-heal:
- afr_sh_entry_sync_prepare (frame, this);
-out:
- return;
-}
-
-int
-afr_sh_post_nonblocking_entry_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- sh = &local->self_heal;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Non Blocking entrylks "
- "failed for %s.", local->loc.path);
- sh->op_failed = 1;
- afr_sh_entry_done (frame, this);
- } else {
-
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking entrylks done "
- "for %s. Proceeding to FOP", local->loc.path);
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_entry_fix, NULL,
- AFR_LOOKUP_FAIL_CONFLICTS |
- AFR_LOOKUP_FAIL_MISSING_GFIDS,
- NULL);
- }
-
- return 0;
-}
-
-int
-afr_self_heal_entry (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ locked_on = alloca0 (priv->child_count);
+ ret = afr_selfheal_tryentrylk (frame, this, inode, priv->sh_domain, NULL,
+ locked_on);
+ {
+ if (ret < 2) {
+ /* Either less than two subvols available, or another
+ selfheal (from another server) is in progress. Skip
+ for now in any case there isn't anything to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
+ }
- priv = this->private;
- local = frame->local;
+ ret = __afr_selfheal_entry (frame, this, fd, locked_on);
+ }
+unlock:
+ afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on);
- if (local->self_heal.do_entry_self_heal && priv->entry_self_heal) {
- afr_sh_entrylk (frame, this, &local->loc, NULL,
- afr_sh_post_nonblocking_entry_cbk);
- } else {
- gf_log (this->name, GF_LOG_TRACE,
- "proceeding to completion on %s",
- local->loc.path);
- afr_sh_entry_done (frame, this);
- }
+ if (fd)
+ fd_unref (fd);
- return 0;
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 6e98f2a63..83628297f 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,539 +8,273 @@
cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
-#include <unistd.h>
-#include <fnmatch.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <signal.h>
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
-#include "glusterfs.h"
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-
-#include "afr-transaction.h"
#include "afr-self-heal.h"
-#include "afr-self-heal-common.h"
-
-
-int
-afr_sh_metadata_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
-
- local = frame->local;
- sh = &local->self_heal;
-
- afr_sh_reset (frame, this);
- if (IA_ISDIR (sh->type)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "proceeding to entry check on %s",
- local->loc.path);
- afr_self_heal_entry (frame, this);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "proceeding to data check on %s",
- local->loc.path);
- afr_self_heal_data (frame, this);
- }
-
- return 0;
-}
-
-int
-afr_sh_inode_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->lock_cbk = afr_sh_metadata_done;
- afr_unlock (frame, this);
-
- return 0;
-}
-
-int
-afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this)
-{
- afr_sh_inode_unlock (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xattr, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- int call_count = 0;
- long i = 0;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
- sh = &local->self_heal;
- i = (long)cookie;
-
- if ((!IA_ISREG (sh->buf[sh->source].ia_type)) &&
- (!IA_ISDIR (sh->buf[sh->source].ia_type))) {
- afr_children_add_child (sh->fresh_children, i,
- priv->child_count);
- }
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if ((!IA_ISREG (sh->buf[sh->source].ia_type)) &&
- (!IA_ISDIR (sh->buf[sh->source].ia_type))) {
- afr_inode_set_read_ctx (this, sh->inode, sh->source,
- sh->fresh_children);
- }
- afr_sh_metadata_finish (frame, this);
- }
-
- return 0;
-}
-
-int
-afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this)
-{
- afr_sh_erase_pending (frame, this, AFR_METADATA_TRANSACTION,
- afr_sh_metadata_erase_pending_cbk,
- afr_sh_metadata_finish);
- return 0;
-}
-
-
-int
-afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int child_index = 0;
-
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_INFO,
- "setting attributes failed for %s on %s (%s)",
- local->loc.path,
- priv->children[child_index]->name,
- strerror (op_errno));
-
- sh->success[child_index] = 0;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0)
- afr_sh_metadata_erase_pending (frame, this);
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop, dict_t *xdata)
-{
- afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata);
-
- return 0;
-}
-
-
-int
-afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno, xdata);
-
- return 0;
-}
+#include "byte-order.h"
+#define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE)
int
-afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr)
+afr_selfheal_metadata_do (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *locked_replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
- int active_sinks = 0;
- int call_count = 0;
- int i = 0;
-
- struct iatt stbuf = {0,};
- int32_t valid = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
- active_sinks = sh->active_sinks;
-
- /*
- * 2 calls per sink - setattr, setxattr
- */
- if (xattr)
- call_count = active_sinks * 2;
- else
- call_count = active_sinks;
-
- local->call_count = call_count;
-
- stbuf.ia_atime = sh->buf[source].ia_atime;
- stbuf.ia_atime_nsec = sh->buf[source].ia_atime_nsec;
- stbuf.ia_mtime = sh->buf[source].ia_mtime;
- stbuf.ia_mtime_nsec = sh->buf[source].ia_mtime_nsec;
-
- stbuf.ia_uid = sh->buf[source].ia_uid;
- stbuf.ia_gid = sh->buf[source].ia_gid;
-
- stbuf.ia_type = sh->buf[source].ia_type;
- stbuf.ia_prot = sh->buf[source].ia_prot;
-
- valid = GF_SET_ATTR_MODE |
- GF_SET_ATTR_UID | GF_SET_ATTR_GID |
- GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
-
- for (i = 0; i < priv->child_count; i++) {
- if (call_count == 0) {
- break;
- }
- if (sh->sources[i] || !local->child_up[i])
- continue;
-
- gf_log (this->name, GF_LOG_DEBUG,
- "self-healing metadata of %s from %s to %s",
- local->loc.path, priv->children[source]->name,
- priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_setattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setattr,
- &local->loc, &stbuf, valid, NULL);
-
- call_count--;
-
- if (!xattr)
- continue;
-
- STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc, xattr, 0, NULL);
- call_count--;
- }
-
- return 0;
+ int ret = -1;
+ loc_t loc = {0,};
+ dict_t *xattr = NULL;
+ dict_t *old_xattr = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
+
+ gf_log (this->name, GF_LOG_INFO, "performing metadata selfheal on %s",
+ uuid_utoa (inode->gfid));
+
+ ret = syncop_getxattr (priv->children[source], &loc, &xattr, NULL);
+ if (ret < 0) {
+ loc_wipe (&loc);
+ return -EIO;
+ }
+
+ afr_filter_xattrs (xattr);
+ dict_del (xattr, GF_SELINUX_XATTR_KEY);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+
+ ret = syncop_setattr (priv->children[i], &loc,
+ &locked_replies[source].poststat,
+ AFR_HEAL_ATTR, NULL, NULL);
+ if (ret)
+ healed_sinks[i] = 0;
+
+ old_xattr = NULL;
+ ret = syncop_getxattr (priv->children[i], &loc, &old_xattr, 0);
+ if (old_xattr) {
+ dict_del (old_xattr, GF_SELINUX_XATTR_KEY);
+ afr_filter_xattrs (old_xattr);
+ ret = syncop_removexattr (priv->children[i], &loc, "",
+ old_xattr);
+ }
+
+ ret = syncop_setxattr (priv->children[i], &loc, xattr, 0);
+ if (ret)
+ healed_sinks[i] = 0;
+ }
+
+ loc_wipe (&loc);
+ if (xattr)
+ dict_unref (xattr);
+
+ return 0;
}
-int
-afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr,
- dict_t *xdata)
+/*
+ * Look for mismatching uid/gid or mode even if xattrs don't say so, and
+ * pick one arbitrarily as winner.
+ */
+
+static int
+__afr_selfheal_metadata_finalize_source (xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
-
- int i;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- if (op_ret == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "getxattr of %s failed on subvolume %s (%s). proceeding without xattr",
- local->loc.path, priv->children[source]->name,
- strerror (op_errno));
-
- afr_sh_metadata_sync (frame, this, NULL);
- } else {
- for (i = 0; i < priv->child_count; i++) {
- dict_del (xattr, priv->pending_key[i]);
- }
-
- afr_sh_metadata_sync (frame, this, xattr);
- }
-
- return 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ struct iatt first = {0, };
+ int source = -1;
+ int locked_count = 0;
+ int sources_count = 0;
+ int sinks_count = 0;
+
+ priv = this->private;
+
+ locked_count = AFR_COUNT (locked_on, priv->child_count);
+ sources_count = AFR_COUNT (sources, priv->child_count);
+ sinks_count = AFR_COUNT (sinks, priv->child_count);
+
+ if (locked_count == sinks_count || !sources_count) {
+ if (!priv->metadata_splitbrain_forced_heal) {
+ return -EIO;
+ }
+ /* Metadata split brain, select one subvol
+ arbitrarily */
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i] && sinks[i]) {
+ sources[i] = 1;
+ sinks[i] = 0;
+ break;
+ }
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (source == -1) {
+ source = i;
+ first = replies[i].poststat;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (!IA_EQUAL (first, replies[i].poststat, type) ||
+ !IA_EQUAL (first, replies[i].poststat, uid) ||
+ !IA_EQUAL (first, replies[i].poststat, gid) ||
+ !IA_EQUAL (first, replies[i].poststat, prot)) {
+ sources[i] = 0;
+ sinks[i] = 1;
+ }
+ }
+
+ return source;
}
-int
-afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this)
+static int
+__afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int source = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- source = sh->source;
-
- afr_sh_mark_source_sinks (frame, this);
- if (sh->active_sinks == 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no active sinks for performing self-heal on file %s",
- local->loc.path);
- afr_sh_metadata_finish (frame, this);
- return 0;
- }
-
- gf_log (this->name, GF_LOG_TRACE,
- "syncing metadata of %s from subvolume %s to %d active sinks",
- local->loc.path, priv->children[source]->name,
- sh->active_sinks);
-
- STACK_WIND (frame, afr_sh_metadata_getxattr_cbk,
- priv->children[source],
- priv->children[source]->fops->getxattr,
- &local->loc, NULL, NULL);
-
- return 0;
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+
+ ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
+ replies);
+ if (ret)
+ return ret;
+
+ ret = afr_selfheal_find_direction (frame, this, replies,
+ AFR_METADATA_TRANSACTION,
+ locked_on, sources, sinks);
+ if (ret)
+ return ret;
+
+ source = __afr_selfheal_metadata_finalize_source (this, sources, sinks,
+ locked_on, replies);
+ if (source < 0)
+ return -EIO;
+
+ for (i = 0; i < priv->child_count; i++)
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ healed_sinks[i] = sinks[i] && locked_on[i];
+
+ return source;
}
-void
-afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno)
+static int
+__afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *locked_on)
{
- afr_local_t *local = NULL;
- afr_self_heal_t *sh = NULL;
- afr_private_t *priv = NULL;
- int nsources = 0;
- int source = 0;
- int i = 0;
-
- local = frame->local;
- sh = &local->self_heal;
- priv = this->private;
-
- if (op_ret < 0) {
- sh->op_failed = 1;
- afr_sh_set_error (sh, op_errno);
- afr_sh_metadata_finish (frame, this);
- goto out;
- }
- nsources = afr_build_sources (this, sh->xattr, sh->buf,
- sh->pending_matrix, sh->sources,
- sh->success_children,
- AFR_METADATA_TRANSACTION, NULL, _gf_false);
- if ((nsources == -1)
- && (priv->favorite_child != -1)
- && (sh->child_errno[priv->favorite_child] == 0)) {
-
- gf_log (this->name, GF_LOG_WARNING,
- "Picking favorite child %s as authentic source to resolve conflicting metadata of %s",
- priv->children[priv->favorite_child]->name,
- local->loc.path);
-
- sh->sources[priv->favorite_child] = 1;
-
- nsources = afr_sh_source_count (sh->sources,
- priv->child_count);
- }
-
- if (nsources == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "Unable to self-heal permissions/ownership of '%s' "
- "(possible split-brain). Please fix the file on "
- "all backend volumes", local->loc.path);
-
- sh->mdata_spb = _gf_true;
-
- afr_sh_metadata_finish (frame, this);
- goto out;
- }
-
- sh->mdata_spb = _gf_false;
- if (nsources == 0) {
- gf_log (this->name, GF_LOG_TRACE,
- "No self-heal needed for %s",
- local->loc.path);
-
- afr_sh_metadata_finish (frame, this);
- goto out;
- }
-
- source = afr_sh_select_source (sh->sources, priv->child_count);
-
- if (source == -1) {
- gf_log (this->name, GF_LOG_DEBUG,
- "No active sources found.");
-
- afr_sh_metadata_finish (frame, this);
- goto out;
- }
-
- sh->source = source;
-
- /* detect changes not visible through pending flags -- JIC */
- for (i = 0; i < priv->child_count; i++) {
- if (i == source || sh->child_errno[i])
- continue;
-
- if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source]))
- sh->sources[i] = 0;
-
- if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source]))
- sh->sources[i] = 0;
- }
-
- if ((!IA_ISREG (sh->buf[source].ia_type)) &&
- (!IA_ISDIR (sh->buf[source].ia_type))) {
- afr_reset_children (sh->fresh_children, priv->child_count);
- afr_get_fresh_children (sh->success_children, sh->sources,
- sh->fresh_children, priv->child_count);
- afr_inode_set_read_ctx (this, sh->inode, sh->source,
- sh->fresh_children);
- }
-
- if (sh->do_metadata_self_heal && priv->metadata_self_heal)
- afr_sh_metadata_sync_prepare (frame, this);
- else
- afr_sh_metadata_finish (frame, this);
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int source = -1;
+
+ priv = this->private;
+
+ sources = alloca0 (priv->child_count);
+ sinks = alloca0 (priv->child_count);
+ healed_sinks = alloca0 (priv->child_count);
+ data_lock = alloca0 (priv->child_count);
+
+ locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_inodelk (frame, this, inode, this->name,
+ LLONG_MAX - 1, 0, data_lock);
+ {
+ if (ret < 2) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_metadata_prepare (frame, this, inode, data_lock,
+ sources, sinks, healed_sinks,
+ locked_replies);
+ if (ret < 0)
+ goto unlock;
+
+ source = ret;
+ ret = 0;
+ }
+unlock:
+ afr_selfheal_uninodelk (frame, this, inode, this->name,
+ LLONG_MAX -1, 0, data_lock);
+ if (ret < 0)
+ goto out;
+
+ ret = afr_selfheal_metadata_do (frame, this, inode, source, healed_sinks,
+ locked_replies);
+ if (ret)
+ goto out;
+
+ ret = afr_selfheal_undo_pending (frame, this, inode, sources, sinks,
+ healed_sinks, AFR_METADATA_TRANSACTION,
+ locked_replies, data_lock);
out:
- return;
-}
-
-int
-afr_sh_metadata_post_nonblocking_inodelk_cbk (call_frame_t *frame,
- xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- if (int_lock->lock_op_ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata "
- "inodelks failed for %s.", local->loc.path);
- gf_log (this->name, GF_LOG_DEBUG, "Metadata self-heal "
- "failed for %s.", local->loc.path);
- afr_sh_metadata_done (frame, this);
- } else {
-
- gf_log (this->name, GF_LOG_DEBUG, "Non Blocking metadata "
- "inodelks done for %s. Proceeding to FOP",
- local->loc.path);
- afr_sh_common_lookup (frame, this, &local->loc,
- afr_sh_metadata_fix, NULL,
- AFR_LOOKUP_FAIL_CONFLICTS |
- AFR_LOOKUP_FAIL_MISSING_GFIDS,
- NULL);
- }
-
- return 0;
-}
-
-int
-afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->transaction_lk_type = AFR_SELFHEAL_LK;
- int_lock->selfheal_lk_type = AFR_METADATA_SELF_HEAL_LK;
-
- afr_set_lock_number (frame, this);
-
- int_lock->lk_flock.l_start = LLONG_MAX - 1;
- int_lock->lk_flock.l_len = 0;
- int_lock->lk_flock.l_type = F_WRLCK;
- int_lock->lock_cbk = afr_sh_metadata_post_nonblocking_inodelk_cbk;
-
- afr_nonblocking_inodelk (frame, this);
-
- return 0;
+ return ret;
}
-gf_boolean_t
-afr_can_start_metadata_self_heal (afr_self_heal_t *sh, afr_private_t *priv)
-{
- if (sh->force_confirm_spb)
- return _gf_true;
- if (sh->do_metadata_self_heal && priv->metadata_self_heal)
- return _gf_true;
- return _gf_false;
-}
int
-afr_self_heal_metadata (call_frame_t *frame, xlator_t *this)
+afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = this->private;
- afr_self_heal_t *sh = &local->self_heal;
-
- local = frame->local;
- sh = &local->self_heal;
-
- /* Self-heal completion cbk changes inode split-brain status based on
- * govinda_gOvinda, mdata_spb, data_spb value.
- * Initialize mdata_spb with current split-brain status.
- * If for some reason self-heal fails(locking phase etc), it makes sure
- * we retain the split-brain status before this self-heal started.
- */
- sh->mdata_spb = afr_is_split_brain (this, sh->inode);
-
- if (afr_can_start_metadata_self_heal (sh, priv)) {
- afr_sh_metadata_lock (frame, this);
- } else {
- afr_sh_metadata_done (frame, this);
- }
-
- return 0;
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ int ret = 0;
+
+ priv = this->private;
+
+ locked_on = alloca0 (priv->child_count);
+
+ ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0,
+ locked_on);
+ {
+ if (ret < 2) {
+ /* Either less than two subvols available, or another
+ selfheal (from another server) is in progress. Skip
+ for now in any case there isn't anything to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_metadata (frame, this, inode, locked_on);
+ }
+unlock:
+ afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on);
+
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
new file mode 100644
index 000000000..ce80b8da3
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -0,0 +1,457 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "afr.h"
+#include "afr-self-heal.h"
+
+
+int
+__afr_selfheal_assign_gfid (call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, inode_t *inode,
+ struct afr_reply *replies, int gfid_idx)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ int ret = 0;
+ loc_t loc = {0, };
+
+ priv = this->private;
+
+ uuid_copy (parent->gfid, pargfid);
+
+ xdata = dict_new ();
+ if (!xdata) {
+ return -ENOMEM;
+ }
+
+ ret = dict_set_static_bin (xdata, "gfid-req",
+ replies[gfid_idx].poststat.ia_gfid, 16);
+ if (ret) {
+ dict_destroy (xdata);
+ return -ENOMEM;
+ }
+
+ loc.parent = inode_ref (parent);
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.pargfid, pargfid);
+ loc.name = bname;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].op_ret == 0 || replies[i].op_errno != ENODATA)
+ continue;
+
+ ret = syncop_lookup (priv->children[i], &loc, xdata, 0, 0, 0);
+ }
+
+ loc_wipe (&loc);
+ dict_unref (xdata);
+
+ return ret;
+}
+
+
+int
+__afr_selfheal_name_impunge (call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, inode_t *inode,
+ struct afr_reply *replies, int gfid_idx)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
+
+ priv = this->private;
+
+ uuid_copy (parent->gfid, pargfid);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (uuid_compare (replies[i].poststat.ia_gfid,
+ replies[gfid_idx].poststat.ia_gfid) == 0)
+ continue;
+
+ ret |= afr_selfheal_recreate_entry (frame, this, i, gfid_idx,
+ parent, bname, inode, replies);
+ }
+
+ return ret;
+}
+
+
+int
+__afr_selfheal_name_expunge (call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, inode_t *inode,
+ struct afr_reply *replies)
+{
+ loc_t loc = {0, };
+ int i = 0;
+ afr_private_t *priv = NULL;
+ char g[64];
+ int ret = 0;
+
+ priv = this->private;
+
+ loc.parent = inode_ref (parent);
+ uuid_copy (loc.pargfid, pargfid);
+ loc.name = bname;
+ loc.inode = inode_ref (inode);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret)
+ continue;
+
+ switch (replies[i].poststat.ia_type) {
+ case IA_IFDIR:
+ gf_log (this->name, GF_LOG_WARNING,
+ "expunging dir %s/%s (%s) on %s",
+ uuid_utoa (pargfid), bname,
+ uuid_utoa_r (replies[i].poststat.ia_gfid, g),
+ priv->children[i]->name);
+ ret |= syncop_rmdir (priv->children[i], &loc, 1);
+ break;
+ default:
+ gf_log (this->name, GF_LOG_WARNING,
+ "expunging file %s/%s (%s) on %s",
+ uuid_utoa (pargfid), bname,
+ uuid_utoa_r (replies[i].poststat.ia_gfid, g),
+ priv->children[i]->name);
+ ret |= syncop_unlink (priv->children[i], &loc);
+ break;
+ }
+ }
+
+ loc_wipe (&loc);
+
+ return ret;
+
+}
+
+
+int
+__afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks, int source,
+ unsigned char *locked_on, struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uuid_t gfid = {0, };
+ int gfid_idx = -1;
+ gf_boolean_t source_is_empty = _gf_true;
+ gf_boolean_t need_heal = _gf_false;
+ int first_idx = -1;
+ char g1[64],g2[64];
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (first_idx == -1) {
+ first_idx = i;
+ continue;
+ }
+
+ if (replies[i].op_ret != replies[first_idx].op_ret)
+ need_heal = _gf_true;
+
+ if (uuid_compare (replies[i].poststat.ia_gfid,
+ replies[first_idx].poststat.ia_gfid))
+ need_heal = _gf_true;
+ }
+
+ if (!need_heal)
+ return 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (!replies[i].op_ret && (source == -1 || sources[i])) {
+ source_is_empty = _gf_false;
+ break;
+ }
+ }
+
+ if (source_is_empty) {
+ return __afr_selfheal_name_expunge (frame, this, parent, pargfid,
+ bname, inode, replies);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (uuid_is_null (replies[i].poststat.ia_gfid))
+ continue;
+
+ if (uuid_is_null (gfid)) {
+ uuid_copy (gfid, replies[i].poststat.ia_gfid);
+ gfid_idx = i;
+ continue;
+ }
+
+ if (sources[i] || source == -1) {
+ if (gfid_idx != -1 &&
+ (sources[gfid_idx] || source == -1) &&
+ uuid_compare (gfid, replies[i].poststat.ia_gfid)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "GFID mismatch for <gfid:%s>/%s "
+ "%s on %s and %s on %s",
+ uuid_utoa (pargfid), bname,
+ uuid_utoa_r (replies[i].poststat.ia_gfid, g1),
+ priv->children[i]->name,
+ uuid_utoa_r (replies[gfid_idx].poststat.ia_gfid, g2),
+ priv->children[gfid_idx]->name);
+ return -1;
+ }
+
+ uuid_copy (gfid, replies[i].poststat.ia_gfid);
+ gfid_idx = i;
+ continue;
+ }
+ }
+
+ if (gfid_idx == -1)
+ return -1;
+
+ __afr_selfheal_assign_gfid (frame, this, parent, pargfid, bname, inode,
+ replies, gfid_idx);
+
+ return __afr_selfheal_name_impunge (frame, this, parent, pargfid,
+ bname, inode, replies, gfid_idx);
+}
+
+
+int
+__afr_selfheal_name_finalize_source (xlator_t *this, unsigned char *sources,
+ unsigned char *sinks, unsigned char *locked_on,
+ struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int source = -1;
+ int locked_count = 0;
+ int sources_count = 0;
+ int sinks_count = 0;
+
+ priv = this->private;
+
+ locked_count = AFR_COUNT (locked_on, priv->child_count);
+ sources_count = AFR_COUNT (sources, priv->child_count);
+ sinks_count = AFR_COUNT (sinks, priv->child_count);
+
+ if (locked_count == sinks_count || !sources_count) {
+ return -1;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source = i;
+ break;
+ }
+ }
+
+ return source;
+}
+
+
+int
+__afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks, struct afr_reply *replies,
+ int *source_p)
+{
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+
+ ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies);
+ if (ret)
+ return ret;
+
+ ret = afr_selfheal_find_direction (frame, this, replies,
+ AFR_ENTRY_TRANSACTION,
+ locked_on, sources, sinks);
+ if (ret)
+ return ret;
+
+ source = __afr_selfheal_name_finalize_source (this, sources, sinks,
+ locked_on, replies);
+ if (source < 0) {
+ /* If source is < 0 (typically split-brain), we perform a
+ conservative merge of entries rather than erroring out */
+ }
+ *source_p = source;
+
+ for (i = 0; i < priv->child_count; i++)
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ healed_sinks[i] = sinks[i] && locked_on[i];
+
+ return ret;
+}
+
+
+int
+afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname)
+{
+ afr_private_t *priv = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *locked_on = NULL;
+ int source = -1;
+ struct afr_reply *replies = NULL;
+ int ret = -1;
+ inode_t *inode = NULL;
+
+ priv = this->private;
+
+ locked_on = alloca0 (priv->child_count);
+ sources = alloca0 (priv->child_count);
+ sinks = alloca0 (priv->child_count);
+ healed_sinks = alloca0 (priv->child_count);
+
+ replies = alloca0 (priv->child_count * sizeof(*replies));
+
+ ret = afr_selfheal_entrylk (frame, this, parent, this->name, bname,
+ locked_on);
+ {
+ if (ret < 2) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_name_prepare (frame, this, parent, pargfid,
+ locked_on, sources, sinks,
+ healed_sinks, replies,
+ &source);
+ if (ret)
+ goto unlock;
+
+ inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname,
+ replies, locked_on);
+ if (!inode) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_name_do (frame, this, parent, pargfid, bname,
+ inode, sources, sinks, healed_sinks,
+ source, locked_on, replies);
+ }
+unlock:
+ afr_selfheal_unentrylk (frame, this, parent, this->name, bname,
+ locked_on);
+ if (inode)
+ inode_unref (inode);
+
+ return ret;
+}
+
+
+int
+afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this,
+ inode_t *parent, uuid_t pargfid,
+ const char *bname, gf_boolean_t *need_heal)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ struct afr_reply *replies = NULL;
+ inode_t *inode = NULL;
+ int first_idx = -1;
+
+ priv = this->private;
+
+ replies = alloca0 (sizeof (*replies) * priv->child_count);
+
+ inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname,
+ replies, priv->child_up);
+ if (!inode)
+ return -ENOMEM;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (first_idx == -1) {
+ first_idx = i;
+ continue;
+ }
+
+ if (replies[i].op_ret != replies[first_idx].op_ret)
+ *need_heal = _gf_true;
+
+ if (uuid_compare (replies[i].poststat.ia_gfid,
+ replies[first_idx].poststat.ia_gfid))
+ *need_heal = _gf_true;
+ }
+
+ if (inode)
+ inode_unref (inode);
+ return 0;
+}
+
+int
+afr_selfheal_name (xlator_t *this, uuid_t pargfid, const char *bname)
+{
+ inode_t *parent = NULL;
+ call_frame_t *frame = NULL;
+ int ret = -1;
+ gf_boolean_t need_heal = _gf_false;
+
+ parent = afr_inode_find (this, pargfid);
+ if (!parent)
+ goto out;
+
+ frame = afr_frame_create (this);
+ if (!frame)
+ goto out;
+
+ ret = afr_selfheal_name_unlocked_inspect (frame, this, parent, pargfid,
+ bname, &need_heal);
+ if (ret)
+ goto out;
+
+ if (need_heal)
+ afr_selfheal_name_do (frame, this, parent, pargfid, bname);
+out:
+ if (parent)
+ inode_unref (parent);
+ if (frame)
+ AFR_STACK_DESTROY (frame);
+
+ return ret;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 2efc1116d..a1b972ac3 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,43 +8,160 @@
cases as published by the Free Software Foundation.
*/
-#ifndef __AFR_SELF_HEAL_H__
-#define __AFR_SELF_HEAL_H__
-#include <sys/stat.h>
+#ifndef _AFR_SELFHEAL_H
+#define _AFR_SELFHEAL_H
-#define FILETYPE_DIFFERS(buf1,buf2) ((buf1)->ia_type != (buf2)->ia_type)
-#define PERMISSION_DIFFERS(buf1,buf2) (st_mode_from_ia ((buf1)->ia_prot, (buf1)->ia_type) != st_mode_from_ia ((buf2)->ia_prot, (buf2)->ia_type))
-#define OWNERSHIP_DIFFERS(buf1,buf2) (((buf1)->ia_uid != (buf2)->ia_uid) || ((buf1)->ia_gid != (buf2)->ia_gid))
-#define SIZE_DIFFERS(buf1,buf2) ((buf1)->ia_size != (buf2)->ia_size)
-#define SIZE_GREATER(buf1,buf2) ((buf1)->ia_size > (buf2)->ia_size)
+/* Perform fop on all UP subvolumes and wait for all callbacks to return */
+
+#define AFR_ONALL(frame, rfn, fop, args ...) do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0, __count = 0; \
+ \
+ afr_replies_wipe (__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!__priv->child_up[__i]) continue; \
+ STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ __count++; \
+ } \
+ syncbarrier_wait (&__local->barrier, __count); \
+ } while (0)
+
+
+/* Perform fop on all subvolumes represented by list[] array and wait
+ for all callbacks to return */
+
+#define AFR_ONLIST(list, frame, rfn, fop, args ...) do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0, __count = 0; \
+ \
+ afr_replies_wipe (__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!list[__i]) continue; \
+ STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ __count++; \
+ } \
+ syncbarrier_wait (&__local->barrier, __count); \
+ } while (0)
+
+
+#define AFR_SEQ(frame, rfn, fop, args ...) do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0; \
+ \
+ afr_replies_wipe (__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!__priv->child_up[__i]) continue; \
+ STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ syncbarrier_wait (&__local->barrier, 1); \
+ } \
+ } while (0)
+
+
+#define ALLOC_MATRIX(n, type) ({type **__ptr = NULL; \
+ int __i; \
+ __ptr = alloca0 (n * sizeof(type *)); \
+ for (__i = 0; __i < n; __i++) __ptr[__i] = alloca0 (n * sizeof(type)); \
+ __ptr;})
+
+
+#define IA_EQUAL(f,s,field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0)
+
+
+int
+afr_selfheal (xlator_t *this, uuid_t gfid);
+
+int
+afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name);
+
+int
+afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode);
int
-afr_sh_has_metadata_pending (dict_t *xattr, xlator_t *this);
+afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode);
+
int
-afr_sh_has_entry_pending (dict_t *xattr, xlator_t *this);
+afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode);
+
+
int
-afr_sh_has_data_pending (dict_t *xattr, xlator_t *this);
+afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on);
int
-afr_self_heal_entry (call_frame_t *frame, xlator_t *this);
+afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on);
int
-afr_self_heal_data (call_frame_t *frame, xlator_t *this);
+afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ const unsigned char *locked_on);
int
-afr_self_heal_metadata (call_frame_t *frame, xlator_t *this);
+afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
int
-afr_self_heal_get_source (xlator_t *this, afr_local_t *local, dict_t **xattr);
+afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
int
-afr_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode);
+afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
int
-afr_lookup_select_read_child_by_txn_type (xlator_t *this, afr_local_t *local,
- dict_t **xattr,
- afr_transaction_type txn_type,
- uuid_t gfid);
-#endif /* __AFR_SELF_HEAL_H__ */
+afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies);
+
+inode_t *
+afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
+ const char *name, struct afr_reply *replies,
+ unsigned char *lookup_on);
+
+int
+afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ afr_transaction_type type, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks);
+
+int
+afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type, int *dirty, int **matrix);
+
+int
+afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks, afr_transaction_type type,
+ struct afr_reply *replies, unsigned char *locked_on);
+
+int
+afr_selfheal_recreate_entry (call_frame_t *frame, xlator_t *this, int dst,
+ int source, inode_t *dir, const char *name,
+ inode_t *inode, struct afr_reply *replies);
+
+int
+afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int subvol, dict_t *xattr);
+
+call_frame_t *
+afr_frame_create (xlator_t *this);
+
+inode_t *
+afr_inode_find (xlator_t *this, uuid_t gfid);
+
+#endif /* !_AFR_SELFHEAL_H */
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index b3dee3f59..4bfe909bc 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,1240 +8,1249 @@
cases as published by the Free Software Foundation.
*/
+
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
+
#include "afr.h"
-#include "syncop.h"
+#include "afr-self-heal.h"
#include "afr-self-heald.h"
-#include "afr-self-heal-common.h"
#include "protocol-common.h"
-#include "event-history.h"
-typedef enum {
- STOP_CRAWL_ON_SINGLE_SUBVOL = 1
-} afr_crawl_flags_t;
+#define SHD_INODE_LRU_LIMIT 2048
+#define AFR_EH_HEALED_LIMIT 1024
+#define AFR_EH_HEAL_FAIL_LIMIT 1024
+#define AFR_EH_SPLIT_BRAIN_LIMIT 1024
+#define AFR_STATISTICS_HISTORY_SIZE 50
-typedef enum {
- HEAL = 1,
- INFO
-} shd_crawl_op;
-typedef struct shd_dump {
- dict_t *dict;
- xlator_t *this;
- int child;
-} shd_dump_t;
+#define ASSERT_LOCAL(this, healer) \
+ if (!afr_shd_is_subvol_local(this, healer->subvol)) { \
+ healer->local = _gf_false; \
+ if (safe_break (healer)) { \
+ break; \
+ } else { \
+ continue; \
+ } \
+ } else { \
+ healer->local = _gf_true; \
+ }
-typedef struct shd_event_ {
- int child;
- char *path;
-} shd_event_t;
-typedef struct shd_pos_ {
- int child;
- xlator_t *this;
- afr_child_pos_t pos;
-} shd_pos_t;
+#define NTH_INDEX_HEALER(this, n) &((((afr_private_t *)this->private))->shd.index_healers[n])
+#define NTH_FULL_HEALER(this, n) &((((afr_private_t *)this->private))->shd.full_healers[n])
-typedef int
-(*afr_crawl_done_cbk_t) (int ret, call_frame_t *sync_frame, void *crawl_data);
+int afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p);
-void
-afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl,
- process_entry_cbk_t process_entry, void *op_data,
- gf_boolean_t exclusive, int crawl_flags,
- afr_crawl_done_cbk_t crawl_done);
+char *
+afr_subvol_name (xlator_t *this, int subvol)
+{
+ afr_private_t *priv = NULL;
-static int
-_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data);
+ priv = this->private;
+ if (subvol < 0 || subvol > priv->child_count)
+ return NULL;
-/* For calling straight through (e.g. already in a synctask). */
-int
-afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos);
+ return priv->children[subvol]->name;
+}
-/* For deferring through a new synctask. */
-int
-afr_syncop_find_child_position (void *data);
-static int
-_loc_assign_gfid_path (loc_t *loc)
+void
+afr_destroy_crawl_event_data (void *data)
{
- int ret = -1;
- char gfid_path[64] = {0};
-
- if (loc->inode && !uuid_is_null (loc->inode->gfid)) {
- ret = inode_path (loc->inode, NULL, (char**)&loc->path);
- } else if (!uuid_is_null (loc->gfid)) {
- snprintf (gfid_path, sizeof (gfid_path), "<gfid:%s>",
- uuid_utoa (loc->gfid));
- loc->path = gf_strdup (gfid_path);
- if (loc->path)
- ret = 0;
- }
- return ret;
+ return;
}
+
void
-shd_cleanup_event (void *event)
+afr_destroy_shd_event_data (void *data)
{
- shd_event_t *shd_event = event;
+ shd_event_t *shd_event = data;
+
+ if (!shd_event)
+ return;
+ GF_FREE (shd_event->path);
- if (!shd_event)
- goto out;
- GF_FREE (shd_event->path);
- GF_FREE (shd_event);
-out:
return;
}
-int
-afr_get_local_child (afr_self_heald_t *shd, unsigned int child_count)
-{
- int i = 0;
- int ret = -1;
- for (i = 0; i < child_count; i++) {
- if (shd->pos[i] == AFR_POS_LOCAL) {
- ret = i;
- break;
- }
- }
- return ret;
-}
-static int
-_build_index_loc (xlator_t *this, loc_t *loc, char *name, loc_t *parent)
+gf_boolean_t
+afr_shd_is_subvol_local (xlator_t *this, int subvol)
{
- int ret = 0;
+ char *pathinfo = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int ret = 0;
+ gf_boolean_t is_local = _gf_false;
+ loc_t loc = {0, };
- uuid_copy (loc->pargfid, parent->inode->gfid);
- loc->path = "";
- loc->name = name;
- loc->parent = inode_ref (parent->inode);
- if (!loc->parent) {
- loc->path = NULL;
- loc_wipe (loc);
- ret = -1;
- }
- return ret;
-}
+ priv = this->private;
-int
-_add_path_to_dict (xlator_t *this, dict_t *output, int child, char *path,
- struct timeval *tv, gf_boolean_t dyn)
-{
- //subkey not used for now
- int ret = -1;
- uint64_t count = 0;
- char key[256] = {0};
- int xl_id = 0;
+ loc.inode = this->itable->root;
+ uuid_copy (loc.gfid, loc.inode->gfid);
- ret = dict_get_int32 (output, this->name, &xl_id);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "xl does not have id");
- goto out;
- }
+ ret = syncop_getxattr (priv->children[subvol], &loc, &xattr,
+ GF_XATTR_PATHINFO_KEY);
+ if (ret)
+ return _gf_false;
+ if (!xattr)
+ return _gf_false;
- snprintf (key, sizeof (key), "%d-%d-count", xl_id, child);
- ret = dict_get_uint64 (output, key, &count);
+ ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &pathinfo);
+ if (ret)
+ return _gf_false;
- snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count);
- if (dyn)
- ret = dict_set_dynstr (output, key, path);
- else
- ret = dict_set_str (output, key, path);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output",
- path);
- goto out;
- }
+ afr_local_pathinfo (pathinfo, &is_local);
- if (!tv)
- goto inc_count;
- snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id,
- child, count);
- ret = dict_set_uint32 (output, key, tv->tv_sec);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time",
- path);
- goto out;
- }
+ gf_log (this->name, GF_LOG_DEBUG, "subvol %s is %slocal",
+ priv->children[subvol]->name, is_local? "" : "not ");
-inc_count:
- snprintf (key, sizeof (key), "%d-%d-count", xl_id, child);
- ret = dict_set_uint64 (output, key, count + 1);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Could not increment count");
- goto out;
- }
- ret = 0;
-out:
- return ret;
+ return is_local;
}
+
int
-_get_path_from_gfid_loc (xlator_t *this, xlator_t *readdir_xl, loc_t *child,
- char **fpath)
+__afr_shd_healer_wait (struct subvol_healer *healer)
{
- dict_t *xattr = NULL;
- char *path = NULL;
- int ret = -1;
+ afr_private_t *priv = NULL;
+ struct timespec wait_till = {0, };
+ int ret = 0;
- ret = syncop_getxattr (readdir_xl, child, &xattr,
- GFID_TO_PATH_KEY);
- if (ret)
- goto out;
- ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to get path for "
- "gfid %s", uuid_utoa (child->gfid));
- goto out;
- }
- path = gf_strdup (path);
- if (!path) {
- ret = -1;
- goto out;
- }
- ret = 0;
-out:
- if (!ret)
- *fpath = path;
- if (xattr)
- dict_unref (xattr);
- return ret;
-}
+ priv = healer->this->private;
-int
-_add_event_to_dict (circular_buffer_t *cb, void *data)
-{
- int ret = 0;
- shd_dump_t *dump_data = NULL;
- shd_event_t *shd_event = NULL;
+disabled_loop:
+ wait_till.tv_sec = time (NULL) + 60;
- dump_data = data;
- shd_event = cb->data;
- if (shd_event->child != dump_data->child)
- goto out;
- ret = _add_path_to_dict (dump_data->this, dump_data->dict,
- dump_data->child, shd_event->path, &cb->tv,
- _gf_false);
-out:
- return ret;
+ while (!healer->rerun) {
+ ret = pthread_cond_timedwait (&healer->cond,
+ &healer->mutex,
+ &wait_till);
+ if (ret == ETIMEDOUT)
+ break;
+ }
+
+ ret = healer->rerun;
+ healer->rerun = 0;
+
+ if (!priv->shd.enabled)
+ goto disabled_loop;
+
+ return ret;
}
+
int
-_add_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict, int child)
+afr_shd_healer_wait (struct subvol_healer *healer)
{
- shd_dump_t dump_data = {0};
+ int ret = 0;
+
+ pthread_mutex_lock (&healer->mutex);
+ {
+ ret = __afr_shd_healer_wait (healer);
+ }
+ pthread_mutex_unlock (&healer->mutex);
- dump_data.this = this;
- dump_data.dict = dict;
- dump_data.child = child;
- eh_dump (eh, &dump_data, _add_event_to_dict);
- return 0;
+ return ret;
}
-int
-_add_summary_to_dict (xlator_t *this, afr_crawl_data_t *crawl_data,
- gf_dirent_t *entry,
- loc_t *childloc, loc_t *parentloc, struct iatt *iattr)
+
+gf_boolean_t
+safe_break (struct subvol_healer *healer)
{
- dict_t *output = NULL;
- xlator_t *readdir_xl = NULL;
- int ret = -1;
- char *path = NULL;
+ gf_boolean_t ret = _gf_false;
- if (uuid_is_null (childloc->gfid))
- goto out;
+ pthread_mutex_lock (&healer->mutex);
+ {
+ if (healer->rerun)
+ goto unlock;
- output = crawl_data->op_data;
- readdir_xl = crawl_data->readdir_xl;
+ healer->running = _gf_false;
+ ret = _gf_true;
+ }
+unlock:
+ pthread_mutex_unlock (&healer->mutex);
+
+ return ret;
+}
- ret = _get_path_from_gfid_loc (this, readdir_xl, childloc, &path);
- if (ret)
- goto out;
- ret = _add_path_to_dict (this, output, crawl_data->child, path, NULL,
- _gf_true);
+inode_t *
+afr_shd_inode_find (xlator_t *this, xlator_t *subvol, uuid_t gfid)
+{
+ inode_t *inode = NULL;
+ int ret = 0;
+ loc_t loc = {0, };
+ struct iatt iatt = {0, };
+
+ inode = inode_find (this->itable, gfid);
+ if (inode)
+ goto out;
+
+ loc.inode = inode_new (this->itable);
+ if (!loc.inode)
+ goto out;
+ uuid_copy (loc.gfid, gfid);
+
+ ret = syncop_lookup (subvol, &loc, NULL, &iatt, NULL, NULL);
+ if (ret < 0)
+ goto out;
+
+ inode = inode_link (loc.inode, NULL, NULL, &iatt);
+ if (inode)
+ inode_lookup (inode);
out:
- if (ret && path)
- GF_FREE (path);
- return ret;
+ loc_wipe (&loc);
+ return inode;
}
-void
-_remove_stale_index (xlator_t *this, xlator_t *readdir_xl,
- loc_t *parent, char *fname)
-{
- int ret = 0;
- loc_t index_loc = {0};
- ret = _build_index_loc (this, &index_loc, fname, parent);
- if (ret)
- goto out;
- gf_log (this->name, GF_LOG_DEBUG, "Removing stale index "
- "for %s on %s", index_loc.name, readdir_xl->name);
- ret = syncop_unlink (readdir_xl, &index_loc);
- if(ret && (errno != ENOENT)) {
- gf_log(this->name, GF_LOG_ERROR, "%s: Failed to remove index "
- "on %s - %s",index_loc.name, readdir_xl->name,
- strerror (errno));
- }
- index_loc.path = NULL;
- loc_wipe (&index_loc);
+fd_t *
+afr_shd_index_opendir (xlator_t *this, int child)
+{
+ fd_t *fd = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ loc_t rootloc = {0, };
+ inode_t *inode = NULL;
+ int ret = 0;
+ dict_t *xattr = NULL;
+ void *index_gfid = NULL;
+
+ priv = this->private;
+ subvol = priv->children[child];
+
+ rootloc.inode = inode_ref (this->itable->root);
+ uuid_copy (rootloc.gfid, rootloc.inode->gfid);
+
+ ret = syncop_getxattr (subvol, &rootloc, &xattr,
+ GF_XATTROP_INDEX_GFID);
+ if (ret || !xattr) {
+ errno = -ret;
+ goto out;
+ }
+
+ ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid);
+ if (ret)
+ goto out;
+
+ gf_log (this->name, GF_LOG_DEBUG, "index-dir gfid for %s: %s",
+ subvol->name, uuid_utoa (index_gfid));
+
+ inode = afr_shd_inode_find (this, subvol, index_gfid);
+ if (!inode)
+ goto out;
+ fd = fd_anonymous (inode);
out:
- return;
+ loc_wipe (&rootloc);
+ if (xattr)
+ dict_unref (xattr);
+ return fd;
}
-void
-_crawl_post_sh_action (xlator_t *this, loc_t *parent, loc_t *child,
- int32_t op_ret, int32_t op_errno, dict_t *xattr_rsp,
- afr_crawl_data_t *crawl_data)
+
+int
+afr_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name)
{
- int ret = 0;
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- eh_t *eh = NULL;
- char *path = NULL;
- shd_event_t *event = NULL;
- int32_t sh_failed = 0;
- gf_boolean_t split_brain = 0;
-
- priv = this->private;
- shd = &priv->shd;
- if (crawl_data->crawl == INDEX) {
- if ((op_ret < 0) && (op_errno == ENOENT)) {
- _remove_stale_index (this, crawl_data->readdir_xl,
- parent, uuid_utoa (child->gfid));
- goto out;
- }
- ret = _get_path_from_gfid_loc (this, crawl_data->readdir_xl,
- child, &path);
- if (ret)
- goto out;
- } else {
- path = gf_strdup (child->path);
- if (!path) {
- ret = -1;
- goto out;
- }
- }
+ loc_t loc = {0, };
+ int ret = 0;
- if (xattr_rsp)
- ret = dict_get_int32 (xattr_rsp, "sh-failed", &sh_failed);
- split_brain = afr_is_split_brain (this, child->inode);
- if ((op_ret < 0 && op_errno == EIO) || split_brain)
- eh = shd->split_brain;
- else if ((op_ret < 0) || sh_failed)
- eh = shd->heal_failed;
- else
- eh = shd->healed;
- ret = -1;
- event = GF_CALLOC (1, sizeof (*event), gf_afr_mt_shd_event_t);
- if (!event)
- goto out;
- event->child = crawl_data->child;
- event->path = path;
- ret = eh_save_history (eh, event);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "%s:Failed to save to "
- "event history, (%d, %s)", path, op_ret, strerror (op_errno));
- goto out;
- }
- ret = 0;
-out:
- if (ret && path)
- GF_FREE (path);
- return;
+ loc.parent = inode_ref (inode);
+ loc.name = name;
+
+ ret = syncop_unlink (subvol, &loc);
+
+ loc_wipe (&loc);
+ return ret;
}
+
int
-_self_heal_entry (xlator_t *this, afr_crawl_data_t *crawl_data, gf_dirent_t *entry,
- loc_t *child, loc_t *parent, struct iatt *iattr)
+afr_shd_selfheal_name (struct subvol_healer *healer, int child, uuid_t parent,
+ const char *bname)
{
- struct iatt parentbuf = {0};
- int ret = 0;
- dict_t *xattr_rsp = NULL;
+ int ret = -1;
- gf_log (this->name, GF_LOG_DEBUG, "lookup %s", child->path);
+ ret = afr_selfheal_name (THIS, parent, bname);
- ret = syncop_lookup (this, child, NULL,
- iattr, &xattr_rsp, &parentbuf);
- _crawl_post_sh_action (this, parent, child, ret, errno, xattr_rsp,
- crawl_data);
- if (xattr_rsp)
- dict_unref (xattr_rsp);
- return ret;
+ return ret;
}
-static int
-afr_crawl_done (int ret, call_frame_t *sync_frame, void *data)
+int
+afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid)
{
- GF_FREE (data);
- STACK_DESTROY (sync_frame->root);
- return 0;
+ int ret = 0;
+ eh_t *eh = NULL;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ shd_event_t *shd_event = NULL;
+ char *path = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *this = NULL;
+ crawl_event_t *crawl_event = NULL;
+
+ this = healer->this;
+ priv = this->private;
+ shd = &priv->shd;
+ crawl_event = &healer->crawl_event;
+
+ subvol = priv->children[child];
+
+ ret = afr_selfheal (this, gfid);
+
+ if (ret == -EIO) {
+ eh = shd->split_brain;
+ crawl_event->split_brain_count++;
+ } else if (ret < 0) {
+ eh = shd->heal_failed;
+ crawl_event->heal_failed_count++;
+ } else if (ret == 0) {
+ eh = shd->healed;
+ crawl_event->healed_count++;
+ }
+
+ afr_shd_gfid_to_path (this, subvol, gfid, &path);
+ if (!path)
+ return ret;
+
+ if (eh) {
+ shd_event = GF_CALLOC (1, sizeof(*shd_event),
+ gf_afr_mt_shd_event_t);
+ if (!shd_event) {
+ GF_FREE (path);
+ return ret;
+ }
+
+ shd_event->child = child;
+ shd_event->path = path;
+
+ if (eh_save_history (eh, shd_event) < 0) {
+ GF_FREE (shd_event);
+ GF_FREE (path);
+ }
+ }
+ return ret;
}
+
void
-_do_self_heal_on_subvol (xlator_t *this, int child, afr_crawl_type_t crawl)
+afr_shd_sweep_prepare (struct subvol_healer *healer)
{
- afr_start_crawl (this, child, crawl, _self_heal_entry,
- NULL, _gf_true, STOP_CRAWL_ON_SINGLE_SUBVOL,
- afr_crawl_done);
-}
+ crawl_event_t *event = NULL;
-gf_boolean_t
-_crawl_proceed (xlator_t *this, int child, int crawl_flags, char **reason)
-{
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- gf_boolean_t proceed = _gf_false;
- char *msg = NULL;
-
- priv = this->private;
- shd = &priv->shd;
- if (!shd->enabled) {
- msg = "Self-heal daemon is not enabled";
- gf_log (this->name, GF_LOG_DEBUG, "%s", msg);
- goto out;
- }
- if (!priv->child_up[child]) {
- gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl for %s , "
- "subvol went down", priv->children[child]->name);
- msg = "Brick is Not connected";
- goto out;
- }
+ event = &healer->crawl_event;
- if (crawl_flags & STOP_CRAWL_ON_SINGLE_SUBVOL) {
- if (afr_up_children_count (priv->child_up,
- priv->child_count) < 2) {
- gf_log (this->name, GF_LOG_DEBUG, "Stopping crawl as "
- "< 2 children are up");
- msg = "< 2 bricks in replica are running";
- goto out;
- }
- }
- proceed = _gf_true;
-out:
- if (reason)
- *reason = msg;
- return proceed;
-}
+ event->healed_count = 0;
+ event->split_brain_count = 0;
+ event->heal_failed_count = 0;
-int
-_do_crawl_op_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl,
- shd_crawl_op op, dict_t *output)
-{
- afr_private_t *priv = NULL;
- char *status = NULL;
- char *subkey = NULL;
- char key[256] = {0};
- shd_pos_t pos_data = {0};
- int op_ret = -1;
- int xl_id = -1;
- int i = 0;
- int ret = 0;
- int crawl_flags = 0;
-
- priv = this->private;
- if (op == HEAL)
- crawl_flags |= STOP_CRAWL_ON_SINGLE_SUBVOL;
-
- if (output) {
- ret = dict_get_int32 (output, this->name, &xl_id);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Invalid input, "
- "translator-id is not available");
- goto out;
- }
- }
- pos_data.this = this;
- subkey = "status";
- for (i = 0; i < priv->child_count; i++) {
- if (_crawl_proceed (this, i, crawl_flags, &status)) {
- pos_data.child = i;
- /*
- * We're already in a synctask in this case, so we
- * don't need to defer through a second (and in fact
- * that can cause deadlock). Just call straight
- * through instead.
- */
- ret = afr_find_child_position(pos_data.this,
- pos_data.child,
- &pos_data.pos);
- if (ret) {
- status = "Not able to find brick location";
- } else if (pos_data.pos == AFR_POS_REMOTE) {
- status = "brick is remote";
- } else {
- op_ret = 0;
- if (op == HEAL) {
- status = "Started self-heal";
- _do_self_heal_on_subvol (this, i,
- crawl);
- } else if (output) {
- status = "";
- afr_start_crawl (this, i, INDEX,
- _add_summary_to_dict,
- output, _gf_false, 0,
- NULL);
- }
- }
- if (output) {
- snprintf (key, sizeof (key), "%d-%d-%s", xl_id,
- i, subkey);
- ret = dict_set_str (output, key, status);
- }
- if (!op_ret && (crawl == FULL))
- break;
- }
- if (output) {
- snprintf (key, sizeof (key), "%d-%d-%s", xl_id, i,
- subkey);
- ret = dict_set_str (output, key, status);
- }
- }
-out:
- return op_ret;
+ time (&event->start_time);
+ event->end_time = 0;
}
-int
-_do_self_heal_on_local_subvols (xlator_t *this, afr_crawl_type_t crawl,
- dict_t *output)
+
+void
+afr_shd_sweep_done (struct subvol_healer *healer)
{
- return _do_crawl_op_on_local_subvols (this, crawl, HEAL, output);
+ crawl_event_t *event = NULL;
+ crawl_event_t *history = NULL;
+ afr_self_heald_t *shd = NULL;
+
+ event = &healer->crawl_event;
+ shd = &(((afr_private_t *)healer->this->private)->shd);
+
+ time (&event->end_time);
+ history = memdup (event, sizeof (*event));
+ event->start_time = 0;
+
+ if (!history)
+ return;
+
+ if (eh_save_history (shd->statistics[healer->subvol], history) < 0)
+ GF_FREE (history);
}
+
int
-_get_index_summary_on_local_subvols (xlator_t *this, dict_t *output)
+afr_shd_index_sweep (struct subvol_healer *healer)
{
- return _do_crawl_op_on_local_subvols (this, INDEX, INFO, output);
+ xlator_t *this = NULL;
+ int child = -1;
+ fd_t *fd = NULL;
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ off_t offset = 0;
+ gf_dirent_t entries;
+ gf_dirent_t *entry = NULL;
+ uuid_t gfid;
+ int ret = 0;
+ int count = 0;
+
+ this = healer->this;
+ child = healer->subvol;
+ priv = this->private;
+ subvol = priv->children[child];
+
+ fd = afr_shd_index_opendir (this, child);
+ if (!fd) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "unable to opendir index-dir on %s", subvol->name);
+ return -errno;
+ }
+
+ INIT_LIST_HEAD (&entries.list);
+
+ while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) {
+ if (ret > 0)
+ ret = 0;
+ list_for_each_entry (entry, &entries.list, list) {
+ offset = entry->d_off;
+
+ if (!priv->shd.enabled) {
+ ret = -EBUSY;
+ break;
+ }
+
+ if (!strcmp (entry->d_name, ".") ||
+ !strcmp (entry->d_name, ".."))
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG, "got entry: %s",
+ entry->d_name);
+
+ ret = uuid_parse (entry->d_name, gfid);
+ if (ret)
+ continue;
+
+ ret = afr_shd_selfheal (healer, child, gfid);
+ if (ret == 0)
+ count++;
+
+ if (ret == -ENOENT || ret == -ESTALE) {
+ afr_shd_index_purge (subvol, fd->inode,
+ entry->d_name);
+ ret = 0;
+ }
+ }
+
+ gf_dirent_free (&entries);
+ if (ret)
+ break;
+ }
+
+ if (fd)
+ fd_unref (fd);
+ if (!ret)
+ ret = count;
+ return ret;
}
+
int
-_add_all_subvols_eh_to_dict (xlator_t *this, eh_t *eh, dict_t *dict)
+afr_shd_full_sweep (struct subvol_healer *healer, inode_t *inode)
{
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- int i = 0;
+ fd_t *fd = NULL;
+ xlator_t *this = NULL;
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ off_t offset = 0;
+ gf_dirent_t entries;
+ gf_dirent_t *entry = NULL;
+ int ret = 0;
+
+ this = healer->this;
+ priv = this->private;
+ subvol = priv->children[healer->subvol];
+
+ fd = fd_anonymous (inode);
+ if (!fd)
+ return -errno;
+
+ INIT_LIST_HEAD (&entries.list);
+
+ while ((ret = syncop_readdirp (subvol, fd, 131072, offset, 0, &entries))) {
+ if (ret < 0)
+ break;
+
+ ret = gf_link_inodes_from_dirent (this, fd->inode, &entries);
+ if (ret)
+ break;
+
+ list_for_each_entry (entry, &entries.list, list) {
+ offset = entry->d_off;
+
+ if (!priv->shd.enabled) {
+ ret = -EBUSY;
+ break;
+ }
+
+ if (!strcmp (entry->d_name, ".") ||
+ !strcmp (entry->d_name, ".."))
+ continue;
+
+ afr_shd_selfheal_name (healer, healer->subvol,
+ inode->gfid, entry->d_name);
+
+ afr_shd_selfheal (healer, healer->subvol,
+ entry->d_stat.ia_gfid);
+
+ if (entry->d_stat.ia_type == IA_IFDIR) {
+ ret = afr_shd_full_sweep (healer, entry->inode);
+ if (ret)
+ break;
+ }
+ }
+
+ gf_dirent_free (&entries);
+ if (ret)
+ break;
+ }
+
+ if (fd)
+ fd_unref (fd);
+ return ret;
+}
- priv = this->private;
- shd = &priv->shd;
- for (i = 0; i < priv->child_count; i++) {
- if (shd->pos[i] != AFR_POS_LOCAL)
- continue;
- _add_eh_to_dict (this, eh, dict, i);
- }
- return 0;
+void *
+afr_shd_index_healer (void *data)
+{
+ struct subvol_healer *healer = NULL;
+ xlator_t *this = NULL;
+ int ret = 0;
+
+ healer = data;
+ THIS = this = healer->this;
+
+ for (;;) {
+ afr_shd_healer_wait (healer);
+
+ ASSERT_LOCAL(this, healer);
+
+ do {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "starting index sweep on subvol %s",
+ afr_subvol_name (this, healer->subvol));
+
+ afr_shd_sweep_prepare (healer);
+
+ ret = afr_shd_index_sweep (healer);
+
+ afr_shd_sweep_done (healer);
+ /*
+ As long as at least one gfid was
+ healed, keep retrying. We may have
+ just healed a directory and thereby
+ created entries for other gfids which
+ could not be healed thus far.
+ */
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "finished index sweep on subvol %s",
+ afr_subvol_name (this, healer->subvol));
+ /*
+ Give a pause before retrying to avoid a busy loop
+ in case the only entry in index is because of
+ an ongoing I/O.
+ */
+ sleep (1);
+ } while (ret > 0);
+ }
+
+ return NULL;
}
-int
-afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
+
+void *
+afr_shd_full_healer (void *data)
{
- gf_xl_afr_op_t op = GF_AFR_OP_INVALID;
- int ret = 0;
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- int xl_id = 0;
+ struct subvol_healer *healer = NULL;
+ xlator_t *this = NULL;
+ int run = 0;
- priv = this->private;
- shd = &priv->shd;
+ healer = data;
+ THIS = this = healer->this;
- ret = dict_get_int32 (input, "xl-op", (int32_t*)&op);
- if (ret)
- goto out;
- ret = dict_get_int32 (input, this->name, &xl_id);
- if (ret)
- goto out;
- ret = dict_set_int32 (output, this->name, xl_id);
- if (ret)
- goto out;
- switch (op) {
- case GF_AFR_OP_HEAL_INDEX:
- ret = _do_self_heal_on_local_subvols (this, INDEX, output);
- break;
- case GF_AFR_OP_HEAL_FULL:
- ret = _do_self_heal_on_local_subvols (this, FULL, output);
- break;
- case GF_AFR_OP_INDEX_SUMMARY:
- (void)_get_index_summary_on_local_subvols (this, output);
- ret = 0;
- break;
- case GF_AFR_OP_HEALED_FILES:
- ret = _add_all_subvols_eh_to_dict (this, shd->healed, output);
- break;
- case GF_AFR_OP_HEAL_FAILED_FILES:
- ret = _add_all_subvols_eh_to_dict (this, shd->heal_failed,
- output);
- break;
- case GF_AFR_OP_SPLIT_BRAIN_FILES:
- ret = _add_all_subvols_eh_to_dict (this, shd->split_brain,
- output);
- break;
- default:
- gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op);
- break;
- }
-out:
- dict_del (output, this->name);
- return ret;
-}
+ for (;;) {
+ pthread_mutex_lock (&healer->mutex);
+ {
+ run = __afr_shd_healer_wait (healer);
+ if (!run)
+ healer->running = _gf_false;
+ }
+ pthread_mutex_unlock (&healer->mutex);
-void
-afr_poll_self_heal (void *data)
-{
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- struct timeval timeout = {0};
- xlator_t *this = NULL;
- long child = (long)data;
- gf_timer_t *old_timer = NULL;
- gf_timer_t *new_timer = NULL;
- shd_pos_t pos_data = {0};
- int ret = 0;
+ if (!run)
+ break;
- this = THIS;
- priv = this->private;
- shd = &priv->shd;
-
- if (shd->pos[child] == AFR_POS_UNKNOWN) {
- pos_data.this = this;
- pos_data.child = child;
- ret = synctask_new (this->ctx->env,
- afr_syncop_find_child_position,
- NULL, NULL, &pos_data);
- if (!ret)
- shd->pos[child] = pos_data.pos;
- }
- if (shd->enabled && (shd->pos[child] == AFR_POS_LOCAL))
- _do_self_heal_on_subvol (this, child, INDEX);
- timeout.tv_sec = shd->timeout;
- timeout.tv_usec = 0;
- //notify and previous timer should be synchronized.
- LOCK (&priv->lock);
- {
- old_timer = shd->timer[child];
- if (shd->pos[child] == AFR_POS_REMOTE)
- goto unlock;
- shd->timer[child] = gf_timer_call_after (this->ctx, timeout,
- afr_poll_self_heal,
- data);
- new_timer = shd->timer[child];
- }
-unlock:
- UNLOCK (&priv->lock);
-
- if (old_timer)
- gf_timer_call_cancel (this->ctx, old_timer);
- if (!new_timer && (shd->pos[child] != AFR_POS_REMOTE)) {
- gf_log (this->name, GF_LOG_WARNING,
- "Could not create self-heal polling timer for %s",
- priv->children[child]->name);
- }
- return;
-}
+ ASSERT_LOCAL(this, healer);
-static int
-afr_handle_child_up (int ret, call_frame_t *sync_frame, void *data)
-{
- afr_self_heald_t *shd = NULL;
- shd_pos_t *pos_data = data;
- afr_private_t *priv = NULL;
+ gf_log (this->name, GF_LOG_INFO,
+ "starting full sweep on subvol %s",
+ afr_subvol_name (this, healer->subvol));
- if (ret)
- goto out;
+ afr_shd_sweep_prepare (healer);
- priv = pos_data->this->private;
- shd = &priv->shd;
- shd->pos[pos_data->child] = pos_data->pos;
- if (pos_data->pos != AFR_POS_REMOTE)
- afr_poll_self_heal ((void*)(long)pos_data->child);
- _do_self_heal_on_local_subvols (THIS, INDEX, NULL);
-out:
- GF_FREE (data);
- return 0;
-}
+ afr_shd_full_sweep (healer, this->itable->root);
-void
-afr_proactive_self_heal (void *data)
-{
- xlator_t *this = NULL;
- long child = (long)data;
- shd_pos_t *pos_data = NULL;
- int ret = 0;
+ afr_shd_sweep_done (healer);
- this = THIS;
+ gf_log (this->name, GF_LOG_INFO,
+ "finished full sweep on subvol %s",
+ afr_subvol_name (this, healer->subvol));
+ }
- //Position of brick could have changed and it could be local now.
- //Compute the position again
- pos_data = GF_CALLOC (1, sizeof (*pos_data), gf_afr_mt_pos_data_t);
- if (!pos_data)
- goto out;
- pos_data->this = this;
- pos_data->child = child;
- ret = synctask_new (this->ctx->env, afr_syncop_find_child_position,
- afr_handle_child_up, NULL, pos_data);
- if (ret)
- goto out;
-out:
- return;
+ return NULL;
}
-static int
-get_pathinfo_host (char *pathinfo, char *hostname, size_t size)
+
+int
+afr_shd_healer_init (xlator_t *this, struct subvol_healer *healer)
{
- char *start = NULL;
- char *end = NULL;
- int ret = -1;
- int i = 0;
+ int ret = 0;
- if (!pathinfo)
- goto out;
+ ret = pthread_mutex_init (&healer->mutex, NULL);
+ if (ret)
+ goto out;
- start = strchr (pathinfo, ':');
- if (!start)
- goto out;
- end = strrchr (pathinfo, ':');
- if (start == end)
- goto out;
+ ret = pthread_cond_init (&healer->cond, NULL);
+ if (ret)
+ goto out;
- memset (hostname, 0, size);
- i = 0;
- while (++start != end)
- hostname[i++] = *start;
- ret = 0;
+ healer->this = this;
+ healer->running = _gf_false;
+ healer->rerun = _gf_false;
+ healer->local = _gf_false;
out:
- return ret;
+ return ret;
}
+
int
-_link_inode_update_loc (xlator_t *this, loc_t *loc, struct iatt *iattr)
+afr_shd_healer_spawn (xlator_t *this, struct subvol_healer *healer,
+ void *(threadfn)(void *))
{
- inode_t *link_inode = NULL;
- int ret = -1;
+ int ret = 0;
+
+ pthread_mutex_lock (&healer->mutex);
+ {
+ if (healer->running) {
+ pthread_cond_signal (&healer->cond);
+ } else {
+ ret = gf_thread_create (&healer->thread, NULL,
+ threadfn, healer);
+ if (ret)
+ goto unlock;
+ healer->running = 1;
+ }
+
+ healer->rerun = 1;
+ }
+unlock:
+ pthread_mutex_unlock (&healer->mutex);
- link_inode = inode_link (loc->inode, NULL, NULL, iattr);
- if (link_inode == NULL) {
- gf_log (this->name, GF_LOG_ERROR, "inode link failed "
- "on the inode (%s)", uuid_utoa (iattr->ia_gfid));
- goto out;
- }
- inode_unref (loc->inode);
- loc->inode = link_inode;
- ret = 0;
-out:
- return ret;
+ return ret;
}
+
int
-afr_local_pathinfo (char *pathinfo, gf_boolean_t *local)
+afr_shd_full_healer_spawn (xlator_t *this, int subvol)
{
- int ret = 0;
- char pathinfohost[1024] = {0};
- char localhost[1024] = {0};
- xlator_t *this = THIS;
-
- *local = _gf_false;
- ret = get_pathinfo_host (pathinfo, pathinfohost, sizeof (pathinfohost));
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Invalid pathinfo: %s",
- pathinfo);
- goto out;
- }
-
- ret = gethostname (localhost, sizeof (localhost));
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "gethostname() failed, "
- "reason: %s", strerror (errno));
- goto out;
- }
-
- if (!strcmp (localhost, pathinfohost))
- *local = _gf_true;
-out:
- return ret;
+ return afr_shd_healer_spawn (this, NTH_FULL_HEALER (this, subvol),
+ afr_shd_full_healer);
}
+
int
-afr_crawl_build_start_loc (xlator_t *this, afr_crawl_data_t *crawl_data,
- loc_t *dirloc)
+afr_shd_index_healer_spawn (xlator_t *this, int subvol)
{
- afr_private_t *priv = NULL;
- dict_t *xattr = NULL;
- void *index_gfid = NULL;
- loc_t rootloc = {0};
- struct iatt iattr = {0};
- struct iatt parent = {0};
- int ret = 0;
- xlator_t *readdir_xl = crawl_data->readdir_xl;
-
- priv = this->private;
- if (crawl_data->crawl == FULL) {
- afr_build_root_loc (this, dirloc);
- } else {
- afr_build_root_loc (this, &rootloc);
- ret = syncop_getxattr (readdir_xl, &rootloc, &xattr,
- GF_XATTROP_INDEX_GFID);
- if (ret < 0)
- goto out;
- ret = dict_get_ptr (xattr, GF_XATTROP_INDEX_GFID, &index_gfid);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "failed to get index "
- "dir gfid on %s", readdir_xl->name);
- goto out;
- }
- if (!index_gfid) {
- gf_log (this->name, GF_LOG_ERROR, "index gfid empty "
- "on %s", readdir_xl->name);
- ret = -1;
- goto out;
- }
- uuid_copy (dirloc->gfid, index_gfid);
- dirloc->path = "";
- dirloc->inode = inode_new (priv->root_inode->table);
- ret = syncop_lookup (readdir_xl, dirloc, NULL,
- &iattr, NULL, &parent);
- if (ret < 0) {
- if (errno != ENOENT) {
- gf_log (this->name, GF_LOG_ERROR, "lookup "
- "failed on index dir on %s - (%s)",
- readdir_xl->name, strerror (errno));
- }
- goto out;
- }
- ret = _link_inode_update_loc (this, dirloc, &iattr);
- if (ret)
- goto out;
- }
- ret = 0;
-out:
- if (xattr)
- dict_unref (xattr);
- loc_wipe (&rootloc);
- return ret;
+ return afr_shd_healer_spawn (this, NTH_INDEX_HEALER (this, subvol),
+ afr_shd_index_healer);
}
+
int
-afr_crawl_opendir (xlator_t *this, afr_crawl_data_t *crawl_data, fd_t **dirfd,
- loc_t *dirloc)
+afr_shd_dict_add_crawl_event (xlator_t *this, dict_t *output,
+ crawl_event_t *crawl_event)
{
- fd_t *fd = NULL;
- int ret = 0;
-
- if (crawl_data->crawl == FULL) {
- fd = fd_create (dirloc->inode, crawl_data->pid);
- if (!fd) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to create fd for %s", dirloc->path);
- ret = -1;
- goto out;
- }
+ int ret = 0;
+ uint64_t count = 0;
+ char key[256] = {0};
+ int xl_id = 0;
+ uint64_t healed_count = 0;
+ uint64_t split_brain_count = 0;
+ uint64_t heal_failed_count = 0;
+ char *start_time_str = 0;
+ char *end_time_str = NULL;
+ char *crawl_type = NULL;
+ int progress = -1;
+ int child = -1;
- ret = syncop_opendir (crawl_data->readdir_xl, dirloc, fd);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "opendir failed on %s", dirloc->path);
- goto out;
- }
- } else {
- fd = fd_anonymous (dirloc->inode);
- }
- ret = 0;
-out:
- if (!ret)
- *dirfd = fd;
- return ret;
-}
+ child = crawl_event->child;
+ healed_count = crawl_event->healed_count;
+ split_brain_count = crawl_event->split_brain_count;
+ heal_failed_count = crawl_event->heal_failed_count;
+ crawl_type = crawl_event->crawl_type;
-xlator_t*
-afr_crawl_readdir_xl_get (xlator_t *this, afr_crawl_data_t *crawl_data)
-{
- afr_private_t *priv = this->private;
+ if (!crawl_event->start_time)
+ goto out;
- if (crawl_data->crawl == FULL) {
- return this;
- } else {
- return priv->children[crawl_data->child];
- }
- return NULL;
-}
+ start_time_str = gf_strdup (ctime (&crawl_event->start_time));
-int
-afr_crawl_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent,
- gf_dirent_t *entry, afr_crawl_data_t *crawl_data)
-{
- int ret = -1;
- afr_private_t *priv = NULL;
+ if (crawl_event->end_time)
+ end_time_str = gf_strdup (ctime (&crawl_event->end_time));
- priv = this->private;
- if (crawl_data->crawl == FULL) {
- ret = afr_build_child_loc (this, child, parent, entry->d_name);
- } else {
- child->inode = inode_new (priv->root_inode->table);
- if (!child->inode)
- goto out;
- uuid_parse (entry->d_name, child->gfid);
- ret = _loc_assign_gfid_path (child);
+ ret = dict_get_int32 (output, this->name, &xl_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "xl does not have id");
+ goto out;
}
-out:
- return ret;
-}
-static int
-_process_entries (xlator_t *this, loc_t *parentloc, gf_dirent_t *entries,
- off_t *offset, afr_crawl_data_t *crawl_data)
-{
- gf_dirent_t *entry = NULL;
- gf_dirent_t *tmp = NULL;
- int ret = 0;
- loc_t entry_loc = {0};
- fd_t *fd = NULL;
- struct iatt iattr = {0};
-
- list_for_each_entry_safe (entry, tmp, &entries->list, list) {
- if (!_crawl_proceed (this, crawl_data->child,
- crawl_data->crawl_flags, NULL)) {
- ret = -1;
- goto out;
- }
- *offset = entry->d_off;
- if (IS_ENTRY_CWD (entry->d_name) ||
- IS_ENTRY_PARENT (entry->d_name))
- continue;
- if ((crawl_data->crawl == FULL) &&
- uuid_is_null (entry->d_stat.ia_gfid)) {
- gf_log (this->name, GF_LOG_WARNING, "%s/%s: No "
- "gfid present skipping",
- parentloc->path, entry->d_name);
- continue;
- }
-
- loc_wipe (&entry_loc);
- ret = afr_crawl_build_child_loc (this, &entry_loc, parentloc,
- entry, crawl_data);
- if (ret)
- goto out;
-
- ret = crawl_data->process_entry (this, crawl_data, entry,
- &entry_loc, parentloc, &iattr);
+ snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child);
+ ret = dict_get_uint64 (output, key, &count);
- if (ret)
- continue;
- ret = _link_inode_update_loc (this, &entry_loc, &iattr);
- if (ret)
- goto out;
- if (crawl_data->crawl == INDEX)
- continue;
-
- if (!IA_ISDIR (iattr.ia_type))
- continue;
- fd = NULL;
- ret = afr_crawl_opendir (this, crawl_data, &fd, &entry_loc);
- if (ret)
- continue;
- ret = _crawl_directory (fd, &entry_loc, crawl_data);
- if (fd)
- fd_unref (fd);
+ snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_uint64(output, key, healed_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not add statistics_healed_count to outout");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_uint64 (output, key, split_brain_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not add statistics_split_brain_count to outout");
+ goto out;
}
- ret = 0;
-out:
- loc_wipe (&entry_loc);
- return ret;
-}
-static int
-_crawl_directory (fd_t *fd, loc_t *loc, afr_crawl_data_t *crawl_data)
-{
- xlator_t *this = NULL;
- off_t offset = 0;
- gf_dirent_t entries;
- int ret = 0;
- gf_boolean_t free_entries = _gf_false;
- xlator_t *readdir_xl = crawl_data->readdir_xl;
+ snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_str (output, key, crawl_type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not add statistics_crawl_type to output");
+ goto out;
+ }
- INIT_LIST_HEAD (&entries.list);
- this = THIS;
+ snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_uint64 (output, key, heal_failed_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not add statistics_healed_failed_count to outout");
+ goto out;
+ }
- GF_ASSERT (loc->inode);
+ snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ ret = dict_set_dynstr (output, key, start_time_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not add statistics_crawl_start_time to outout");
+ goto out;
+ } else {
+ start_time_str = NULL;
+ }
- if (crawl_data->crawl == FULL)
- gf_log (this->name, GF_LOG_DEBUG, "crawling %s", loc->path);
+ if (!end_time_str)
+ progress = 1;
else
- gf_log (this->name, GF_LOG_DEBUG, "crawling INDEX %s",
- uuid_utoa (loc->gfid));
-
- while (1) {
- if (crawl_data->crawl == FULL)
- ret = syncop_readdirp (readdir_xl, fd, 131072, offset,
- NULL, &entries);
- else
- ret = syncop_readdir (readdir_xl, fd, 131072, offset,
- &entries);
- if (ret <= 0)
- break;
- ret = 0;
- free_entries = _gf_true;
-
- if (!_crawl_proceed (this, crawl_data->child,
- crawl_data->crawl_flags, NULL)) {
- ret = -1;
- goto out;
- }
- if (list_empty (&entries.list))
- goto out;
+ progress = 0;
+
+ snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64,
+ xl_id, child, count);
+ if (!end_time_str)
+ end_time_str = gf_strdup ("Could not determine the end time");
+ ret = dict_set_dynstr (output, key, end_time_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not add statistics_crawl_end_time to outout");
+ goto out;
+ } else {
+ end_time_str = NULL;
+ }
- ret = _process_entries (this, loc, &entries, &offset,
- crawl_data);
- gf_dirent_free (&entries);
- free_entries = _gf_false;
+ snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64,
+ xl_id, child, count);
+
+ ret = dict_set_int32 (output, key, progress);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not add statistics_inprogress to outout");
+ goto out;
}
- ret = 0;
+
+ snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child);
+ ret = dict_set_uint64 (output, key, count + 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not increment the counter.");
+ goto out;
+ }
out:
- if (free_entries)
- gf_dirent_free (&entries);
+ GF_FREE (start_time_str);
+ GF_FREE (end_time_str);
return ret;
}
-static char*
-position_str_get (afr_child_pos_t pos)
-{
- switch (pos) {
- case AFR_POS_UNKNOWN:
- return "unknown";
- case AFR_POS_LOCAL:
- return "local";
- case AFR_POS_REMOTE:
- return "remote";
- }
- return NULL;
-}
int
-afr_find_child_position (xlator_t *this, int child, afr_child_pos_t *pos)
+afr_shd_dict_add_path (xlator_t *this, dict_t *output, int child, char *path,
+ struct timeval *tv)
{
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- dict_t *xattr_rsp = NULL;
- loc_t loc = {0};
- int ret = 0;
- char *node_uuid = NULL;
+ int ret = -1;
+ uint64_t count = 0;
+ char key[256] = {0};
+ int xl_id = 0;
+
+ ret = dict_get_int32 (output, this->name, &xl_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "xl does not have id");
+ goto out;
+ }
- priv = this->private;
- shd = &priv->shd;
+ snprintf (key, sizeof (key), "%d-%d-count", xl_id, child);
+ ret = dict_get_uint64 (output, key, &count);
- afr_build_root_loc (this, &loc);
+ snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count);
+ ret = dict_set_dynstr (output, key, path);
- ret = syncop_getxattr (priv->children[child], &loc, &xattr_rsp,
- GF_XATTR_NODE_UUID_KEY);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "getxattr failed on %s - "
- "(%s)", priv->children[child]->name, strerror (errno));
+ gf_log (this->name, GF_LOG_ERROR, "%s: Could not add to output",
+ path);
goto out;
}
- ret = dict_get_str (xattr_rsp, GF_XATTR_NODE_UUID_KEY, &node_uuid);
+ if (tv) {
+ snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id,
+ child, count);
+ ret = dict_set_uint32 (output, key, tv->tv_sec);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s: Could not set time",
+ path);
+ goto out;
+ }
+ }
+
+ snprintf (key, sizeof (key), "%d-%d-count", xl_id, child);
+
+ ret = dict_set_uint64 (output, key, count + 1);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "node-uuid key not found on "
- "child %s", priv->children[child]->name);
+ gf_log (this->name, GF_LOG_ERROR, "Could not increment count");
goto out;
}
- if (!strcmp (node_uuid, shd->node_uuid))
- *pos = AFR_POS_LOCAL;
- else
- *pos = AFR_POS_REMOTE;
-
- gf_log (this->name, GF_LOG_DEBUG, "child %s is %s",
- priv->children[child]->name, position_str_get (*pos));
+ ret = 0;
out:
- if (ret)
- *pos = AFR_POS_UNKNOWN;
- loc_wipe (&loc);
return ret;
}
+
int
-afr_syncop_find_child_position (void *data)
+afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid, char **path_p)
{
- shd_pos_t *pos_data = data;
- int ret = 0;
-
- ret = afr_find_child_position (pos_data->this, pos_data->child,
- &pos_data->pos);
- return ret;
+ loc_t loc = {0,};
+ char *path = NULL;
+ dict_t *xattr = NULL;
+ int ret = 0;
+
+ uuid_copy (loc.gfid, gfid);
+ loc.inode = inode_new (this->itable);
+
+ ret = syncop_getxattr (subvol, &loc, &xattr, GFID_TO_PATH_KEY);
+ loc_wipe (&loc);
+ if (ret)
+ return ret;
+
+ ret = dict_get_str (xattr, GFID_TO_PATH_KEY, &path);
+ if (ret || !path)
+ return -EINVAL;
+
+ *path_p = gf_strdup (path);
+ if (!*path_p)
+ return -ENOMEM;
+ return 0;
}
-static int
-afr_dir_crawl (void *data)
+
+int
+afr_shd_gather_index_entries (xlator_t *this, int child, dict_t *output)
{
- xlator_t *this = NULL;
- int ret = -1;
- xlator_t *readdir_xl = NULL;
- fd_t *fd = NULL;
- loc_t dirloc = {0};
- afr_crawl_data_t *crawl_data = data;
+ fd_t *fd = NULL;
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ off_t offset = 0;
+ gf_dirent_t entries;
+ gf_dirent_t *entry = NULL;
+ uuid_t gfid;
+ int ret = 0;
+ int count = 0;
+ char *path = NULL;
+
+ priv = this->private;
+ subvol = priv->children[child];
+
+ fd = afr_shd_index_opendir (this, child);
+ if (!fd) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "unable to opendir index-dir on %s", subvol->name);
+ return -errno;
+ }
+
+ INIT_LIST_HEAD (&entries.list);
+
+ while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries))) {
+ if (ret > 0)
+ ret = 0;
+ list_for_each_entry (entry, &entries.list, list) {
+ offset = entry->d_off;
+
+ if (!strcmp (entry->d_name, ".") ||
+ !strcmp (entry->d_name, ".."))
+ continue;
+
+ gf_log (this->name, GF_LOG_DEBUG, "got entry: %s",
+ entry->d_name);
+
+ ret = uuid_parse (entry->d_name, gfid);
+ if (ret)
+ continue;
+
+ path = NULL;
+ ret = afr_shd_gfid_to_path (this, subvol, gfid, &path);
+
+ if (ret == -ENOENT || ret == -ESTALE) {
+ afr_shd_index_purge (subvol, fd->inode,
+ entry->d_name);
+ ret = 0;
+ continue;
+ }
+
+ ret = afr_shd_dict_add_path (this, output, child, path,
+ NULL);
+ }
+
+ gf_dirent_free (&entries);
+ if (ret)
+ break;
+ }
+
+ if (fd)
+ fd_unref (fd);
+ if (!ret)
+ ret = count;
+ return ret;
+}
- this = THIS;
- if (!_crawl_proceed (this, crawl_data->child, crawl_data->crawl_flags,
- NULL))
- goto out;
+int
+afr_add_shd_event (circular_buffer_t *cb, void *data)
+{
+ dict_t *output = NULL;
+ xlator_t *this = THIS;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ shd_event_t *shd_event = NULL;
+ char *path = NULL;
+
+ output = data;
+ priv = this->private;
+ shd = &priv->shd;
+ shd_event = cb->data;
+
+ if (!shd->index_healers[shd_event->child].local)
+ return 0;
+
+ path = gf_strdup (shd_event->path);
+ if (!path)
+ return -ENOMEM;
+
+ afr_shd_dict_add_path (this, output, shd_event->child, path,
+ &cb->tv);
+ return 0;
+}
- readdir_xl = afr_crawl_readdir_xl_get (this, crawl_data);
- if (!readdir_xl)
- goto out;
- crawl_data->readdir_xl = readdir_xl;
+int
+afr_add_crawl_event (circular_buffer_t *cb, void *data)
+{
+ dict_t *output = NULL;
+ xlator_t *this = THIS;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ crawl_event_t *crawl_event = NULL;
- ret = afr_crawl_build_start_loc (this, crawl_data, &dirloc);
- if (ret)
- goto out;
+ output = data;
+ priv = this->private;
+ shd = &priv->shd;
+ crawl_event = cb->data;
- ret = afr_crawl_opendir (this, crawl_data, &fd, &dirloc);
- if (ret)
- goto out;
+ if (!shd->index_healers[crawl_event->child].local)
+ return 0;
- ret = _crawl_directory (fd, &dirloc, crawl_data);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR, "Crawl failed on %s",
- readdir_xl->name);
- else
- gf_log (this->name, GF_LOG_DEBUG, "Crawl completed "
- "on %s", readdir_xl->name);
- if (crawl_data->crawl == INDEX)
- dirloc.path = NULL;
-out:
- if (fd)
- fd_unref (fd);
- if (crawl_data->crawl == INDEX)
- dirloc.path = NULL;
- loc_wipe (&dirloc);
- return ret;
+ afr_shd_dict_add_crawl_event (this, output, crawl_event);
+
+ return 0;
}
-static int
-afr_dir_exclusive_crawl (void *data)
-{
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- gf_boolean_t crawl = _gf_false;
- int ret = 0;
- int child = -1;
- xlator_t *this = NULL;
- afr_crawl_data_t *crawl_data = data;
-
- this = THIS;
- priv = this->private;
- shd = &priv->shd;
- child = crawl_data->child;
-
- LOCK (&priv->lock);
- {
- if (shd->inprogress[child]) {
- if (shd->pending[child] != FULL)
- shd->pending[child] = crawl_data->crawl;
- } else {
- shd->inprogress[child] = _gf_true;
- crawl = _gf_true;
- }
- }
- UNLOCK (&priv->lock);
- if (!crawl) {
- gf_log (this->name, GF_LOG_INFO, "Another crawl is in progress "
- "for %s", priv->children[child]->name);
+int
+afr_selfheal_daemon_init (xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ int ret = -1;
+ int i = 0;
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this);
+ if (!this->itable)
+ goto out;
+
+ shd->index_healers = GF_CALLOC (sizeof(*shd->index_healers),
+ priv->child_count,
+ gf_afr_mt_subvol_healer_t);
+ if (!shd->index_healers)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ shd->index_healers[i].subvol = i;
+ ret = afr_shd_healer_init (this, &shd->index_healers[i]);
+ if (ret)
+ goto out;
+ }
+
+ shd->full_healers = GF_CALLOC (sizeof(*shd->full_healers),
+ priv->child_count,
+ gf_afr_mt_subvol_healer_t);
+ if (!shd->full_healers)
+ goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ shd->full_healers[i].subvol = i;
+ ret = afr_shd_healer_init (this, &shd->full_healers[i]);
+ if (ret)
+ goto out;
+ }
+
+ shd->healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false,
+ afr_destroy_shd_event_data);
+ if (!shd->healed)
+ goto out;
+
+ shd->heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false,
+ afr_destroy_shd_event_data);
+ if (!shd->heal_failed)
+ goto out;
+
+ shd->split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false,
+ afr_destroy_shd_event_data);
+ if (!shd->split_brain)
+ goto out;
+
+ shd->statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count,
+ gf_common_mt_eh_t);
+ if (!shd->statistics)
goto out;
+
+ for (i = 0; i < priv->child_count ; i++) {
+ shd->statistics[i] = eh_new (AFR_STATISTICS_HISTORY_SIZE,
+ _gf_false,
+ afr_destroy_crawl_event_data);
+ if (!shd->statistics[i])
+ goto out;
+ shd->full_healers[i].crawl_event.child = i;
+ shd->full_healers[i].crawl_event.crawl_type = "FULL";
+ shd->index_healers[i].crawl_event.child = i;
+ shd->index_healers[i].crawl_event.crawl_type = "INDEX";
}
- do {
- afr_dir_crawl (data);
- LOCK (&priv->lock);
- {
- if (shd->pending[child] != NONE) {
- crawl_data->crawl = shd->pending[child];
- shd->pending[child] = NONE;
- } else {
- shd->inprogress[child] = _gf_false;
- crawl = _gf_false;
- }
- }
- UNLOCK (&priv->lock);
- } while (crawl);
+ ret = 0;
out:
- return ret;
+ return ret;
}
-void
-afr_start_crawl (xlator_t *this, int idx, afr_crawl_type_t crawl,
- process_entry_cbk_t process_entry, void *op_data,
- gf_boolean_t exclusive, int crawl_flags,
- afr_crawl_done_cbk_t crawl_done)
-{
- afr_private_t *priv = NULL;
- call_frame_t *frame = NULL;
- afr_crawl_data_t *crawl_data = NULL;
- int ret = 0;
- int (*crawler) (void*) = NULL;
-
- priv = this->private;
- frame = create_frame (this, this->ctx->pool);
- if (!frame)
- goto out;
+int
+afr_selfheal_childup (xlator_t *this, int subvol)
+{
+ afr_shd_index_healer_spawn (this, subvol);
- afr_set_lk_owner (frame, this, frame->root);
- afr_set_low_priority (frame);
- crawl_data = GF_CALLOC (1, sizeof (*crawl_data),
- gf_afr_mt_crawl_data_t);
- if (!crawl_data)
- goto out;
- crawl_data->process_entry = process_entry;
- crawl_data->child = idx;
- crawl_data->pid = frame->root->pid;
- crawl_data->crawl = crawl;
- crawl_data->op_data = op_data;
- crawl_data->crawl_flags = crawl_flags;
- gf_log (this->name, GF_LOG_DEBUG, "starting crawl %d for %s",
- crawl_data->crawl, priv->children[idx]->name);
-
- if (exclusive)
- crawler = afr_dir_exclusive_crawl;
- else
- crawler = afr_dir_crawl;
- ret = synctask_new (this->ctx->env, crawler,
- crawl_done, frame, crawl_data);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR, "Could not create the "
- "task for %d ret %d", idx, ret);
-out:
- return;
+ return 0;
}
-void
-afr_build_root_loc (xlator_t *this, loc_t *loc)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- loc->path = gf_strdup ("/");
- loc->name = "";
- loc->inode = inode_ref (priv->root_inode);
- uuid_copy (loc->gfid, loc->inode->gfid);
+int64_t
+afr_shd_get_index_count (xlator_t *this, int i)
+{
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ uint64_t count = 0;
+ loc_t rootloc = {0, };
+ dict_t *xattr = NULL;
+ int ret = -1;
+
+ priv = this->private;
+ subvol = priv->children[i];
+
+ rootloc.inode = inode_ref (this->itable->root);
+ uuid_copy (rootloc.gfid, rootloc.inode->gfid);
+
+ ret = syncop_getxattr (subvol, &rootloc, &xattr,
+ GF_XATTROP_INDEX_COUNT);
+ loc_wipe (&rootloc);
+
+ if (ret < 0)
+ return -1;
+
+ ret = dict_get_uint64 (xattr, GF_XATTROP_INDEX_COUNT, &count);
+ if (ret)
+ return -1;
+ return count;
}
+
int
-afr_set_root_gfid (dict_t *dict)
+afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
{
- uuid_t gfid;
- int ret = 0;
+ gf_xl_afr_op_t op = GF_AFR_OP_INVALID;
+ int ret = 0;
+ int xl_id = 0;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ struct subvol_healer *healer = NULL;
+ int i = 0;
+ char key[64];
+ int op_ret = 0;
+ int64_t cnt = 0;
- memset (gfid, 0, 16);
- gfid[15] = 1;
+ priv = this->private;
+ shd = &priv->shd;
- ret = afr_set_dict_gfid (dict, gfid);
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == -1)
+ goto out;
- return ret;
-}
+ ret = dict_get_int32 (input, "xl-op", (int32_t*)&op);
+ if (ret)
+ goto out;
+ ret = dict_get_int32 (input, this->name, &xl_id);
+ if (ret)
+ goto out;
+ ret = dict_set_int32 (output, this->name, xl_id);
+ if (ret)
+ goto out;
+ switch (op) {
+ case GF_AFR_OP_HEAL_INDEX:
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->index_healers[i];
+ snprintf (key, 64, "%d-%d-status", xl_id, i);
+
+ if (!priv->child_up[i]) {
+ ret = dict_set_str (output, key,
+ "Brick is not connected");
+ } else if (AFR_COUNT (priv->child_up,
+ priv->child_count) < 2) {
+ ret = dict_set_str (output, key,
+ "< 2 bricks in replica are up");
+ } else if (!afr_shd_is_subvol_local (this, healer->subvol)) {
+ ret = dict_set_str (output, key,
+ "Brick is remote");
+ } else {
+ ret = dict_set_str (output, key,
+ "Started self-heal");
+ afr_shd_index_healer_spawn (this, i);
+ op_ret = 0;
+ }
+ }
+ break;
+ case GF_AFR_OP_HEAL_FULL:
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->full_healers[i];
+ snprintf (key, 64, "%d-%d-status", xl_id, i);
+
+ if (!priv->child_up[i]) {
+ ret = dict_set_str (output, key,
+ "Brick is not connected");
+ } else if (AFR_COUNT (priv->child_up,
+ priv->child_count) < 2) {
+ ret = dict_set_str (output, key,
+ "< 2 bricks in replica are up");
+ } else if (!afr_shd_is_subvol_local (this, healer->subvol)) {
+ ret = dict_set_str (output, key,
+ "Brick is remote");
+ } else {
+ ret = dict_set_str (output, key,
+ "Started self-heal");
+ afr_shd_full_healer_spawn (this, i);
+ op_ret = 0;
+ }
+ }
+ break;
+ case GF_AFR_OP_INDEX_SUMMARY:
+ for (i = 0; i < priv->child_count; i++)
+ if (shd->index_healers[i].local)
+ afr_shd_gather_index_entries (this, i, output);
+ break;
+ case GF_AFR_OP_HEALED_FILES:
+ eh_dump (shd->healed, output, afr_add_shd_event);
+ break;
+ case GF_AFR_OP_HEAL_FAILED_FILES:
+ eh_dump (shd->heal_failed, output, afr_add_shd_event);
+ break;
+ case GF_AFR_OP_SPLIT_BRAIN_FILES:
+ eh_dump (shd->split_brain, output, afr_add_shd_event);
+ break;
+ case GF_AFR_OP_STATISTICS:
+ for (i = 0; i < priv->child_count; i++) {
+ eh_dump (shd->statistics[i], output,
+ afr_add_crawl_event);
+ afr_shd_dict_add_crawl_event (this, output,
+ &shd->index_healers[i].crawl_event);
+ afr_shd_dict_add_crawl_event (this, output,
+ &shd->full_healers[i].crawl_event);
+ }
+ break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT:
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->child_up[i]) {
+ snprintf (key, 64, "%d-%d-status", xl_id, i);
+ ret = dict_set_str (output, key,
+ "Brick is not connected");
+ } else {
+ snprintf (key, 64, "%d-%d-hardlinks", xl_id, i);
+ cnt = afr_shd_get_index_count (this, i);
+ if (cnt >= 0) {
+ ret = dict_set_uint64 (output, key, cnt);
+ }
+ op_ret = 0;
+ }
+ }
+
+// ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED,
+// STATISTICS_TO_BE_HEALED,
+// output);
+ break;
+ default:
+ gf_log (this->name, GF_LOG_ERROR, "Unknown set op %d", op);
+ break;
+ }
+out:
+ dict_del (output, this->name);
+ return op_ret;
+}
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
index 32a8aaca5..10e229ee7 100644
--- a/xlators/cluster/afr/src/afr-self-heald.h
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -8,45 +8,65 @@
cases as published by the Free Software Foundation.
*/
-#ifndef __AFR_SELF_HEALD_H__
-#define __AFR_SELF_HEALD_H__
-#include "xlator.h"
-#define IS_ROOT_PATH(path) (!strcmp (path, "/"))
-#define IS_ENTRY_CWD(entry) (!strcmp (entry, "."))
-#define IS_ENTRY_PARENT(entry) (!strcmp (entry, ".."))
-#define AFR_ALL_CHILDREN -1
+#ifndef _AFR_SELF_HEALD_H
+#define _AFR_SELF_HEALD_H
-typedef struct afr_crawl_data_ {
- int child;
- pid_t pid;
- afr_crawl_type_t crawl;
- xlator_t *readdir_xl;
- void *op_data;
- int crawl_flags;
- int (*process_entry) (xlator_t *this, struct afr_crawl_data_ *crawl_data,
- gf_dirent_t *entry, loc_t *child, loc_t *parent,
- struct iatt *iattr);
-} afr_crawl_data_t;
+#include <pthread.h>
-typedef int (*process_entry_cbk_t) (xlator_t *this, afr_crawl_data_t *crawl_data,
- gf_dirent_t *entry, loc_t *child, loc_t *parent,
- struct iatt *iattr);
-void afr_build_root_loc (xlator_t *this, loc_t *loc);
+typedef struct {
+ int child;
+ char *path;
+} shd_event_t;
-int afr_set_root_gfid (dict_t *dict);
+typedef struct {
+ int child;
+ uint64_t healed_count;
+ uint64_t split_brain_count;
+ uint64_t heal_failed_count;
+
+ /* If start_time is 0, it means crawler is not in progress
+ and stats are not valid */
+ time_t start_time;
+ /* If start_time is NOT 0 and end_time is 0, it means
+ cralwer is in progress */
+ time_t end_time;
+ char *crawl_type;
+} crawl_event_t;
+
+struct subvol_healer {
+ xlator_t *this;
+ int subvol;
+ gf_boolean_t local;
+ gf_boolean_t running;
+ gf_boolean_t rerun;
+ crawl_event_t crawl_event;
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ pthread_t thread;
+};
+
+typedef struct {
+ gf_boolean_t iamshd;
+ gf_boolean_t enabled;
+ struct subvol_healer *index_healers;
+ struct subvol_healer *full_healers;
+
+ eh_t *healed;
+ eh_t *heal_failed;
+ eh_t *split_brain;
+ eh_t **statistics;
+} afr_self_heald_t;
-void
-afr_proactive_self_heal (void *data);
int
-afr_xl_op (xlator_t *this, dict_t *input, dict_t *output);
+afr_selfheal_childup (xlator_t *this, int subvol);
+
+int
+afr_selfheal_daemon_init (xlator_t *this);
-/*
- * In addition to its self-heal use, this is used to find a local default
- * read_child.
- */
int
-afr_local_pathinfo (char *pathinfo, gf_boolean_t *local);
-#endif /* __AFR_SELF_HEALD_H__ */
+afr_xl_op (xlator_t *this, dict_t *input, dict_t *output);
+
+#endif /* !_AFR_SELF_HEALD_H */
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 265d93b61..205ff759e 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -18,189 +18,156 @@
#include <signal.h>
+gf_boolean_t
+afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this);
+
+int
+afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr,
+ afr_changelog_resume_t changelog_resume);
-#define LOCKED_NO 0x0 /* no lock held */
-#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path
- of RENAME */
-#define LOCKED_LOWER 0x2 /* for lower_path of RENAME */
-afr_fd_ctx_t *
-__afr_fd_ctx_get (fd_t *fd, xlator_t *this)
+int
+__afr_txn_write_fop (call_frame_t *frame, xlator_t *this)
{
- uint64_t ctx = 0;
- int ret = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int i = 0;
+ afr_local_t *local = NULL;
afr_private_t *priv = NULL;
+ int call_count = -1;
+ int i = 0;
+ local = frame->local;
priv = this->private;
- ret = __fd_ctx_get (fd, this, &ctx);
-
- if (ret < 0 && fd_is_anonymous (fd)) {
- ret = __afr_fd_ctx_set (this, fd);
- if (ret < 0)
- goto out;
+ call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);
- ret = __fd_ctx_get (fd, this, &ctx);
- if (ret < 0)
- goto out;
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
- for (i = 0; i < priv->child_count; i++)
- fd_ctx->opened_on[i] = AFR_FD_OPENED;
+ if (call_count == 0) {
+ local->transaction.resume (frame, this);
+ return 0;
}
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-out:
- return fd_ctx;
-}
-
+ local->call_count = call_count;
-afr_fd_ctx_t *
-afr_fd_ctx_get (fd_t *fd, xlator_t *this)
-{
- afr_fd_ctx_t *fd_ctx = NULL;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i]) {
+ local->transaction.wind (frame, this, i);
- LOCK(&fd->lock);
- {
- fd_ctx = __afr_fd_ctx_get (fd, this);
+ if (!--call_count)
+ break;
+ }
}
- UNLOCK(&fd->lock);
- return fd_ctx;
+ return 0;
}
-static void
-afr_save_lk_owner (call_frame_t *frame)
+int
+__afr_txn_write_done (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
local = frame->local;
- local->saved_lk_owner = frame->root->lk_owner;
-}
+ local->transaction.unwind (frame, this);
+ AFR_STACK_DESTROY (frame);
-static void
-afr_restore_lk_owner (call_frame_t *frame)
-{
- afr_local_t * local = NULL;
-
- local = frame->local;
-
- frame->root->lk_owner = local->saved_lk_owner;
+ return 0;
}
-static void
-__mark_all_pending (int32_t *pending[], int child_count,
- afr_transaction_type type)
+call_frame_t*
+afr_transaction_detach_fop_frame (call_frame_t *frame)
{
- int i = 0;
- int j = 0;
-
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
- pending[i][j] = hton32 (1);
- }
-}
-
+ afr_local_t * local = NULL;
+ call_frame_t *fop_frame = NULL;
-static void
-__mark_child_dead (int32_t *pending[], int child_count, int child,
- afr_transaction_type type)
-{
- int j = 0;
+ local = frame->local;
- j = afr_index_for_transaction_type (type);
+ LOCK (&frame->lock);
+ {
+ fop_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK (&frame->lock);
- pending[child][j] = 0;
+ return fop_frame;
}
static void
-__mark_pre_op_done_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
+afr_save_lk_owner (call_frame_t *frame)
{
- afr_local_t *local = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
+ afr_local_t * local = NULL;
local = frame->local;
- if (!local->fd)
- return;
-
- fd_ctx = afr_fd_ctx_get (local->fd, this);
-
- if (!fd_ctx)
- goto out;
-
- LOCK (&local->fd->lock);
- {
- if (local->transaction.type == AFR_DATA_TRANSACTION)
- fd_ctx->pre_op_done[child_index]++;
- }
- UNLOCK (&local->fd->lock);
-out:
- return;
+ local->saved_lk_owner = frame->root->lk_owner;
}
static void
-__mark_pre_op_undone_on_fd (call_frame_t *frame, xlator_t *this, int child_index)
+afr_restore_lk_owner (call_frame_t *frame)
{
- afr_local_t *local = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
+ afr_local_t * local = NULL;
local = frame->local;
- if (!local->fd)
- return;
-
- fd_ctx = afr_fd_ctx_get (local->fd, this);
-
- if (!fd_ctx)
- goto out;
-
- LOCK (&local->fd->lock);
- {
- if (local->transaction.type == AFR_DATA_TRANSACTION)
- fd_ctx->pre_op_done[child_index]--;
- }
- UNLOCK (&local->fd->lock);
-out:
- return;
+ frame->root->lk_owner = local->saved_lk_owner;
}
-
-static void
-__mark_non_participant_children (int32_t *pending[], int child_count,
- unsigned char *participants,
- afr_transaction_type type)
+void
+__mark_all_success (call_frame_t *frame, xlator_t *this)
{
- int i = 0;
- int j = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i;
- j = afr_index_for_transaction_type (type);
- for (i = 0; i < child_count; i++) {
- if (!participants[i])
- pending[i][j] = 0;
- }
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ local->transaction.failed_subvols[i] = 0;
+ }
}
-static void
-__mark_all_success (int32_t *pending[], int child_count,
- afr_transaction_type type)
+int
+afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
{
- int i;
- int j;
+ afr_local_t *local = NULL;
+ fd_t *fd = NULL;
- for (i = 0; i < child_count; i++) {
- j = afr_index_for_transaction_type (type);
- pending[i][j] = hton32 (-1);
- }
+ local = frame->local;
+ fd = local->fd;
+
+ /* Perform fops with the lk-owner from top xlator.
+ * Eg: lk-owner of posix-lk and flush should be same,
+ * flush cant clear the posix-lks without that lk-owner.
+ */
+ afr_save_lk_owner (frame);
+ frame->root->lk_owner =
+ local->transaction.main_frame->root->lk_owner;
+
+ if (local->pre_op_compat)
+ /* old mode, pre-op was done as afr_changelog_do()
+ just now, before OP */
+ afr_changelog_pre_op_update (frame, this);
+
+ /* The wake up needs to happen independent of
+ what type of fop arrives here. If it was
+ a write, then it has already inherited the
+ lock and changelog. If it was not a write,
+ then the presumption of the optimization (of
+ optimizing for successive write operations)
+ fails.
+ */
+ if (fd)
+ afr_delayed_changelog_wake_up (this, fd);
+ local->transaction.fop (frame, this);
+
+ return 0;
}
@@ -266,39 +233,28 @@ __fop_changelog_needed (call_frame_t *frame, xlator_t *this)
return op_ret;
}
+
int
-afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending,
- int child, afr_xattrop_type_t op)
+afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending)
{
int i = 0;
int ret = 0;
+ int pending_zero[AFR_NUM_CHANGE_LOGS] = {0, };
- if (op == LOCAL_FIRST) {
- ret = dict_set_static_bin (xattr, priv->pending_key[child],
- pending[child],
- AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
- if (ret)
- goto out;
- }
for (i = 0; i < priv->child_count; i++) {
- if (i == child)
- continue;
+ if (!memcmp (pending_zero, pending[i], sizeof (pending_zero)))
+ /* don't set xattrs for non-pending servers */
+ continue;
+
ret = dict_set_static_bin (xattr, priv->pending_key[i],
- pending[i],
- AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
+ pending[i],
+ AFR_NUM_CHANGE_LOGS * sizeof (int));
/* 3 = data+metadata+entry */
- if (ret < 0)
- goto out;
- }
- if (op == LOCAL_LAST) {
- ret = dict_set_static_bin (xattr, priv->pending_key[child],
- pending[child],
- AFR_NUM_CHANGE_LOGS * sizeof (int32_t));
if (ret)
- goto out;
+ break;
}
-out:
+
return ret;
}
@@ -327,475 +283,532 @@ afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
/* {{{ pending */
-int32_t
-afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr,
- dict_t *xdata)
+
+int
+afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this)
{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int call_count = -1;
- priv = this->private;
- local = frame->local;
+ local = frame->local;
+ priv = this->private;
int_lock = &local->internal_lock;
- LOCK (&frame->lock);
- {
- call_count = --local->call_count;
- }
- UNLOCK (&frame->lock);
+ if (local->transaction.resume_stub) {
+ call_resume (local->transaction.resume_stub);
+ local->transaction.resume_stub = NULL;
+ }
- if (call_count == 0) {
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.done (frame, this);
- } else {
- int_lock->lock_cbk = local->transaction.done;
- afr_unlock (frame, this);
- }
- }
+ if (afr_lock_server_count (priv, local->transaction.type) == 0) {
+ local->transaction.done (frame, this);
+ } else {
+ int_lock->lock_cbk = local->transaction.done;
+ afr_unlock (frame, this);
+ }
- return 0;
+ return 0;
}
-void
-afr_transaction_rm_stale_children (call_frame_t *frame, xlator_t *this,
- inode_t *inode, afr_transaction_type type)
+afr_inodelk_t*
+afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom)
{
- int i = -1;
- int count = 0;
- int read_child = -1;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int **pending = NULL;
- int idx = 0;
- int32_t *stale_children = NULL;
- int32_t *fresh_children = NULL;
- gf_boolean_t rm_stale_children = _gf_false;
+ afr_inodelk_t *inodelk = NULL;
+ int i = 0;
- idx = afr_index_for_transaction_type (type);
-
- priv = this->private;
- local = frame->local;
- pending = local->pending;
-
- if (local->op_ret < 0)
- goto out;
- fresh_children = local->fresh_children;
- read_child = afr_inode_get_read_ctx (this, inode, fresh_children);
- if (read_child < 0) {
- gf_log (this->name, GF_LOG_DEBUG, "Possible split-brain "
- "for %s", uuid_utoa (inode->gfid));
- goto out;
+ for (i = 0; int_lock->inodelk[i].domain; i++) {
+ inodelk = &int_lock->inodelk[i];
+ if (strcmp (dom, inodelk->domain) == 0)
+ return inodelk;
}
-
- for (i = 0; i < priv->child_count; i++) {
- if (!afr_is_child_present (fresh_children,
- priv->child_count, i))
- continue;
- if (pending[i][idx])
- continue;
- /* child is down or op failed on it */
- if (!stale_children)
- stale_children = afr_children_create (priv->child_count);
- if (!stale_children)
- goto out;
-
- rm_stale_children = _gf_true;
- stale_children[count++] = i;
- gf_log (this->name, GF_LOG_DEBUG, "Removing stale child "
- "%d for %s", i, uuid_utoa (inode->gfid));
- }
-
- if (!rm_stale_children)
- goto out;
-
- afr_inode_rm_stale_children (this, inode, stale_children);
-out:
- GF_FREE (stale_children);
- return;
+ return NULL;
}
unsigned char*
afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)
{
unsigned char *locked_nodes = NULL;
+ afr_inodelk_t *inodelk = NULL;
switch (type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- locked_nodes = int_lock->inode_locked_nodes;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ locked_nodes = inodelk->locked_nodes;
break;
case AFR_ENTRY_TRANSACTION:
case AFR_ENTRY_RENAME_TRANSACTION:
- locked_nodes = int_lock->entry_locked_nodes;
+ /*Because same set of subvols participate in all lockee
+ * entities*/
+ locked_nodes = int_lock->lockee[0].locked_nodes;
break;
}
return locked_nodes;
}
+
int
-afr_changelog_pre_op_call_count (afr_transaction_type type,
- afr_internal_lock_t *int_lock,
- unsigned int child_count)
+afr_changelog_call_count (afr_transaction_type type,
+ unsigned char *pre_op_subvols,
+ unsigned int child_count)
{
- int call_count = 0;
- unsigned char *locked_nodes = NULL;
+ int call_count = 0;
- locked_nodes = afr_locked_nodes_get (type, int_lock);
- GF_ASSERT (locked_nodes);
+ call_count = AFR_COUNT(pre_op_subvols, child_count);
- call_count = afr_locked_children_count (locked_nodes, child_count);
if (type == AFR_ENTRY_RENAME_TRANSACTION)
call_count *= 2;
return call_count;
}
-int
-afr_changelog_post_op_call_count (afr_transaction_type type,
- unsigned char *pre_op,
- unsigned int child_count)
-{
- int call_count = 0;
- call_count = afr_pre_op_done_children_count (pre_op, child_count);
- if (type == AFR_ENTRY_RENAME_TRANSACTION)
- call_count *= 2;
+gf_boolean_t
+afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
- return call_count;
-}
+ local = frame->local;
+ priv = this->private;
-void
-afr_compute_txn_changelog (afr_local_t *local, afr_private_t *priv)
-{
- int i = 0;
- int index = 0;
- int32_t postop = 0;
- int32_t preop = 1;
- int32_t **txn_changelog = NULL;
-
- txn_changelog = local->transaction.txn_changelog;
- index = afr_index_for_transaction_type (local->transaction.type);
for (i = 0; i < priv->child_count; i++) {
- postop = ntoh32 (local->pending[i][index]);
- txn_changelog[i][index] = hton32 (postop + preop);
+ if (local->transaction.failed_subvols[i])
+ return _gf_false;
}
-}
-
-afr_xattrop_type_t
-afr_get_postop_xattrop_type (int32_t **pending, int optimized, int child,
- afr_transaction_type type)
-{
- int index = 0;
- afr_xattrop_type_t op = LOCAL_LAST;
- index = afr_index_for_transaction_type (type);
- if (optimized && !pending[child][index])
- op = LOCAL_FIRST;
- return op;
+ return _gf_true;
}
+
void
-afr_set_postop_dict (afr_local_t *local, xlator_t *this, dict_t *xattr,
- int optimized, int child)
+afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this)
{
- int32_t **txn_changelog = NULL;
- int32_t **changelog = NULL;
- afr_private_t *priv = NULL;
- int ret = 0;
- afr_xattrop_type_t op = LOCAL_LAST;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int op_errno = 0;
+ int i_errno = 0;
+ gf_boolean_t matching_errors = _gf_true;
+ int i = 0;
- priv = this->private;
- txn_changelog = local->transaction.txn_changelog;
- op = afr_get_postop_xattrop_type (local->pending, optimized, child,
- local->transaction.type);
- if (optimized)
- changelog = txn_changelog;
- else
- changelog = local->pending;
- ret = afr_set_pending_dict (priv, xattr, changelog, child, op);
- if (ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set pending entry");
+ priv = this->private;
+ local = frame->local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret != -1) {
+ /* Operation succeeded on at least on subvol,
+ so it is not a failed-everywhere situation.
+ */
+ matching_errors = _gf_false;
+ break;
+ }
+ i_errno = local->replies[i].op_errno;
+
+ if (i_errno == ENOTCONN) {
+ /* ENOTCONN is not a symmetric error. We do not
+ know if the operation was performed on the
+ backend or not.
+ */
+ matching_errors = _gf_false;
+ break;
+ }
+
+ if (!op_errno) {
+ op_errno = i_errno;
+ } else if (op_errno != i_errno) {
+ /* Mismatching op_errno's */
+ matching_errors = _gf_false;
+ break;
+ }
+ }
+
+ if (matching_errors)
+ __mark_all_success (frame, this);
}
+
int
afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
{
afr_private_t * priv = this->private;
- afr_internal_lock_t *int_lock = NULL;
int i = 0;
- int call_count = 0;
-
+ int ret = 0;
+ int idx = 0;
afr_local_t * local = NULL;
- afr_fd_ctx_t *fdctx = NULL;
- dict_t **xattr = NULL;
- int piggyback = 0;
- int index = 0;
+ dict_t *xattr = NULL;
int nothing_failed = 1;
+ gf_boolean_t need_undirty = _gf_false;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ idx = afr_index_for_transaction_type (local->transaction.type);
- __mark_non_participant_children (local->pending, priv->child_count,
- local->transaction.pre_op,
- local->transaction.type);
+ nothing_failed = afr_txn_nothing_failed (frame, this);
- if (local->fd)
- afr_transaction_rm_stale_children (frame, this,
- local->fd->inode,
- local->transaction.type);
+ if (afr_changelog_pre_op_uninherit (frame, this))
+ need_undirty = _gf_false;
+ else
+ need_undirty = _gf_true;
- xattr = alloca (priv->child_count * sizeof (*xattr));
- memset (xattr, 0, (priv->child_count * sizeof (*xattr)));
- for (i = 0; i < priv->child_count; i++) {
- xattr[i] = dict_new ();
- }
+ if (nothing_failed && !need_undirty) {
+ afr_changelog_post_op_done (frame, this);
+ goto out;
+ }
- call_count = afr_changelog_post_op_call_count (local->transaction.type,
- local->transaction.pre_op,
- priv->child_count);
- local->call_count = call_count;
+ xattr = dict_new ();
+ if (!xattr) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ afr_changelog_post_op_done (frame, this);
+ goto out;
+ }
- if (local->fd)
- fdctx = afr_fd_ctx_get (local->fd, this);
+ if (need_undirty) {
+ local->dirty[idx] = hton32(-1);
- if (call_count == 0) {
- /* no child is up */
- int_lock->lock_cbk = local->transaction.done;
- afr_unlock (frame, this);
- goto out;
- }
+ ret = dict_set_static_bin (xattr, AFR_DIRTY, local->dirty,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ afr_changelog_post_op_done (frame, this);
+ goto out;
+ }
- /* check if something has failed, to handle piggybacking */
- nothing_failed = 1;
- index = afr_index_for_transaction_type (local->transaction.type);
- for (i = 0; i < priv->child_count; i++) {
- if (local->pending[i][index] == 0) {
- nothing_failed = 0;
- break;
- }
- }
+ }
- afr_compute_txn_changelog (local , priv);
+ if (!nothing_failed) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ local->pending[i][idx] = hton32(1);
+ }
+ ret = afr_set_pending_dict (priv, xattr, local->pending);
+ if (ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ afr_changelog_post_op_done (frame, this);
+ goto out;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.pre_op[i])
- continue;
+ }
- if (local->transaction.type != AFR_DATA_TRANSACTION)
- afr_set_postop_dict (local, this, xattr[i],
- local->optimistic_change_log, i);
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- {
- if (!fdctx) {
- afr_set_postop_dict (local, this, xattr[i],
- 0, i);
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- break;
- }
+ afr_changelog_do (frame, this, xattr, afr_changelog_post_op_done);
+out:
+ if (xattr)
+ dict_unref (xattr);
- LOCK (&local->fd->lock);
- {
- piggyback = 0;
- if (fdctx->pre_op_piggyback[i]) {
- fdctx->pre_op_piggyback[i]--;
- piggyback = 1;
- }
- }
- UNLOCK (&local->fd->lock);
+ return 0;
+}
- afr_set_postop_dict (local, this, xattr[i],
- piggyback, i);
- if (nothing_failed && piggyback) {
- afr_changelog_post_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i], NULL);
- } else {
- __mark_pre_op_undone_on_fd (frame, this, i);
- STACK_WIND_COOKIE (frame,
- afr_changelog_post_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
- }
- break;
- case AFR_METADATA_TRANSACTION:
- {
- if (nothing_failed && local->optimistic_change_log) {
- afr_changelog_post_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i],
- NULL);
- break;
- }
+gf_boolean_t
+afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ fd_t *fd = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int type = 0;
- if (local->fd)
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- else
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
- break;
+ local = frame->local;
+ priv = this->private;
+ fd = local->fd;
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- if (nothing_failed && local->optimistic_change_log) {
- afr_changelog_post_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i],
- NULL);
- } else {
- STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
- call_count--;
- }
+ type = afr_index_for_transaction_type (local->transaction.type);
+ if (type != AFR_DATA_TRANSACTION)
+ return !local->transaction.dirtied;
- /*
- set it again because previous stack_wind
- might have already returned (think of case
- where subvolume is posix) and would have
- used the dict as placeholder for return
- value
- */
+ if (!fd)
+ return !local->transaction.dirtied;
- afr_set_postop_dict (local, this, xattr[i],
- local->optimistic_change_log, i);
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ return _gf_false;
- /* fall through */
+ if (local->transaction.no_uninherit)
+ return _gf_false;
- case AFR_ENTRY_TRANSACTION:
- {
- if (nothing_failed && local->optimistic_change_log) {
- afr_changelog_post_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i],
- NULL);
- break;
- }
+ /* This function must be idempotent. So check if we
+ were called before and return the same answer again.
- if (local->fd)
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- else
- STACK_WIND (frame, afr_changelog_post_op_cbk,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
- break;
- }
+ It is important to keep this function idempotent for
+ the call in afr_changelog_post_op_safe() to not have
+ side effects on the call from afr_changelog_post_op_now()
+ */
+ if (local->transaction.uninherit_done)
+ return local->transaction.uninherit_value;
- if (!--call_count)
- break;
- }
+ LOCK(&fd->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] !=
+ fd_ctx->pre_op_done[type][i]) {
+ ret = !local->transaction.dirtied;
+ goto unlock;
+ }
+ }
+
+ if (fd_ctx->inherited[type]) {
+ ret = _gf_true;
+ fd_ctx->inherited[type]--;
+ } else if (fd_ctx->on_disk[type]) {
+ ret = _gf_false;
+ fd_ctx->on_disk[type]--;
+ } else {
+ /* ASSERT */
+ ret = _gf_false;
+ }
+
+ if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) {
+ for (i = 0; i < priv->child_count; i++)
+ fd_ctx->pre_op_done[type][i] = 0;
+ }
+ }
+unlock:
+ UNLOCK(&fd->lock);
-out:
- for (i = 0; i < priv->child_count; i++) {
- dict_unref (xattr[i]);
- }
+ local->transaction.uninherit_done = _gf_true;
+ local->transaction.uninherit_value = ret;
- return 0;
+ return ret;
}
-int32_t
-afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr,
- dict_t *xdata)
+gf_boolean_t
+afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = this->private;
- int call_count = -1;
- int child_index = (long) cookie;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ fd_t *fd = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int type = 0;
+
+ local = frame->local;
+ priv = this->private;
+ fd = local->fd;
+
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ return _gf_false;
+
+ type = afr_index_for_transaction_type (local->transaction.type);
+
+ if (!fd)
+ return _gf_false;
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ return _gf_false;
+
+ LOCK(&fd->lock);
+ {
+ if (!fd_ctx->on_disk[type]) {
+ /* nothing to inherit yet */
+ ret = _gf_false;
+ goto unlock;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] !=
+ fd_ctx->pre_op_done[type][i]) {
+ /* either inherit exactly, or don't */
+ ret = _gf_false;
+ goto unlock;
+ }
+ }
+
+ fd_ctx->inherited[type]++;
+
+ ret = _gf_true;
+
+ local->transaction.inherited = _gf_true;
+ }
+unlock:
+ UNLOCK(&fd->lock);
+
+ return ret;
+}
+
+
+gf_boolean_t
+afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ fd_t *fd = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
+
+ local = frame->local;
+ priv = this->private;
+ fd = local->fd;
+
+ if (!fd)
+ return _gf_false;
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ return _gf_false;
+
+ if (local->transaction.inherited)
+ /* was already inherited in afr_changelog_pre_op */
+ return _gf_false;
+
+ if (!local->transaction.dirtied)
+ return _gf_false;
+
+ if (!afr_txn_nothing_failed (frame, this))
+ return _gf_false;
+
+ type = afr_index_for_transaction_type (local->transaction.type);
+
+ ret = _gf_false;
+
+ LOCK(&fd->lock);
+ {
+ if (!fd_ctx->on_disk[type]) {
+ for (i = 0; i < priv->child_count; i++)
+ fd_ctx->pre_op_done[type][i] =
+ local->transaction.pre_op[i];
+ } else {
+ for (i = 0; i < priv->child_count; i++)
+ if (fd_ctx->pre_op_done[type][i] !=
+ local->transaction.pre_op[i]) {
+ local->transaction.no_uninherit = 1;
+ goto unlock;
+ }
+ }
+ fd_ctx->on_disk[type]++;
+
+ ret = _gf_true;
+ }
+unlock:
+ UNLOCK(&fd->lock);
+
+ return ret;
+}
+
+
+int
+afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
local = frame->local;
- LOCK (&frame->lock);
- {
- switch (op_ret) {
- case 0:
- __mark_pre_op_done_on_fd (frame, this, child_index);
- //fallthrough we need to mark the pre_op
- case 1:
- local->transaction.pre_op[child_index] = 1;
- /* special op_ret for piggyback */
- break;
- case -1:
- if (op_errno == ENOTSUP) {
- gf_log (this->name, GF_LOG_ERROR,
- "xattrop not supported by %s",
- priv->children[child_index]->name);
- local->op_ret = -1;
-
- } else if (!child_went_down (op_ret, op_errno)) {
- gf_log (this->name, GF_LOG_ERROR,
- "xattrop failed on child %s: %s",
- priv->children[child_index]->name,
- strerror (op_errno));
+ if (op_ret == -1)
+ afr_transaction_fop_failed (frame, this, (long) cookie);
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ local->transaction.changelog_resume (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr,
+ afr_changelog_resume_t changelog_resume)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = afr_changelog_call_count (local->transaction.type,
+ local->transaction.pre_op,
+ priv->child_count);
+
+ if (call_count == 0) {
+ changelog_resume (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ local->transaction.changelog_resume = changelog_resume;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
+
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ if (!local->fd) {
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->loc,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
+ } else {
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
}
- local->op_errno = op_errno;
- break;
- }
+ break;
+ case AFR_ENTRY_RENAME_TRANSACTION:
- call_count = --local->call_count;
- }
- UNLOCK (&frame->lock);
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
+ call_count--;
- if (call_count == 0) {
- if ((local->op_ret == -1) &&
- (local->op_errno == ENOTSUP)) {
- local->transaction.resume (frame, this);
- } else {
- __mark_all_success (local->pending, priv->child_count,
- local->transaction.type);
-
- /* Perform fops with the lk-owner from top xlator.
- * Eg: lk-owner of posix-lk and flush should be same,
- * flush cant clear the posix-lks without that lk-owner.
- */
- afr_save_lk_owner (frame);
- frame->root->lk_owner =
- local->transaction.main_frame->root->lk_owner;
-
- local->transaction.fop (frame, this);
- }
+ /* fall through */
+
+ case AFR_ENTRY_TRANSACTION:
+ if (local->fd)
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->fxattrop,
+ local->fd,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
+ else
+ STACK_WIND_COOKIE (frame, afr_changelog_cbk,
+ (void *) (long) i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr,
+ NULL);
+ break;
+ }
+
+ if (!--call_count)
+ break;
}
- return 0;
+ return 0;
}
+
int
afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
{
@@ -803,206 +816,122 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
int i = 0;
int ret = 0;
int call_count = 0;
- dict_t **xattr = NULL;
- afr_fd_ctx_t *fdctx = NULL;
+ int op_errno = 0;
afr_local_t *local = NULL;
- int piggyback = 0;
afr_internal_lock_t *int_lock = NULL;
unsigned char *locked_nodes = NULL;
+ unsigned char *pending_subvols = NULL;
+ int idx = -1;
+ gf_boolean_t pre_nop = _gf_true;
+ dict_t *xdata_req = NULL;
local = frame->local;
int_lock = &local->internal_lock;
+ idx = afr_index_for_transaction_type (local->transaction.type);
- xattr = alloca (priv->child_count * sizeof (*xattr));
- memset (xattr, 0, (priv->child_count * sizeof (*xattr)));
-
- for (i = 0; i < priv->child_count; i++) {
- xattr[i] = dict_new ();
- }
-
- call_count = afr_changelog_pre_op_call_count (local->transaction.type,
- int_lock,
- priv->child_count);
- if (call_count == 0) {
- local->internal_lock.lock_cbk =
- local->transaction.done;
- afr_unlock (frame, this);
- goto out;
- }
+ locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock);
- local->call_count = call_count;
+ pending_subvols = alloca0 (priv->child_count);
- __mark_all_pending (local->pending, priv->child_count,
- local->transaction.type);
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_nodes[i]) {
+ local->transaction.pre_op[i] = 1;
+ call_count++;
+ } else {
+ pending_subvols[i] = 1;
+ }
+ }
- if (local->fd)
- fdctx = afr_fd_ctx_get (local->fd, this);
+ /* TBD: quorum check w/ call_count */
- locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock);
- for (i = 0; i < priv->child_count; i++) {
- if (!locked_nodes[i])
- continue;
- ret = afr_set_pending_dict (priv, xattr[i], local->pending,
- i, LOCAL_FIRST);
+ if (call_count == 0) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
- if (ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set pending entry");
+ xdata_req = dict_new();
+ if (!xdata_req) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ pre_nop = _gf_true;
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- {
- if (!fdctx) {
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &(local->loc),
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- break;
- }
+ if (afr_changelog_pre_op_inherit (frame, this))
+ goto next;
- LOCK (&local->fd->lock);
- {
- piggyback = 0;
- if (fdctx->pre_op_done[i]) {
- fdctx->pre_op_piggyback[i]++;
- piggyback = 1;
- fdctx->hit++;
- } else {
- fdctx->miss++;
- }
- }
- UNLOCK (&local->fd->lock);
+ if (call_count < priv->child_count) {
+ /* For subvols we are not performing operation on,
+ mark them as pending up-front along with the FOP
+ so that we can safely defer unmarking dirty until
+ later.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (pending_subvols[i])
+ local->pending[i][idx] = hton32(1);
+ }
+ ret = afr_set_pending_dict (priv, xdata_req,
+ local->pending);
+ if (ret < 0) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ pre_nop = _gf_false;
+ }
- afr_set_delayed_post_op (frame, this);
+ if (call_count > 1 &&
+ (local->transaction.type == AFR_DATA_TRANSACTION ||
+ !local->optimistic_change_log)) {
- if (piggyback)
- afr_changelog_pre_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i],
- NULL);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
- break;
- case AFR_METADATA_TRANSACTION:
- {
- if (local->optimistic_change_log) {
- afr_changelog_pre_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i],
- NULL);
- break;
- }
+ /* If we are performing change on only one subvol, no
+ need to mark dirty, because we are setting the pending
+ counts already anyways
+ */
+ local->dirty[idx] = hton32(1);
- if (local->fd)
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &(local->loc),
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
- break;
+ ret = dict_set_static_bin (xdata_req, AFR_DIRTY, local->dirty,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- case AFR_ENTRY_RENAME_TRANSACTION:
- {
- if (local->optimistic_change_log) {
- afr_changelog_pre_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i],
- NULL);
- } else {
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
+ pre_nop = _gf_false;
+ local->transaction.dirtied = 1;
+ }
- call_count--;
- }
+ if (pre_nop)
+ goto next;
+ if (!local->pre_op_compat) {
+ dict_copy (xdata_req, local->xdata_req);
+ goto next;
+ }
- /*
- set it again because previous stack_wind
- might have already returned (think of case
- where subvolume is posix) and would have
- used the dict as placeholder for return
- value
- */
+ afr_changelog_do (frame, this, xdata_req, afr_transaction_perform_fop);
- ret = afr_set_pending_dict (priv, xattr[i], local->pending,
- i, LOCAL_FIRST);
+ if (xdata_req)
+ dict_unref (xdata_req);
- if (ret < 0)
- gf_log (this->name, GF_LOG_INFO,
- "failed to set pending entry");
+ return 0;
+next:
+ afr_transaction_perform_fop (frame, this);
- /* fall through */
+ if (xdata_req)
+ dict_unref (xdata_req);
- case AFR_ENTRY_TRANSACTION:
- {
- if (local->optimistic_change_log) {
- afr_changelog_pre_op_cbk (frame, (void *)(long)i,
- this, 1, 0, xattr[i],
- NULL);
- break;
- }
+ return 0;
+err:
+ local->internal_lock.lock_cbk = local->transaction.done;
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- if (local->fd)
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- else
- STACK_WIND_COOKIE (frame,
- afr_changelog_pre_op_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr[i],
- NULL);
- }
- break;
- }
+ afr_unlock (frame, this);
- if (!--call_count)
- break;
- }
-out:
- for (i = 0; i < priv->child_count; i++) {
- dict_unref (xattr[i]);
- }
+ if (xdata_req)
+ dict_unref (xdata_req);
- return 0;
+ return 0;
}
@@ -1151,12 +1080,14 @@ int
afr_set_transaction_flock (afr_local_t *local)
{
afr_internal_lock_t *int_lock = NULL;
+ afr_inodelk_t *inodelk = NULL;
int_lock = &local->internal_lock;
+ inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- int_lock->lk_flock.l_len = local->transaction.len;
- int_lock->lk_flock.l_start = local->transaction.start;
- int_lock->lk_flock.l_type = F_WRLCK;
+ inodelk->flock.l_len = local->transaction.len;
+ inodelk->flock.l_start = local->transaction.start;
+ inodelk->flock.l_type = F_WRLCK;
return 0;
}
@@ -1171,6 +1102,7 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)
int_lock = &local->internal_lock;
int_lock->transaction_lk_type = AFR_TRANSACTION_LK;
+ int_lock->domain = this->name;
switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
@@ -1184,8 +1116,8 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this)
case AFR_ENTRY_RENAME_TRANSACTION:
- int_lock->lock_cbk = afr_post_blocking_rename_cbk;
- afr_blocking_lock (frame, this);
+ int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk;
+ afr_nonblocking_entrylk (frame, this);
break;
case AFR_ENTRY_TRANSACTION:
@@ -1218,28 +1150,10 @@ afr_lock (call_frame_t *frame, xlator_t *this)
int
afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- local = frame->local;
-
if (__fop_changelog_needed (frame, this)) {
afr_changelog_pre_op (frame, this);
} else {
- __mark_all_success (local->pending, priv->child_count,
- local->transaction.type);
-
-
- /* Perform fops with the lk-owner from top xlator.
- * Eg: lk-owner of posix-lk and flush should be same,
- * flush cant clear the posix-lks without that lk-owner.
- */
- afr_save_lk_owner (frame);
- frame->root->lk_owner =
- local->transaction.main_frame->root->lk_owner;
-
- local->transaction.fop (frame, this);
+ afr_transaction_perform_fop (frame, this);
}
return 0;
@@ -1249,87 +1163,322 @@ afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
void
afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- /* call this function from any of the related optimizations
- which benefit from delaying post op are enabled, namely:
+ /* call this function from any of the related optimizations
+ which benefit from delaying post op are enabled, namely:
- - changelog piggybacking
- - eager locking
- */
+ - changelog piggybacking
+ - eager locking
+ */
- priv = this->private;
- if (!priv)
- return;
+ priv = this->private;
+ if (!priv)
+ return;
- if (!priv->post_op_delay_secs)
- return;
+ if (!priv->post_op_delay_secs)
+ return;
- local = frame->local;
- if (!local)
- return;
+ local = frame->local;
+ if (!local->transaction.eager_lock_on)
+ return;
- if (!local->fd)
- return;
+ if (!local)
+ return;
+
+ if (!local->fd)
+ return;
- if (local->op == GF_FOP_WRITE)
- local->delayed_post_op = _gf_true;
+ if (local->op == GF_FOP_WRITE)
+ local->delayed_post_op = _gf_true;
+}
+
+gf_boolean_t
+afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ if (!fd) {
+ /* If false is returned, it may keep on taking eager-lock
+ * which may lead to starvation, so return true to avoid that.
+ */
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid fd");
+ return _gf_true;
+ }
+ /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
+ * is taken mount2 opened the same file, it won't be able to
+ * perform any data operations until mount1 releases eager-lock.
+ * To avoid such scenario do not enable eager-lock for this transaction
+ * if open-fd-count is > 1
+ */
+
+ fd_ctx = afr_fd_ctx_get (fd, this);
+ if (!fd_ctx)
+ return _gf_true;
+
+ if (fd_ctx->open_fd_count > 1)
+ return _gf_true;
+
+ return _gf_false;
}
gf_boolean_t
is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- gf_boolean_t res = _gf_false;
+ afr_local_t *local = NULL;
+ gf_boolean_t res = _gf_false;
- local = frame->local;
- if (!local)
- goto out;
+ local = frame->local;
+ if (!local)
+ goto out;
- if (!local->delayed_post_op)
- goto out;
+ if (!local->delayed_post_op)
+ goto out;
- res = _gf_true;
+ //Mark pending changelog ASAP
+ if (!afr_txn_nothing_failed (frame, this))
+ goto out;
+
+ if (local->fd && afr_are_multiple_fds_opened (local->fd, this))
+ goto out;
+
+ res = _gf_true;
out:
- return res;
+ return res;
}
void
-afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd);
-
-void
-afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd);
+afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
+ call_stub_t *stub);
void
afr_delayed_changelog_wake_up_cbk (void *data)
{
- fd_t *fd = NULL;
+ fd_t *fd = NULL;
+
+ fd = data;
+
+ afr_delayed_changelog_wake_up (THIS, fd);
+}
+
+
+/* SET operation */
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
+{
+ afr_fd_ctx_t *fdctx = NULL;
+
+ fdctx = afr_fd_ctx_get (fd, this);
+
+ LOCK(&fd->lock);
+ {
+ fdctx->witnessed_unstable_write = _gf_true;
+ }
+ UNLOCK(&fd->lock);
+
+ return 0;
+}
+
+/* TEST and CLEAR operation */
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
+{
+ afr_fd_ctx_t *fdctx = NULL;
+ gf_boolean_t witness = _gf_false;
+
+ fdctx = afr_fd_ctx_get (fd, this);
+ if (!fdctx)
+ return _gf_true;
+
+ LOCK(&fd->lock);
+ {
+ if (fdctx->witnessed_unstable_write) {
+ witness = _gf_true;
+ fdctx->witnessed_unstable_write = _gf_false;
+ }
+ }
+ UNLOCK (&fd->lock);
+
+ return witness;
+}
+
+
+int
+afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ int child_index = (long) cookie;
+ int call_count = -1;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (op_ret != 0) {
+ /* Failure of fsync() is as good as failure of previous
+ write(). So treat it like one.
+ */
+ gf_log (this->name, GF_LOG_WARNING,
+ "fsync(%s) failed on subvolume %s. Transaction was %s",
+ uuid_utoa (local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ gf_fop_list[local->op]);
+
+ afr_transaction_fop_failed (frame, this, child_index);
+ }
+
+ call_count = afr_frame_return (frame);
+
+ if (call_count == 0)
+ afr_changelog_post_op_now (frame, this);
+
+ return 0;
+}
+
+
+int
+afr_changelog_fsync (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ GF_UNUSED int ret = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);
+
+ if (!call_count) {
+ /* will go straight to unlock */
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ local->call_count = call_count;
+
+ xdata = dict_new();
+ if (xdata)
+ ret = dict_set_int32 (xdata, "batch-fsync", 1);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
+
+ STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk,
+ (void *) (long) i, priv->children[i],
+ priv->children[i]->fops->fsync, local->fd,
+ 1, xdata);
+ if (!--call_count)
+ break;
+ }
+
+ if (xdata)
+ dict_unref (xdata);
+
+ return 0;
+}
+
+
+int
+afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
+
+ if (afr_changelog_pre_op_uninherit (frame, this) &&
+ afr_txn_nothing_failed (frame, this)) {
+ /* just detected that this post-op is about to
+ be optimized away as a new write() has
+ already piggybacked on this frame's changelog.
+ */
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
- fd = data;
+ /* Calling afr_changelog_post_op_now() now will result in
+ issuing ->[f]xattrop().
+
+ Performing a hard POST-OP (->[f]xattrop() FOP) is a more
+ responsible operation that what it might appear on the surface.
+
+ The changelog of a file (in the xattr of the file on the server)
+ stores information (pending count) about the state of the file
+ on the OTHER server. This changelog is blindly trusted, and must
+ therefore be updated in such a way it remains trustworthy. This
+ implies that decrementing the pending count (essentially "clearing
+ the dirty flag") must be done STRICTLY after we are sure that the
+ operation on the other server has reached stable storage.
+
+ While the backend filesystem on that server will eventually flush
+ it to stable storage, we (being in userspace) have no mechanism
+ to get notified when the write became "stable".
+
+ This means we need take matter into our own hands and issue an
+ fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
+ and get an acknowledgement for it. And we need to wait for the
+ fsync() acknowledgement before initiating the hard POST-OP.
+
+ However if the FD itself was opened in O_SYNC or O_DSYNC then
+ we are already guaranteed that the writes were made stable as
+ part of the FOP itself. The same holds true for NFS stable
+ writes which happen on an anonymous FD with O_DSYNC or O_SYNC
+ flag set in the writev() @flags param. For all other write types,
+ mark a flag in the fdctx whenever an unstable write is witnessed.
+ */
+
+ if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
+ afr_changelog_post_op_now (frame, this);
+ return 0;
+ }
- afr_delayed_changelog_wake_up (THIS, fd);
+ /* Check whether users want durability and perform fsync/post-op
+ * accordingly.
+ */
+ if (priv->ensure_durability) {
+ /* Time to fsync() */
+ afr_changelog_fsync (frame, this);
+ } else {
+ afr_changelog_post_op_now (frame, this);
+ }
+
+ return 0;
}
void
-afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd)
+afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
+ call_stub_t *stub)
{
afr_fd_ctx_t *fd_ctx = NULL;
call_frame_t *prev_frame = NULL;
- struct timeval delta = {0, };
+ struct timespec delta = {0, };
afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
priv = this->private;
fd_ctx = afr_fd_ctx_get (fd, this);
if (!fd_ctx)
- return;
+ goto out;
delta.tv_sec = priv->post_op_delay_secs;
- delta.tv_usec = 0;
+ delta.tv_nsec = 0;
pthread_mutex_lock (&fd_ctx->delay_lock);
{
@@ -1348,8 +1497,13 @@ afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd)
unlock:
pthread_mutex_unlock (&fd_ctx->delay_lock);
+out:
if (prev_frame) {
+ local = prev_frame->local;
+ local->transaction.resume_stub = stub;
afr_changelog_post_op_now (prev_frame, this);
+ } else if (stub) {
+ call_resume (stub);
}
}
@@ -1357,59 +1511,66 @@ unlock:
void
afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (is_afr_delayed_changelog_post_op_needed (frame, this))
- afr_delayed_changelog_post_op (this, frame, local->fd);
- else
- afr_changelog_post_op_now (frame, this);
+ if (is_afr_delayed_changelog_post_op_needed (frame, this))
+ afr_delayed_changelog_post_op (this, frame, local->fd, NULL);
+ else
+ afr_changelog_post_op_safe (frame, this);
+}
+
+
+
+/* Wake up the sleeping/delayed post-op, and also register
+ a stub to have it resumed after this transaction
+ completely finishes.
+
+ The @stub gets saved in @local and gets resumed in
+ afr_local_cleanup()
+ */
+void
+afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub)
+{
+ afr_delayed_changelog_post_op (this, NULL, fd, stub);
}
void
afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd)
{
- afr_delayed_changelog_post_op (this, NULL, fd);
+ afr_delayed_changelog_post_op (this, NULL, fd, NULL);
}
int
afr_transaction_resume (call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- fd_t *fd = NULL;
local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
- fd = local->fd;
-
- if (fd)
- /* The wake up needs to happen independent of
- what type of fop arrives here. If it was
- a write, then it has already inherited the
- lock and changelog. If it was not a write,
- then the presumption of the optimization (of
- optimizing for successive write operations)
- fails.
- */
- afr_delayed_changelog_wake_up (this, fd);
- afr_restore_lk_owner (frame);
+ if (local->transaction.eager_lock_on) {
+ /* We don't need to retain "local" in the
+ fd list anymore, writes to all subvols
+ are finished by now */
+ afr_remove_eager_lock_stub (local);
+ }
+
+ afr_restore_lk_owner (frame);
+
+ afr_handle_symmetric_errors (frame, this);
+
+ if (!local->pre_op_compat)
+ /* new mode, pre-op was done along
+ with OP */
+ afr_changelog_pre_op_update (frame, this);
if (__fop_changelog_needed (frame, this)) {
afr_changelog_post_op (frame, this);
} else {
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.done (frame, this);
- } else {
- int_lock->lock_cbk = local->transaction.done;
- afr_unlock (frame, this);
- }
+ afr_changelog_post_op_done (frame, this);
}
return 0;
@@ -1421,31 +1582,99 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this)
*/
void
-afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, int child_index)
+afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
+ int child_index)
{
afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
local = frame->local;
- priv = this->private;
- __mark_child_dead (local->pending, priv->child_count,
- child_index, local->transaction.type);
+ local->transaction.failed_subvols[child_index] = 1;
}
-gf_boolean_t
-_does_transaction_conflict_with_delayed_post_op (call_frame_t *frame)
+
+
+ static gf_boolean_t
+afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
{
- afr_local_t *local = frame->local;
- //check if it is going to compete with inode lock from the fd
- if (local->fd)
- return _gf_false;
- if ((local->transaction.type == AFR_DATA_TRANSACTION) ||
- (local->transaction.type == AFR_METADATA_TRANSACTION))
- return _gf_true;
- return _gf_false;
+ uint64_t start1 = local1->transaction.start;
+ uint64_t start2 = local2->transaction.start;
+ uint64_t end1 = 0;
+ uint64_t end2 = 0;
+
+ if (local1->transaction.len)
+ end1 = start1 + local1->transaction.len - 1;
+ else
+ end1 = ULLONG_MAX;
+
+ if (local2->transaction.len)
+ end2 = start2 + local2->transaction.len - 1;
+ else
+ end2 = ULLONG_MAX;
+
+ return ((end1 >= start2) && (end2 >= start1));
}
+void
+afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fdctx = NULL;
+ afr_local_t *each = NULL;
+
+ priv = this->private;
+
+ if (!local->fd)
+ return;
+
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ return;
+
+ if (!priv->eager_lock)
+ return;
+
+ fdctx = afr_fd_ctx_get (local->fd, this);
+ if (!fdctx)
+ return;
+
+ if (afr_are_multiple_fds_opened (local->fd, this))
+ return;
+ /*
+ * Once full file lock is acquired in eager-lock phase, overlapping
+ * writes do not compete for inode-locks, instead are transferred to the
+ * next writes. Because of this overlapping writes are not ordered.
+ * This can cause inconsistencies in replication.
+ * Example:
+ * Two overlapping writes w1, w2 are sent in parallel on same fd
+ * in two threads t1, t2.
+ * Both threads can execute afr_writev_wind in the following manner.
+ * t1 winds w1 on brick-0
+ * t2 winds w2 on brick-0
+ * t2 winds w2 on brick-1
+ * t1 winds w1 on brick-1
+ *
+ * This check makes sure the locks are not transferred for
+ * overlapping writes.
+ */
+ LOCK (&local->fd->lock);
+ {
+ list_for_each_entry (each, &fdctx->eager_locked,
+ transaction.eager_locked) {
+ if (afr_locals_overlap (each, local)) {
+ local->transaction.eager_lock_on = _gf_false;
+ goto unlock;
+ }
+ }
+
+ local->transaction.eager_lock_on = _gf_true;
+ list_add_tail (&local->transaction.eager_locked,
+ &fdctx->eager_locked);
+ }
+unlock:
+ UNLOCK (&local->fd->lock);
+}
+
+
int
afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
{
@@ -1457,23 +1686,25 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
local = frame->local;
priv = this->private;
- if (local->fd && priv->eager_lock &&
- local->transaction.type == AFR_DATA_TRANSACTION)
- afr_set_lk_owner (frame, this, local->fd);
- else
- afr_set_lk_owner (frame, this, frame->root);
+ local->transaction.resume = afr_transaction_resume;
+ local->transaction.type = type;
ret = afr_transaction_local_init (local, this);
- if (ret < 0) {
+ if (ret < 0)
goto out;
- }
- local->transaction.resume = afr_transaction_resume;
- local->transaction.type = type;
+ afr_transaction_eager_lock_init (local, this);
+
+ if (local->fd && local->transaction.eager_lock_on)
+ afr_set_lk_owner (frame, this, local->fd);
+ else
+ afr_set_lk_owner (frame, this, frame->root);
- if (_does_transaction_conflict_with_delayed_post_op (frame) &&
- local->loc.inode) {
+ if (!local->transaction.eager_lock_on && local->loc.inode) {
fd = fd_lookup (local->loc.inode, frame->root->pid);
+ if (fd == NULL)
+ fd = fd_lookup_anonymous (local->loc.inode);
+
if (fd) {
afr_delayed_changelog_wake_up (this, fd);
fd_unref (fd);
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index 102599633..77cc8eed0 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -11,10 +11,7 @@
#ifndef __TRANSACTION_H__
#define __TRANSACTION_H__
-typedef enum {
- LOCAL_FIRST = 1,
- LOCAL_LAST = 2
-} afr_xattrop_type_t;
+#include "afr.h"
void
afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
@@ -23,14 +20,34 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
int
afr_lock_server_count (afr_private_t *priv, afr_transaction_type type);
+afr_inodelk_t*
+afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom);
+
int32_t
afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
-afr_fd_ctx_t *
-afr_fd_ctx_get (fd_t *fd, xlator_t *this);
int
-afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending,
- int child, afr_xattrop_type_t op);
+afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending);
+
void
afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this);
+
+void
+afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd);
+
+void
+__mark_all_success (call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this);
+
+int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_read_txn_wind_t readfn, afr_transaction_type type);
+
+int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol);
+
+int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this);
+int __afr_txn_write_done (call_frame_t *frame, xlator_t *this);
+call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame);
+
#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index ea686dfde..ead08425f 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -21,11 +21,6 @@
#endif
#include "afr-common.c"
-#define SHD_INODE_LRU_LIMIT 2048
-#define AFR_EH_HEALED_LIMIT 1024
-#define AFR_EH_HEAL_FAIL_LIMIT 1024
-#define AFR_EH_SPLIT_BRAIN_LIMIT 1024
-
struct volume_options options[];
int32_t
@@ -114,6 +109,14 @@ reconfigure (xlator_t *this, dict_t *options)
priv = this->private;
+ GF_OPTION_RECONF ("afr-dirty-xattr",
+ priv->afr_dirty, options, str,
+ out);
+
+ GF_OPTION_RECONF ("metadata-splitbrain-forced-heal",
+ priv->metadata_splitbrain_forced_heal, options, bool,
+ out);
+
GF_OPTION_RECONF ("background-self-heal-count",
priv->background_self_heal_count, options, uint32,
out);
@@ -127,9 +130,6 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("entry-self-heal", priv->entry_self_heal, options,
bool, out);
- GF_OPTION_RECONF ("strict-readdir", priv->strict_readdir, options, bool,
- out);
-
GF_OPTION_RECONF ("data-self-heal-window-size",
priv->data_self_heal_window_size, options,
uint32, out);
@@ -146,8 +146,6 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("data-self-heal-algorithm",
priv->data_self_heal_algorithm, options, str, out);
- GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options, bool, out);
-
GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out);
GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode,
@@ -175,20 +173,29 @@ reconfigure (xlator_t *this, dict_t *options)
priv->read_child = index;
}
+ GF_OPTION_RECONF ("pre-op-compat", priv->pre_op_compat, options, bool, out);
+
GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out);
GF_OPTION_RECONF ("quorum-type", qtype, options, str, out);
GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options,
uint32, out);
fix_quorum_options(this,priv,qtype);
- GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options,
- int32, out);
GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options,
uint32, out);
GF_OPTION_RECONF (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size,
- options, size, out);
+ options, size_uint64, out);
/* Reset this so we re-discover in case the topology changed. */
+ GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options,
+ bool, out);
+
+ GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options,
+ bool, out);
+
+ GF_OPTION_RECONF ("iam-self-heal-daemon", priv->shd.iamshd, options,
+ bool, out);
+
priv->did_discovery = _gf_false;
ret = 0;
@@ -240,10 +247,6 @@ init (xlator_t *this)
priv = this->private;
LOCK_INIT (&priv->lock);
- LOCK_INIT (&priv->read_child_lock);
- //lock recovery is not done in afr
- pthread_mutex_init (&priv->mutex, NULL);
- INIT_LIST_HEAD (&priv->saved_fds);
child_count = xlator_subvolume_count (this);
@@ -251,6 +254,11 @@ init (xlator_t *this)
priv->read_child = -1;
+ GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out);
+
+ GF_OPTION_INIT ("metadata-splitbrain-forced-heal",
+ priv->metadata_splitbrain_forced_heal, bool, out);
+
GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out);
if (read_subvol) {
priv->read_child = xlator_subvolume_index (this, read_subvol);
@@ -304,10 +312,6 @@ init (xlator_t *this)
GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out);
- GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out);
-
- GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out);
-
GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out);
GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool,
@@ -322,16 +326,22 @@ init (xlator_t *this)
GF_OPTION_INIT ("entrylk-trace", priv->entrylk_trace, bool, out);
- GF_OPTION_INIT ("strict-readdir", priv->strict_readdir, bool, out);
+ GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out);
GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out);
GF_OPTION_INIT ("quorum-type", qtype, str, out);
GF_OPTION_INIT ("quorum-count", priv->quorum_count, uint32, out);
- GF_OPTION_INIT (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size,
+ GF_OPTION_INIT (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size_uint64,
out);
fix_quorum_options(this,priv,qtype);
GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out);
+ GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool,
+ out);
+
+ GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out);
+
+ GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out);
priv->wait_count = 1;
@@ -373,8 +383,6 @@ init (xlator_t *this)
AFR_XATTR_PREFIX,
trav->xlator->name);
if (-1 == ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "asprintf failed to set pending key");
ret = -ENOMEM;
goto out;
}
@@ -383,6 +391,13 @@ init (xlator_t *this)
i++;
}
+ ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT,
+ this->name);
+ if (-1 == ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event),
gf_afr_mt_int32_t);
if (!priv->last_event) {
@@ -390,6 +405,12 @@ init (xlator_t *this)
goto out;
}
+ ret = afr_selfheal_daemon_init (this);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
/* keep more local here as we may need them for self-heal etc */
this->local_pool = mem_pool_new (afr_local_t, 512);
if (!this->local_pool) {
@@ -399,53 +420,8 @@ init (xlator_t *this)
goto out;
}
- priv->first_lookup = 1;
priv->root_inode = NULL;
- if (!priv->shd.iamshd) {
- ret = 0;
- goto out;
- }
-
- ret = -ENOMEM;
- priv->shd.pos = GF_CALLOC (sizeof (*priv->shd.pos), child_count,
- gf_afr_mt_brick_pos_t);
- if (!priv->shd.pos)
- goto out;
-
- priv->shd.pending = GF_CALLOC (sizeof (*priv->shd.pending), child_count,
- gf_afr_mt_int32_t);
- if (!priv->shd.pending)
- goto out;
-
- priv->shd.inprogress = GF_CALLOC (sizeof (*priv->shd.inprogress),
- child_count, gf_afr_mt_shd_bool_t);
- if (!priv->shd.inprogress)
- goto out;
- priv->shd.timer = GF_CALLOC (sizeof (*priv->shd.timer), child_count,
- gf_afr_mt_shd_timer_t);
- if (!priv->shd.timer)
- goto out;
-
- priv->shd.healed = eh_new (AFR_EH_HEALED_LIMIT, _gf_false);
- if (!priv->shd.healed)
- goto out;
-
- priv->shd.heal_failed = eh_new (AFR_EH_HEAL_FAIL_LIMIT, _gf_false);
- if (!priv->shd.heal_failed)
- goto out;
-
- priv->shd.split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false);
- if (!priv->shd.split_brain)
- goto out;
-
- this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this);
- if (!this->itable)
- goto out;
- priv->root_inode = inode_ref (this->itable->root);
- GF_OPTION_INIT ("node-uuid", priv->shd.node_uuid, str, out);
- GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out);
-
ret = 0;
out:
return ret;
@@ -460,7 +436,7 @@ fini (xlator_t *this)
priv = this->private;
this->private = NULL;
afr_priv_destroy (priv);
- if (this->itable);//I dont see any destroy func
+ //if (this->itable);//I dont see any destroy func
return 0;
}
@@ -480,6 +456,9 @@ struct xlator_fops fops = {
.finodelk = afr_finodelk,
.entrylk = afr_entrylk,
.fentrylk = afr_fentrylk,
+ .fallocate = afr_fallocate,
+ .discard = afr_discard,
+ .zerofill = afr_zerofill,
/* inode read */
.access = afr_access,
@@ -532,35 +511,53 @@ struct xlator_cbks cbks = {
struct volume_options options[] = {
{ .key = {"read-subvolume" },
- .type = GF_OPTION_TYPE_XLATOR
+ .type = GF_OPTION_TYPE_XLATOR,
+ .description = "inode-read fops happen only on one of the bricks in "
+ "replicate. Afr will prefer the one specified using "
+ "this option if it is not stale. Option value must be "
+ "one of the xlator names of the children. "
+ "Ex: <volname>-client-0 till "
+ "<volname>-client-<number-of-bricks - 1>"
},
{ .key = {"read-subvolume-index" },
.type = GF_OPTION_TYPE_INT,
.default_value = "-1",
+ .description = "inode-read fops happen only on one of the bricks in "
+ "replicate. AFR will prefer the one specified using "
+ "this option if it is not stale. allowed options"
+ " include -1 till replica-count - 1"
},
{ .key = {"read-hash-mode" },
.type = GF_OPTION_TYPE_INT,
.min = 0,
.max = 2,
- .default_value = "0",
- .description = "0 = first responder, "
- "1 = hash by GFID (all clients use same subvolume), "
- "2 = hash by GFID and client PID",
+ .default_value = "1",
+ .description = "inode-read fops happen only on one of the bricks in "
+ "replicate. AFR will prefer the one computed using "
+ "the method specified using this option"
+ "0 = first up server, "
+ "1 = hash by GFID of file (all clients use "
+ "same subvolume), "
+ "2 = hash by GFID of file and client PID",
},
{ .key = {"choose-local" },
.type = GF_OPTION_TYPE_BOOL,
.default_value = "true",
- .description = "Choose a local subvolume to read from if "
- "read-subvolume is not explicitly set.",
+ .description = "Choose a local subvolume (i.e. Brick) to read from"
+ " if read-subvolume is not explicitly set.",
},
{ .key = {"favorite-child"},
- .type = GF_OPTION_TYPE_XLATOR
+ .type = GF_OPTION_TYPE_XLATOR,
+ .description = "If a split-brain happens choose subvol/brick set by "
+ "this option as source."
},
{ .key = {"background-self-heal-count"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
.default_value = "16",
.validate = GF_OPT_VALIDATE_MIN,
+ .description = "This specifies the number of self-heals that can be "
+ " performed in background without blocking the fop"
},
{ .key = {"data-self-heal"},
.type = GF_OPTION_TYPE_STR,
@@ -568,18 +565,25 @@ struct volume_options options[] = {
"0", "off", "no", "false", "disable",
"open"},
.default_value = "on",
- .description = "\"open\" means data self-heal action will"
- "only be triggered by file open operations."
+ .description = "Using this option we can enable/disable data "
+ "self-heal on the file. \"open\" means data "
+ "self-heal action will only be triggered by file "
+ "open operations."
},
{ .key = {"data-self-heal-algorithm"},
.type = GF_OPTION_TYPE_STR,
- .default_value = "",
.description = "Select between \"full\", \"diff\". The "
"\"full\" algorithm copies the entire file from "
"source to sink. The \"diff\" algorithm copies to "
"sink only those blocks whose checksums don't match "
- "with those of source.",
- .value = { "diff", "full", "" }
+ "with those of source. If no option is configured "
+ "the option is chosen dynamically as follows: "
+ "If the file does not exist on one of the sinks "
+ "or empty file exists or if the source file size is "
+ "about the same as page size the entire file will "
+ "be read and written i.e \"full\" algo, "
+ "otherwise \"diff\" algo is chosen.",
+ .value = { "diff", "full"}
},
{ .key = {"data-self-heal-window-size"},
.type = GF_OPTION_TYPE_INT,
@@ -592,54 +596,102 @@ struct volume_options options[] = {
{ .key = {"metadata-self-heal"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "on",
+ .description = "Using this option we can enable/disable metadata "
+ "i.e. Permissions, ownerships, xattrs self-heal on "
+ "the file/directory."
},
{ .key = {"entry-self-heal"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "on",
+ .description = "Using this option we can enable/disable entry "
+ "self-heal on the directory."
},
{ .key = {"data-change-log"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "on",
+ .description = "Data fops like write/truncate will not perform "
+ "pre/post fop changelog operations in afr transaction "
+ "if this option is disabled"
},
{ .key = {"metadata-change-log"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "on",
+ .description = "Metadata fops like setattr/setxattr will not perform "
+ "pre/post fop changelog operations in afr transaction "
+ "if this option is disabled"
},
{ .key = {"entry-change-log"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "on",
+ .description = "Entry fops like create/unlink will not perform "
+ "pre/post fop changelog operations in afr transaction "
+ "if this option is disabled"
},
{ .key = {"optimistic-change-log"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "on",
- },
- { .key = {"strict-readdir"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
+ .description = "Entry/Metadata fops will not perform "
+ "pre fop changelog operations in afr transaction "
+ "if this option is enabled."
},
{ .key = {"inodelk-trace"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
+ .description = "Enabling this option logs inode lock/unlocks"
},
{ .key = {"entrylk-trace"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
+ .description = "Enabling this option logs entry lock/unlocks"
},
+ { .key = {"pre-op-compat"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Use separate pre-op xattrop() FOP rather than "
+ "overloading xdata of the OP"
+ },
{ .key = {"eager-lock"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "on",
+ .description = "Lock phase of a transaction has two sub-phases. "
+ "First is an attempt to acquire locks in parallel by "
+ "broadcasting non-blocking lock requests. If lock "
+ "acquisition fails on any server, then the held locks "
+ "are unlocked and revert to a blocking locked mode "
+ "sequentially on one server after another. If this "
+ "option is enabled the initial broadcasting lock "
+ "request attempt to acquire lock on the entire file. "
+ "If this fails, we revert back to the sequential "
+ "\"regional\" blocking lock as before. In the case "
+ "where such an \"eager\" lock is granted in the "
+ "non-blocking phase, it gives rise to an opportunity "
+ "for optimization. i.e, if the next write transaction "
+ "on the same FD arrives before the unlock phase of "
+ "the first transaction, it \"takes over\" the full "
+ "file lock. Similarly if yet another data transaction "
+ "arrives before the unlock phase of the \"optimized\" "
+ "transaction, that in turn \"takes over\" the lock as "
+ "well. The actual unlock now happens at the end of "
+ "the last \"optimized\" transaction."
+
},
{ .key = {"self-heal-daemon"},
.type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
+ .default_value = "on",
+ .description = "This option applies to only self-heal-daemon. "
+ "Index directory crawl and automatic healing of files "
+ "will not be performed if this option is turned off."
},
{ .key = {"iam-self-heal-daemon"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
+ .description = "This option differentiates if the replicate "
+ "translator is running as part of self-heal-daemon "
+ "or not."
},
{ .key = {"quorum-type"},
.type = GF_OPTION_TYPE_STR,
- .value = { "none", "auto", "fixed", "" },
+ .value = { "none", "auto", "fixed"},
.default_value = "none",
.description = "If value is \"fixed\" only allow writes if "
"quorum-count bricks are present. If value is "
@@ -658,14 +710,9 @@ struct volume_options options[] = {
},
{ .key = {"node-uuid"},
.type = GF_OPTION_TYPE_STR,
- .description = "Local glusterd uuid string",
- },
- { .key = {"heal-timeout"},
- .type = GF_OPTION_TYPE_INT,
- .min = 60,
- .max = INT_MAX,
- .default_value = "600",
- .description = "Poll timeout for checking the need to self-heal"
+ .description = "Local glusterd uuid string, used in starting "
+ "self-heal-daemon so that it can crawl only on "
+ "local index directories.",
},
{ .key = {"post-op-delay-secs"},
.type = GF_OPTION_TYPE_INT,
@@ -683,5 +730,20 @@ struct volume_options options[] = {
.max = 131072,
.default_value = "1KB",
},
+ { .key = {"ensure-durability"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .description = "Afr performs fsyncs for transactions if this "
+ "option is on to make sure the changelogs/data is "
+ "written to the disk",
+ .default_value = "on",
+ },
+ { .key = {"afr-dirty-xattr"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = AFR_DIRTY_DEFAULT,
+ },
+ { .key = {"metadata-splitbrain-forced-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 3f2bbbebc..36042f7b2 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -20,96 +20,42 @@
#include "call-stub.h"
#include "compat-errno.h"
#include "afr-mem-types.h"
-#include "afr-self-heal-algorithm.h"
#include "libxlator.h"
#include "timer.h"
+#include "syncop.h"
+
+#include "afr-self-heald.h"
#define AFR_XATTR_PREFIX "trusted.afr"
#define AFR_PATHINFO_HEADER "REPLICATE:"
#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
+#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
+#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
+#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty)
-struct _pump_private;
-
-typedef int (*afr_expunge_done_cbk_t) (call_frame_t *frame, xlator_t *this,
- int child, int32_t op_error,
- int32_t op_errno);
-
-typedef int (*afr_impunge_done_cbk_t) (call_frame_t *frame, xlator_t *this,
- int32_t op_error, int32_t op_errno);
-typedef int (*afr_post_remove_call_t) (call_frame_t *frame, xlator_t *this);
+#define AFR_LOCKEE_COUNT_MAX 3
+#define AFR_DOM_COUNT_MAX 3
+#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this);
-typedef void (*afr_lookup_done_cbk_t) (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno);
-typedef enum {
- AFR_POS_UNKNOWN,
- AFR_POS_LOCAL,
- AFR_POS_REMOTE
-} afr_child_pos_t;
+typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol);
-typedef enum {
- SPLIT_BRAIN = 1,
- ALL_FOOLS = 2
-} afr_subvol_status_t;
+typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int err);
-typedef enum {
- AFR_INODE_SET_READ_CTX = 1,
- AFR_INODE_RM_STALE_CHILDREN,
- AFR_INODE_SET_OPENDIR_DONE,
- AFR_INODE_SET_SPLIT_BRAIN,
- AFR_INODE_GET_READ_CTX,
- AFR_INODE_GET_OPENDIR_DONE,
- AFR_INODE_GET_SPLIT_BRAIN,
-} afr_inode_op_t;
-
-typedef struct afr_inode_params_ {
- afr_inode_op_t op;
- union {
- gf_boolean_t value;
- struct {
- int32_t read_child;
- int32_t *children;
- } read_ctx;
- } u;
-} afr_inode_params_t;
+typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this);
-typedef struct afr_inode_ctx_ {
- uint64_t masks;
- int32_t *fresh_children;//increasing order of latency
-} afr_inode_ctx_t;
-
-typedef enum {
- NONE,
- INDEX,
- FULL,
-} afr_crawl_type_t;
-
-typedef struct afr_self_heald_ {
- gf_boolean_t enabled;
- gf_boolean_t iamshd;
- afr_crawl_type_t *pending;
- gf_boolean_t *inprogress;
- afr_child_pos_t *pos;
- gf_timer_t **timer;
- eh_t *healed;
- eh_t *heal_failed;
- eh_t *split_brain;
- char *node_uuid;
- int timeout;
-} afr_self_heald_t;
+#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;})
+#define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;})
+#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];})
typedef struct _afr_private {
gf_lock_t lock; /* to guard access to child_count, etc */
unsigned int child_count; /* total number of children */
- unsigned int read_child_rr; /* round-robin index of the read_child */
- gf_lock_t read_child_lock; /* lock to protect above */
-
xlator_t **children;
- int first_lookup;
inode_t *root_inode;
unsigned char *child_up;
@@ -130,6 +76,7 @@ typedef struct _afr_private {
gf_boolean_t metadata_change_log; /* on/off */
gf_boolean_t entry_change_log; /* on/off */
+ gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
int read_child; /* read-subvolume */
unsigned int hash_mode; /* for when read_child is not set */
int favorite_child; /* subvolume to be preferred in resolving
@@ -138,143 +85,45 @@ typedef struct _afr_private {
gf_boolean_t inodelk_trace;
gf_boolean_t entrylk_trace;
- gf_boolean_t strict_readdir;
-
unsigned int wait_count; /* # of servers to wait for success */
uint64_t up_count; /* number of CHILD_UPs we have seen */
uint64_t down_count; /* number of CHILD_DOWNs we have seen */
- struct _pump_private *pump_private; /* Set if we are loaded as pump */
- int use_afr_in_pump;
-
- pthread_mutex_t mutex;
- struct list_head saved_fds; /* list of fds on which locks have succeeded */
gf_boolean_t optimistic_change_log;
gf_boolean_t eager_lock;
+ gf_boolean_t pre_op_compat; /* on/off */
uint32_t post_op_delay_secs;
unsigned int quorum_count;
char vol_uuid[UUID_SIZE + 1];
int32_t *last_event;
- afr_self_heald_t shd;
+
+ /* @event_generation: Keeps count of number of events received which can
+ potentially impact consistency decisions. The events are CHILD_UP
+ and CHILD_DOWN, when we have to recalculate the freshness/staleness
+ of copies to detect if changes had happened while the other server
+ was down. CHILD_DOWN and CHILD_UP can also be received on network
+ disconnect/reconnects and not necessarily server going down/up.
+ Recalculating freshness/staleness on network events is equally
+ important as we might have had a network split brain.
+ */
+ uint32_t event_generation;
+
gf_boolean_t choose_local;
gf_boolean_t did_discovery;
uint64_t sh_readdir_size;
-} afr_private_t;
+ gf_boolean_t ensure_durability;
+ char *sh_domain;
+ char *afr_dirty;
-typedef struct {
- /* External interface: These are variables (some optional) that
- are set by whoever has triggered self-heal */
-
- gf_boolean_t do_data_self_heal;
- gf_boolean_t do_metadata_self_heal;
- gf_boolean_t do_entry_self_heal;
- gf_boolean_t do_gfid_self_heal;
- gf_boolean_t do_missing_entry_self_heal;
- gf_boolean_t force_confirm_spb; /* Check for split-brains even when
- self-heal is turned off */
-
- gf_boolean_t forced_merge; /* Is this a self-heal triggered to
- forcibly merge the directories? */
-
- gf_boolean_t background; /* do self-heal in background
- if possible */
- ia_type_t type; /* st_mode of the entry we're doing
- self-heal on */
- inode_t *inode; /* inode on which the self-heal is
- performed on */
- uuid_t sh_gfid_req; /* gfid self-heal needs to be done
- with this gfid if it is not null */
-
- /* Function to call to unwind. If self-heal is being done in the
- background, this function will be called as soon as possible. */
-
- int (*unwind) (call_frame_t *frame, xlator_t *this, int32_t op_ret,
- int32_t op_errno, int32_t sh_failed);
-
- /* End of external interface members */
-
-
- /* array of stat's, one for each child */
- struct iatt *buf;
- struct iatt *parentbufs;
- struct iatt parentbuf;
- struct iatt entrybuf;
-
- afr_expunge_done_cbk_t expunge_done;
- afr_impunge_done_cbk_t impunge_done;
-
- /* array of xattr's, one for each child */
- dict_t **xattr;
-
- /* array containing if the lookups succeeded in the order of response
- */
- int32_t *success_children;
- int success_count;
- /* array containing the fresh children found in the self-heal process */
- int32_t *fresh_children;
- /* array containing the fresh children found in the parent lookup */
- int32_t *fresh_parent_dirs;
- /* array of errno's, one for each child */
- int *child_errno;
- /*loc used for lookup*/
- loc_t lookup_loc;
- int32_t lookup_flags;
- afr_lookup_done_cbk_t lookup_done;
-
- int32_t **pending_matrix;
- int32_t **delta_matrix;
+ afr_self_heald_t shd;
- int32_t op_ret;
- int32_t op_errno;
+ /* pump dependencies */
+ void *pump_private;
+ gf_boolean_t use_afr_in_pump;
+} afr_private_t;
- int *sources;
- int source;
- int active_source;
- int active_sinks;
- unsigned char *success;
- unsigned char *locked_nodes;
- int lock_count;
-
- const char *linkname;
- gf_boolean_t entries_skipped;
-
- int op_failed;
-
- gf_boolean_t sync_done;
- gf_boolean_t data_lock_held;
- gf_boolean_t eof_reached;
- fd_t *healing_fd;
- int file_has_holes;
- blksize_t block_size;
- off_t file_size;
- off_t offset;
- unsigned char *write_needed;
- uint8_t *checksum;
- afr_post_remove_call_t post_remove_call;
-
- loc_t parent_loc;
-
- call_frame_t *orig_frame;
- call_frame_t *old_loop_frame;
- gf_boolean_t unwound;
-
- afr_sh_algo_private_t *private;
-
- struct afr_sh_algorithm *algo;
- afr_lock_cbk_t data_lock_success_handler;
- afr_lock_cbk_t data_lock_failure_handler;
- int (*completion_cbk) (call_frame_t *frame, xlator_t *this);
- int (*sh_data_algo_start) (call_frame_t *frame, xlator_t *this);
- int (*algo_completion_cbk) (call_frame_t *frame, xlator_t *this);
- int (*algo_abort_cbk) (call_frame_t *frame, xlator_t *this);
- void (*gfid_sh_success_cbk) (call_frame_t *sh_frame, xlator_t *this);
- gf_boolean_t mdata_spb;
- gf_boolean_t data_spb;
-
- call_frame_t *sh_frame;
-} afr_self_heal_t;
typedef enum {
AFR_DATA_TRANSACTION, /* truncate, write, ... */
@@ -336,11 +185,31 @@ afr_index_for_transaction_type (afr_transaction_type type)
return -1; /* make gcc happy */
}
+typedef struct {
+ loc_t loc;
+ char *basename;
+ unsigned char *locked_nodes;
+ int locked_count;
+
+} afr_entry_lockee_t;
+
+int
+afr_entry_lockee_cmp (const void *l1, const void *l2);
+
+typedef struct {
+ char *domain; /* Domain on which inodelk is taken */
+ struct gf_flock flock;
+ unsigned char *locked_nodes;
+ int32_t lock_count;
+} afr_inodelk_t;
typedef struct {
loc_t *lk_loc;
- struct gf_flock lk_flock;
+ int lockee_count;
+ afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
+
+ afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX];
const char *lk_basename;
const char *lower_basename;
const char *higher_basename;
@@ -349,43 +218,90 @@ typedef struct {
unsigned char *locked_nodes;
unsigned char *lower_locked_nodes;
- unsigned char *inode_locked_nodes;
- unsigned char *entry_locked_nodes;
selfheal_lk_type_t selfheal_lk_type;
transaction_lk_type_t transaction_lk_type;
int32_t lock_count;
- int32_t inodelk_lock_count;
int32_t entrylk_lock_count;
uint64_t lock_number;
int32_t lk_call_count;
int32_t lk_expected_count;
+ int32_t lk_attempted_count;
int32_t lock_op_ret;
int32_t lock_op_errno;
afr_lock_cbk_t lock_cbk;
+ char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
} afr_internal_lock_t;
-typedef struct _afr_locked_fd {
- fd_t *fd;
- struct list_head list;
-} afr_locked_fd_t;
+struct afr_reply {
+ int valid;
+ int32_t op_ret;
+ int32_t op_errno;
+ dict_t *xdata;
+ struct iatt poststat;
+ struct iatt postparent;
+ struct iatt prestat;
+ struct iatt preparent;
+ struct iatt preparent2;
+ struct iatt postparent2;
+ uint8_t checksum[MD5_DIGEST_LENGTH];
+};
+
+typedef enum {
+ AFR_FD_NOT_OPENED,
+ AFR_FD_OPENED,
+ AFR_FD_OPENING
+} afr_fd_open_status_t;
+
+typedef struct {
+ unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
+ int inherited[AFR_NUM_CHANGE_LOGS];
+ int on_disk[AFR_NUM_CHANGE_LOGS];
+ afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
+
+ unsigned int *lock_piggyback;
+ unsigned int *lock_acquired;
+
+ int flags;
+
+ /* used for delayed-post-op optimization */
+ pthread_mutex_t delay_lock;
+ gf_timer_t *delay_timer;
+ call_frame_t *delay_frame;
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
+
+ /* @open_fd_count:
+ Number of open FDs queried from the server, as queried through
+ xdata in FOPs. Currently, used to decide if eager-locking must be
+ temporarily disabled.
+ */
+ uint32_t open_fd_count;
+
+
+ /* list of frames currently in progress */
+ struct list_head eager_locked;
+} afr_fd_ctx_t;
+
typedef struct _afr_local {
- int uid;
- int gid;
+ glusterfs_fop_t op;
unsigned int call_count;
- unsigned int success_count;
- unsigned int enoent_count;
+ /* @event_generation: copy of priv->event_generation taken at the
+ time of starting the transaction. The copy is made so that we
+ have a stable value through the various phases of the transaction.
+ */
+ unsigned int event_generation;
- unsigned int govinda_gOvinda;
-
- unsigned int read_child_index;
- unsigned char read_child_returned;
- unsigned int first_up_child;
+ uint32_t open_fd_count;
+ gf_boolean_t update_open_fd_count;
gf_lkowner_t saved_lk_owner;
@@ -394,43 +310,111 @@ typedef struct _afr_local {
int32_t **pending;
+ int dirty[AFR_NUM_CHANGE_LOGS];
+
loc_t loc;
loc_t newloc;
fd_t *fd;
- unsigned char *fd_open_on;
-
- glusterfs_fop_t fop;
+ afr_fd_ctx_t *fd_ctx;
+ /* @child_up: copy of priv->child_up taken at the time of transaction
+ start. The copy is taken so that we have a stable child_up array
+ through the phases of the transaction as priv->child_up[i] can keep
+ changing through time.
+ */
unsigned char *child_up;
- int32_t *fresh_children; //in the order of response
- int32_t *child_errno;
+ /* @read_attempted:
+ array of flags representing subvolumes where read operations of
+ the read transaction have already been attempted. The array is
+ first pre-filled with down subvolumes, and as reads are performed
+ on other subvolumes, those are set as well. This way if the read
+ operation fails we do not retry on that subvolume again.
+ */
+ unsigned char *read_attempted;
- dict_t *xattr_req;
+ /* @readfn:
- int32_t inodelk_count;
- int32_t entrylk_count;
+ pointer to function which will perform the read operation on a given
+ subvolume. Used in read transactions.
+ */
- afr_internal_lock_t internal_lock;
+ afr_read_txn_wind_t readfn;
+
+ /* @refreshed:
+
+ the inode was "refreshed" (i.e, pending xattrs from all subvols
+ freshly inspected and inode ctx updated accordingly) as part of
+ this transaction already.
+ */
+ gf_boolean_t refreshed;
+
+ /* @inode:
+
+ the inode on which the read txn is performed on. ref'ed and copied
+ from either fd->inode or loc.inode
+ */
+
+ inode_t *inode;
+
+ /* @parent[2]:
+
+ parent inode[s] on which directory transactions are performed.
+ */
+
+ inode_t *parent;
+ inode_t *parent2;
+
+ /* @readable:
+
+ array of flags representing servers from which a read can be
+ performed. This is the output of afr_inode_refresh()
+ */
+ unsigned char *readable;
+
+ afr_inode_refresh_cbk_t refreshfn;
+
+ /* @refreshinode:
+
+ Inode currently getting refreshed.
+ */
+ inode_t *refreshinode;
- afr_locked_fd_t *locked_fd;
- int32_t source_child;
- int32_t lock_recovery_child;
+ /*
+ @pre_op_compat:
+
+ compatibility mode of pre-op. send a separate pre-op and
+ op operations as part of transaction, rather than combining
+ */
+
+ gf_boolean_t pre_op_compat;
+
+ dict_t *xattr_req;
+
+ afr_internal_lock_t internal_lock;
dict_t *dict;
+
int optimistic_change_log;
gf_boolean_t delayed_post_op;
- gf_boolean_t fop_paused;
- int (*fop_call_continue) (call_frame_t *frame, xlator_t *this);
+ /* Is the current writev() going to perform a stable write?
+ i.e, is fd->flags or @flags writev param have O_SYNC or
+ O_DSYNC?
+ */
+ gf_boolean_t stable_write;
+
+ /* This write appended to the file. Nnot necessarily O_APPEND,
+ just means the offset of write was at the end of file.
+ */
+ gf_boolean_t append_write;
/*
This struct contains the arguments for the "continuation"
(scheme-like) of fops
*/
- int op;
struct {
struct {
unsigned char buf_set;
@@ -438,24 +422,6 @@ typedef struct _afr_local {
} statfs;
struct {
- uint32_t parent_entrylk;
- uuid_t gfid_req;
- inode_t *inode;
- struct iatt buf;
- struct iatt postparent;
- dict_t **xattrs;
- dict_t *xattr;
- struct iatt *postparents;
- struct iatt *bufs;
- int32_t read_child;
- int32_t *sources;
- int32_t *success_children;
- int32_t **pending_matrix;
- gf_boolean_t fresh_lookup;
- gf_boolean_t possible_spb;
- } lookup;
-
- struct {
int32_t flags;
} open;
@@ -523,7 +489,9 @@ typedef struct _afr_local {
struct {
struct iatt prebuf;
struct iatt postbuf;
+ } inode_wfop; //common structure for all inode-write-fops
+ struct {
int32_t op_ret;
struct iovec *vector;
@@ -534,34 +502,21 @@ typedef struct _afr_local {
} writev;
struct {
- struct iatt prebuf;
- struct iatt postbuf;
- } fsync;
-
- struct {
off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
} truncate;
struct {
off_t offset;
- struct iatt prebuf;
- struct iatt postbuf;
} ftruncate;
struct {
struct iatt in_buf;
int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
} setattr;
struct {
struct iatt in_buf;
int32_t valid;
- struct iatt preop_buf;
- struct iatt postop_buf;
} fsetattr;
struct {
@@ -623,11 +578,32 @@ typedef struct _afr_local {
dict_t *params;
char *linkpath;
} symlink;
+
+ struct {
+ int32_t mode;
+ off_t offset;
+ size_t len;
+ } fallocate;
+
+ struct {
+ off_t offset;
+ size_t len;
+ } discard;
+
+ struct {
+ off_t offset;
+ off_t len;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } zerofill;
+
+
} cont;
struct {
off_t start, len;
+ gf_boolean_t eager_lock_on;
int *eager_lock;
char *basename;
@@ -638,11 +614,67 @@ typedef struct _afr_local {
afr_transaction_type type;
- int32_t **txn_changelog;//changelog after pre+post ops
+ /* stub to resume on destruction
+ of the transaction frame */
+ call_stub_t *resume_stub;
+
+ struct list_head eager_locked;
+
unsigned char *pre_op;
+ /* @fop_subvols: subvolumes on which FOP will be attempted */
+ unsigned char *fop_subvols;
+
+ /* @failed_subvols: subvolumes on which FOP failed. Always
+ a subset of @fop_subvols */
+ unsigned char *failed_subvols;
+
+ /* @dirtied: flag which indicates whether we set dirty flag
+ in the OP. Typically true when we are performing operation
+ on more than one subvol and optimistic changelog is disabled
+
+ A 'true' value set in @dirtied flag means an 'undirtying'
+ has to be done in POST-OP phase.
+ */
+ gf_boolean_t dirtied;
+
+ /* @inherited: flag which indicates that the dirty flags
+ of the previous transaction were inherited
+ */
+ gf_boolean_t inherited;
+
+ /*
+ @no_uninherit: flag which indicates that a pre_op_uninherit()
+ must _not_ be attempted (and returned as failure) always. This
+ flag is set when a hard pre-op is performed, but not accounted
+ for it in fd_ctx->on_disk[]. Such transactions are "isolated"
+ from the pre-op piggybacking entirely and therefore uninherit
+ must not be attempted.
+ */
+ gf_boolean_t no_uninherit;
+
+ /* @uninherit_done:
+ @uninherit_value:
+
+ The above pair variables make pre_op_uninherit() idempotent.
+ Both are FALSE initially. The first call to pre_op_uninherit
+ sets @uninherit_done to TRUE and the return value to
+ @uninherit_value. Further calls will check for @uninherit_done
+ to be TRUE and if so will simply return @uninherit_value.
+ */
+ gf_boolean_t uninherit_done;
+ gf_boolean_t uninherit_value;
+
+ /* @changelog_resume: function to be called after changlogging
+ (either pre-op or post-op) is done
+ */
+
+ afr_changelog_resume_t changelog_resume;
+
call_frame_t *main_frame;
+ int (*wind) (call_frame_t *frame, xlator_t *this, int subvol);
+
int (*fop) (call_frame_t *frame, xlator_t *this);
int (*done) (call_frame_t *frame, xlator_t *this);
@@ -654,7 +686,7 @@ typedef struct _afr_local {
/* post-op hook */
} transaction;
- afr_self_heal_t self_heal;
+ syncbarrier_t barrier;
struct marker_str marker;
@@ -665,83 +697,74 @@ typedef struct _afr_local {
mode_t umask;
int xflag;
gf_boolean_t do_discovery;
+ struct afr_reply *replies;
} afr_local_t;
-typedef enum {
- AFR_FD_NOT_OPENED,
- AFR_FD_OPENED,
- AFR_FD_OPENING
-} afr_fd_open_status_t;
-
-typedef struct {
- struct list_head call_list;
- call_frame_t *frame;
-} afr_fd_paused_call_t;
-
-typedef struct {
- unsigned int *pre_op_done;
- afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
- unsigned int *pre_op_piggyback;
-
- unsigned int *lock_piggyback;
- unsigned int *lock_acquired;
-
- int flags;
- uint64_t up_count; /* number of CHILD_UPs this fd has seen */
- uint64_t down_count; /* number of CHILD_DOWNs this fd has seen */
-
- int32_t last_tried;
-
- int hit, miss;
- gf_boolean_t failed_over;
- struct list_head entries; /* needed for readdir failover */
-
- unsigned char *locked_on; /* which subvolumes locks have been successful */
- struct list_head paused_calls; /* queued calls while fix_open happens */
-
- /* used for delayed-post-op optimization */
- pthread_mutex_t delay_lock;
- gf_timer_t *delay_timer;
- call_frame_t *delay_frame;
-} afr_fd_ctx_t;
-
-
-/* try alloc and if it fails, goto label */
-#define AFR_LOCAL_ALLOC_OR_GOTO(var, label) do { \
- var = mem_get0 (THIS->local_pool); \
- if (!var) { \
- gf_log (this->name, GF_LOG_ERROR, \
- "out of memory :("); \
- op_errno = ENOMEM; \
- goto label; \
- } \
- } while (0);
-
/* did a call fail due to a child failing? */
#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
((op_errno == ENOTCONN) || \
(op_errno == EBADFD)))
-#define afr_fop_failed(op_ret, op_errno) ((op_ret) == -1)
+int
+afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
+int
+__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
+
+int
+__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvol,
+ int event_generation);
+int
+afr_inode_read_subvol_set (inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int event_generation);
-/* have we tried all children? */
-#define all_tried(i, count) ((i) == (count) - 1)
+int
+afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this);
-int32_t
-afr_set_dict_gfid (dict_t *dict, uuid_t gfid);
+int
+afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
+ unsigned char *readable);
int
-pump_command_reply (call_frame_t *frame, xlator_t *this);
+afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p,
+ int type);
+int
+afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
+ int *event_p, afr_transaction_type type);
+
+#define afr_data_subvol_get(i, t, s, e) \
+ afr_read_subvol_get(i, t, s, e, AFR_DATA_TRANSACTION)
+
+#define afr_metadata_subvol_get(i, t, s, e) \
+ afr_read_subvol_get(i, t, s, e, AFR_METADATA_TRANSACTION)
+
+int
+afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_inode_refresh_cbk_t cbk);
int32_t
afr_notify (xlator_t *this, int32_t event, void *data, void *data2);
int
-afr_attempt_lock_recovery (xlator_t *this, int32_t child_index);
+afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local,
+ loc_t *loc, char *basename, int child_count);
+
+void
+afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock);
int
-afr_save_locked_fd (xlator_t *this, fd_t *fd);
+afr_attempt_lock_recovery (xlator_t *this, int32_t child_index);
int
afr_mark_locked_nodes (xlator_t *this, fd_t *fd,
@@ -753,10 +776,6 @@ afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner);
int
afr_set_lock_number (call_frame_t *frame, xlator_t *this);
-
-loc_t *
-lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2);
-
int32_t
afr_unlock (call_frame_t *frame, xlator_t *this);
@@ -772,46 +791,30 @@ afr_blocking_lock (call_frame_t *frame, xlator_t *this);
int
afr_internal_lock_finish (call_frame_t *frame, xlator_t *this);
-void
-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src,
+int
+afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
unsigned int child_count);
-int pump_start (call_frame_t *frame, xlator_t *this);
-
int
__afr_fd_ctx_set (xlator_t *this, fd_t *fd);
int
afr_fd_ctx_set (xlator_t *this, fd_t *fd);
-int32_t
-afr_inode_get_read_ctx (xlator_t *this, inode_t *inode, int32_t *fresh_children);
-
-void
-afr_inode_set_read_ctx (xlator_t *this, inode_t *inode, int32_t read_child,
- int32_t *fresh_children);
+afr_fd_ctx_t *
+afr_fd_ctx_get (fd_t *fd, xlator_t *this);
int
afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno);
-unsigned int
-afr_up_children_count (unsigned char *child_up, unsigned int child_count);
-
-unsigned int
-afr_locked_children_count (unsigned char *children, unsigned int child_count);
-
-unsigned int
-afr_pre_op_done_children_count (unsigned char *pre_op,
- unsigned int child_count);
+int
+afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
-gf_boolean_t
-afr_is_fresh_lookup (loc_t *loc, xlator_t *this);
+int
+afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode);
void
-afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent);
-
-int
-afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
+afr_replies_wipe (afr_local_t *local, afr_private_t *priv);
void
afr_local_cleanup (afr_local_t *local, xlator_t *this);
@@ -819,31 +822,16 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this);
int
afr_frame_return (call_frame_t *frame);
-gf_boolean_t
-afr_is_split_brain (xlator_t *this, inode_t *inode);
-
-void
-afr_set_split_brain (xlator_t *this, inode_t *inode, gf_boolean_t set);
-
int
afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
fd_t *fd, dict_t *xdata);
void
-afr_set_opendir_done (xlator_t *this, inode_t *inode);
-
-gf_boolean_t
-afr_is_opendir_done (xlator_t *this, inode_t *inode);
-
-void
afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);
int
afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
-int
-afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd);
-
#define AFR_STACK_UNWIND(fop, frame, params ...) \
do { \
afr_local_t *__local = NULL; \
@@ -874,7 +862,16 @@ afr_launch_openfd_self_heal (call_frame_t *frame, xlator_t *this, fd_t *fd);
} \
} while (0);
-#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
+#define AFR_FRAME_INIT(frame, op_errno) \
+ ({frame->local = mem_get0 (THIS->local_pool); \
+ if (afr_local_init (frame->local, THIS->private, &op_errno)) { \
+ afr_local_cleanup (frame->local, THIS); \
+ mem_put (frame->local); \
+ frame->local = NULL; }; \
+ frame->local;})
+
+#define AFR_STACK_RESET(frame) do { int opr; STACK_RESET (frame->root); AFR_FRAME_INIT(frame, opr);} while (0)
+
/* allocate and return a string that is the basename of argument */
static inline char *
AFR_BASENAME (const char *str)
@@ -887,6 +884,9 @@ AFR_BASENAME (const char *str)
return __basename_str;
}
+call_frame_t *
+afr_copy_frame (call_frame_t *base);
+
int
afr_transaction_local_init (afr_local_t *local, xlator_t *this);
@@ -894,9 +894,6 @@ int32_t
afr_marker_getxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv );
-int32_t *
-afr_children_create (int32_t child_count);
-
int
afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno);
@@ -905,100 +902,20 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count,
transaction_lk_type_t lk_type);
int
-afr_first_up_child (unsigned char *child_up, size_t child_count);
+afr_higher_errno (int32_t old_errno, int32_t new_errno);
int
-afr_select_read_child_from_policy (int32_t *fresh_children, int32_t child_count,
- int32_t prev_read_child,
- int32_t config_read_child, int32_t *sources,
- unsigned int hmode, uuid_t gfid);
+afr_final_errno (afr_local_t *local, afr_private_t *priv);
-void
-afr_set_read_ctx_from_policy (xlator_t *this, inode_t *inode,
- int32_t *fresh_children, int32_t prev_read_child,
- int32_t config_read_child, uuid_t gfid);
-
-int32_t
-afr_get_call_child (xlator_t *this, unsigned char *child_up, int32_t read_child,
- int32_t *fresh_children,
- int32_t *call_child, int32_t *last_index);
-
-int32_t
-afr_next_call_child (int32_t *fresh_children, unsigned char *child_up,
- size_t child_count, int32_t *last_index,
- int32_t read_child);
-void
-afr_get_fresh_children (int32_t *success_children, int32_t *sources,
- int32_t *children, unsigned int child_count);
-void
-afr_children_add_child (int32_t *children, int32_t child,
- int32_t child_count);
-void
-afr_children_rm_child (int32_t *children, int32_t child,
- int32_t child_count);
-void
-afr_reset_children (int32_t *children, int32_t child_count);
-gf_boolean_t
-afr_error_more_important (int32_t old_errno, int32_t new_errno);
int
-afr_errno_count (int32_t *children, int *child_errno,
- unsigned int child_count, int32_t op_errno);
-int
-afr_get_children_count (int32_t *children, unsigned int child_count);
-gf_boolean_t
-afr_is_child_present (int32_t *success_children, int32_t child_count,
- int32_t child);
-void
-afr_update_gfid_from_iatts (uuid_t uuid, struct iatt *bufs,
- int32_t *success_children,
- unsigned int child_count);
-void
-afr_reset_xattr (dict_t **xattr, unsigned int child_count);
-gf_boolean_t
-afr_conflicting_iattrs (struct iatt *bufs, int32_t *success_children,
- unsigned int child_count, const char *path,
- const char *xlator_name);
-unsigned int
-afr_gfid_missing_count (const char *xlator_name, int32_t *children,
- struct iatt *bufs, unsigned int child_count,
- const char *path);
-void
-afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req, const char *path);
-void
-afr_children_copy (int32_t *dst, int32_t *src, unsigned int child_count);
-afr_transaction_type
-afr_transaction_type_get (ia_type_t ia_type);
-int32_t
-afr_resultant_errno_get (int32_t *children,
- int *child_errno, unsigned int child_count);
-void
-afr_inode_rm_stale_children (xlator_t *this, inode_t *inode,
- int32_t *stale_children);
+afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req);
+
void
-afr_launch_self_heal (call_frame_t *frame, xlator_t *this, inode_t *inode,
- gf_boolean_t background, ia_type_t ia_type, char *reason,
- void (*gfid_sh_success_cbk) (call_frame_t *sh_frame,
- xlator_t *this),
- int (*unwind) (call_frame_t *frame, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- int32_t sh_failed));
-int
-afr_fix_open (call_frame_t *frame, xlator_t *this, afr_fd_ctx_t *fd_ctx,
- int need_open_count, int *need_open);
-int
-afr_open_fd_fix (call_frame_t *frame, xlator_t *this, gf_boolean_t pause_fop);
-int
-afr_set_elem_count_get (unsigned char *elems, int child_count);
+afr_fix_open (fd_t *fd, xlator_t *this);
afr_fd_ctx_t *
afr_fd_ctx_get (fd_t *fd, xlator_t *this);
-gf_boolean_t
-afr_open_only_data_self_heal (char *data_self_heal);
-
-gf_boolean_t
-afr_data_self_heal_enabled (char *data_self_heal);
-
void
afr_set_low_priority (call_frame_t *frame);
int
@@ -1013,6 +930,10 @@ afr_matrix_cleanup (int32_t **pending, unsigned int m);
int32_t**
afr_matrix_create (unsigned int m, unsigned int n);
+
+void
+afr_filter_xattrs (dict_t *xattr);
+
/*
* Special value indicating we should use the "auto" quorum method instead of
* a fixed value (including zero to turn off quorum enforcement).
@@ -1032,4 +953,24 @@ afr_matrix_create (unsigned int m, unsigned int n);
} \
} while (0);
+int
+afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
+
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
+
+void
+afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
+
+int
+afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
+
+void
+afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
+
+int
+afr_local_pathinfo (char *pathinfo, gf_boolean_t *is_local);
+
+void
+afr_remove_eager_lock_stub (afr_local_t *local);
#endif /* __AFR_H__ */
diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c
index ef49ef784..eed509956 100644
--- a/xlators/cluster/afr/src/pump.c
+++ b/xlators/cluster/afr/src/pump.c
@@ -21,6 +21,120 @@
#include "afr-common.c"
#include "defaults.c"
#include "glusterfs.h"
+#include "pump.h"
+
+
+static int
+afr_set_dict_gfid (dict_t *dict, uuid_t gfid)
+{
+ int ret = 0;
+ uuid_t *pgfid = NULL;
+
+ GF_ASSERT (gfid);
+
+ pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char);
+ if (!pgfid) {
+ ret = -1;
+ goto out;
+ }
+
+ uuid_copy (*pgfid, gfid);
+
+ ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t));
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR, "gfid set failed");
+
+out:
+ if (ret && pgfid)
+ GF_FREE (pgfid);
+ return ret;
+}
+
+static int
+afr_set_root_gfid (dict_t *dict)
+{
+ uuid_t gfid;
+ int ret = 0;
+
+ memset (gfid, 0, 16);
+ gfid[15] = 1;
+
+ ret = afr_set_dict_gfid (dict, gfid);
+
+ return ret;
+}
+
+static int
+afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
+{
+ int ret = -1;
+ uuid_t pargfid = {0};
+
+ if (!child)
+ goto out;
+
+ if (!uuid_is_null (parent->inode->gfid))
+ uuid_copy (pargfid, parent->inode->gfid);
+ else if (!uuid_is_null (parent->gfid))
+ uuid_copy (pargfid, parent->gfid);
+
+ if (uuid_is_null (pargfid))
+ goto out;
+
+ if (strcmp (parent->path, "/") == 0)
+ ret = gf_asprintf ((char **)&child->path, "/%s", name);
+ else
+ ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path,
+ name);
+
+ if (-1 == ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "asprintf failed while setting child path");
+ }
+
+ child->name = strrchr (child->path, '/');
+ if (child->name)
+ child->name++;
+
+ child->parent = inode_ref (parent->inode);
+ child->inode = inode_new (parent->inode->table);
+ uuid_copy (child->pargfid, pargfid);
+
+ if (!child->inode) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if ((ret == -1) && child)
+ loc_wipe (child);
+
+ return ret;
+}
+
+static void
+afr_build_root_loc (xlator_t *this, loc_t *loc)
+{
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ loc->path = gf_strdup ("/");
+ loc->name = "";
+ loc->inode = inode_ref (priv->root_inode);
+ uuid_copy (loc->gfid, loc->inode->gfid);
+}
+
+static void
+afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent)
+{
+ GF_ASSERT (loc);
+ GF_ASSERT (buf);
+
+ uuid_copy (loc->gfid, buf->ia_gfid);
+ if (postparent)
+ uuid_copy (loc->pargfid, postparent->ia_gfid);
+}
static uint64_t pump_pid = 0;
static inline void
@@ -387,54 +501,68 @@ gf_pump_traverse_directory (loc_t *loc)
if (ret)
goto out;
- if (!IS_ENTRY_CWD (entry->d_name) &&
- !IS_ENTRY_PARENT (entry->d_name)) {
-
- is_directory_empty = _gf_false;
- gf_log (this->name, GF_LOG_DEBUG,
- "lookup %s => %"PRId64,
- entry_loc.path,
- iatt.ia_ino);
-
- ret = syncop_lookup (this, &entry_loc, NULL,
- &iatt, &xattr_rsp, &parent);
-
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: lookup failed",
- entry_loc.path);
- continue;
- }
- pump_fill_loc_info (&entry_loc, &iatt,
- &parent);
-
- pump_update_resume_state (this, entry_loc.path);
-
- pump_save_path (this, entry_loc.path);
- pump_save_file_stats (this, entry_loc.path);
-
- ret = pump_check_and_update_status (this);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "Pump beginning to exit out");
- goto out;
- }
-
- if (IA_ISDIR (iatt.ia_type)) {
- if (is_pump_traversal_allowed (this, entry_loc.path)) {
- gf_log (this->name, GF_LOG_TRACE,
- "entering dir=%s",
- entry->d_name);
- gf_pump_traverse_directory (&entry_loc);
- }
- }
+ if ((strcmp (entry->d_name, ".") == 0) ||
+ (strcmp (entry->d_name, "..") == 0))
+ continue;
+
+ is_directory_empty = _gf_false;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "lookup %s => %"PRId64,
+ entry_loc.path,
+ iatt.ia_ino);
+
+ ret = syncop_lookup (this, &entry_loc, NULL, &iatt,
+ &xattr_rsp, &parent);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: lookup failed", entry_loc.path);
+ continue;
+ }
+
+ ret = afr_selfheal_name (this, loc->gfid, entry->d_name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: name self-heal failed (%s/%s)",
+ entry_loc.path, uuid_utoa (loc->gfid),
+ entry->d_name);
+ continue;
+ }
+
+ ret = afr_selfheal (this, iatt.ia_gfid);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s: self-heal failed (%s)",
+ entry_loc.path, uuid_utoa (iatt.ia_gfid));
+ continue;
+ }
+
+ pump_fill_loc_info (&entry_loc, &iatt, &parent);
+
+ pump_update_resume_state (this, entry_loc.path);
+
+ pump_save_path (this, entry_loc.path);
+ pump_save_file_stats (this, entry_loc.path);
+
+ ret = pump_check_and_update_status (this);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Pump beginning to exit out");
+ goto out;
+ }
+
+ if (IA_ISDIR (iatt.ia_type)) {
+ if (is_pump_traversal_allowed (this, entry_loc.path)) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "entering dir=%s", entry->d_name);
+ gf_pump_traverse_directory (&entry_loc);
+ }
}
}
gf_dirent_free (&entries);
free_entries = _gf_false;
- gf_log (this->name, GF_LOG_TRACE,
- "offset incremented to %d",
+ gf_log (this->name, GF_LOG_TRACE, "offset incremented to %d",
(int32_t ) offset);
}
@@ -443,7 +571,7 @@ gf_pump_traverse_directory (loc_t *loc)
if (ret < 0)
gf_log (this->name, GF_LOG_DEBUG, "closing the fd failed");
- if (is_directory_empty && IS_ROOT_PATH (loc->path)) {
+ if (is_directory_empty && (strcmp (loc->path, "/") == 0)) {
pump_change_state (this, PUMP_STATE_RUNNING);
gf_log (this->name, GF_LOG_INFO, "Empty source brick. "
"Nothing to be done.");
@@ -496,17 +624,18 @@ pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this,
afr_build_root_loc (this, &loc);
ret = syncop_removexattr (priv->children[source], &loc,
- PUMP_PATH);
+ PUMP_PATH, 0);
ret = syncop_removexattr (priv->children[sink], &loc,
- PUMP_SINK_COMPLETE);
+ PUMP_SINK_COMPLETE, 0);
for (i = 0; i < priv->child_count; i++) {
ret = syncop_removexattr (priv->children[i], &loc,
- PUMP_SOURCE_COMPLETE);
- if (ret)
+ PUMP_SOURCE_COMPLETE, 0);
+ if (ret) {
gf_log (this->name, GF_LOG_DEBUG, "removexattr "
- "failed with %s", strerror (errno));
+ "failed with %s", strerror (-ret));
+ }
}
loc_wipe (&loc);
@@ -598,6 +727,7 @@ pump_lookup_sink (loc_t *loc)
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
"Lookup on sink child failed");
+ ret = -1;
goto out;
}
@@ -1275,128 +1405,16 @@ out:
}
-struct _xattr_key {
- char *key;
- struct list_head list;
-};
-
-static int
-__gather_xattr_keys (dict_t *dict, char *key, data_t *value,
- void *data)
-{
- struct list_head * list = data;
- struct _xattr_key * xkey = NULL;
-
- if (!strncmp (key, AFR_XATTR_PREFIX,
- strlen (AFR_XATTR_PREFIX))) {
-
- xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key);
- if (!xkey)
- return -1;
-
- xkey->key = key;
- INIT_LIST_HEAD (&xkey->list);
-
- list_add_tail (&xkey->list, list);
- }
- return 0;
-}
-
-static void
-__filter_xattrs (dict_t *dict)
-{
- struct list_head keys;
-
- struct _xattr_key *key;
- struct _xattr_key *tmp;
-
- INIT_LIST_HEAD (&keys);
-
- dict_foreach (dict, __gather_xattr_keys,
- (void *) &keys);
-
- list_for_each_entry_safe (key, tmp, &keys, list) {
- dict_del (dict, key->key);
-
- list_del_init (&key->list);
-
- GF_FREE (key);
- }
-}
-
-int32_t
-pump_getxattr_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- xlator_t **children = NULL;
- int unwind = 1;
- int32_t *last_index = NULL;
- int32_t next_call_child = -1;
- int32_t read_child = -1;
- int32_t *fresh_children = NULL;
-
-
- priv = this->private;
- children = priv->children;
-
- local = frame->local;
-
- read_child = (long) cookie;
-
- if (op_ret == -1) {
- last_index = &local->cont.getxattr.last_index;
- fresh_children = local->fresh_children;
- next_call_child = afr_next_call_child (fresh_children,
- local->child_up,
- priv->child_count,
- last_index, read_child);
- if (next_call_child < 0)
- goto out;
-
- unwind = 0;
- STACK_WIND_COOKIE (frame, pump_getxattr_cbk,
- (void *) (long) read_child,
- children[next_call_child],
- children[next_call_child]->fops->getxattr,
- &local->loc,
- local->cont.getxattr.name, NULL);
- }
-
-out:
- if (unwind) {
- if (op_ret >= 0 && dict)
- __filter_xattrs (dict);
-
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL);
- }
-
- return 0;
-}
-
-int32_t
-pump_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name, dict_t *xdata)
+int
+pump_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- xlator_t ** children = NULL;
- int call_child = 0;
- afr_local_t *local = NULL;
- int32_t ret = -1;
- int32_t op_errno = 0;
- uint64_t read_child = 0;
-
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
+ afr_private_t *priv = NULL;
+ int op_errno = 0;
+ int ret = 0;
- priv = this->private;
- VALIDATE_OR_GOTO (priv->children, out);
+ priv = this->private;
- children = priv->children;
if (!priv->use_afr_in_pump) {
STACK_WIND (frame, default_getxattr_cbk,
FIRST_CHILD (this),
@@ -1405,14 +1423,6 @@ pump_getxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
-
- AFR_LOCAL_ALLOC_OR_GOTO (frame->local, out);
- local = frame->local;
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0)
- goto out;
-
if (name) {
if (!strncmp (name, AFR_XATTR_PREFIX,
strlen (AFR_XATTR_PREFIX))) {
@@ -1430,32 +1440,7 @@ pump_getxattr (call_frame_t *frame, xlator_t *this,
}
}
- local->fresh_children = GF_CALLOC (priv->child_count,
- sizeof (*local->fresh_children),
- gf_afr_mt_int32_t);
- if (!local->fresh_children) {
- ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- read_child = afr_inode_get_read_ctx (this, loc->inode, local->fresh_children);
- ret = afr_get_call_child (this, local->child_up, read_child,
- local->fresh_children,
- &call_child,
- &local->cont.getxattr.last_index);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
- loc_copy (&local->loc, loc);
- if (name)
- local->cont.getxattr.name = gf_strdup (name);
-
- STACK_WIND_COOKIE (frame, pump_getxattr_cbk,
- (void *) (long) call_child,
- children[call_child], children[call_child]->fops->getxattr,
- loc, name, xdata);
+ afr_getxattr (frame, this, loc, name, xdata);
ret = 0;
out:
@@ -1464,134 +1449,6 @@ out:
return 0;
}
-static int
-afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- if (local->transaction.main_frame)
- main_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
- }
- UNLOCK (&frame->lock);
-
- if (main_frame) {
- AFR_STACK_UNWIND (setxattr, main_frame,
- local->op_ret, local->op_errno, NULL);
- }
- return 0;
-}
-
-static int
-afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
-
- int call_count = -1;
- int need_unwind = 0;
-
- local = frame->local;
- priv = this->private;
-
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- if (local->success_count == 0) {
- local->op_ret = op_ret;
- }
- local->success_count++;
-
- if (local->success_count == priv->child_count) {
- need_unwind = 1;
- }
- }
-
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
-
- if (need_unwind)
- local->transaction.unwind (frame, this);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- }
-
- return 0;
-}
-
-static int
-afr_setxattr_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- int call_count = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- call_count = afr_up_children_count (local->child_up, priv->child_count);
-
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
- }
-
- local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->setxattr,
- &local->loc,
- local->cont.setxattr.dict,
- local->cont.setxattr.flags, NULL);
-
- if (!--call_count)
- break;
- }
- }
-
- return 0;
-}
-
-
-static int
-afr_setxattr_done (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t * local = frame->local;
-
- local->transaction.unwind (frame, this);
-
- AFR_STACK_DESTROY (frame);
-
- return 0;
-}
-
-int32_t
-pump_setxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno, dict_t *xdata)
-{
- AFR_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
int
pump_command_reply (call_frame_t *frame, xlator_t *this)
{
@@ -1615,51 +1472,56 @@ pump_command_reply (call_frame_t *frame, xlator_t *this)
}
int
-pump_parse_command (call_frame_t *frame, xlator_t *this,
- afr_local_t *local, dict_t *dict)
+pump_parse_command (call_frame_t *frame, xlator_t *this, dict_t *dict,
+ int *op_errno_p)
{
-
+ afr_local_t *local = NULL;
int ret = -1;
+ int op_errno = 0;
if (pump_command_start (this, dict)) {
- frame->local = local;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
local->dict = dict_ref (dict);
ret = pump_execute_start (frame, this);
} else if (pump_command_pause (this, dict)) {
- frame->local = local;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
local->dict = dict_ref (dict);
ret = pump_execute_pause (frame, this);
} else if (pump_command_abort (this, dict)) {
- frame->local = local;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
local->dict = dict_ref (dict);
ret = pump_execute_abort (frame, this);
} else if (pump_command_commit (this, dict)) {
- frame->local = local;
+ local = AFR_FRAME_INIT (frame, op_errno);
+ if (!local)
+ goto out;
local->dict = dict_ref (dict);
ret = pump_execute_commit (frame, this);
}
+out:
+ if (op_errno_p)
+ *op_errno_p = op_errno;
return ret;
}
int
-pump_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata)
+pump_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
+ afr_private_t *priv = NULL;
int ret = -1;
int op_errno = 0;
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (this, out);
- VALIDATE_OR_GOTO (this->private, out);
-
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict,
- op_errno, out);
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, op_errno, out);
priv = this->private;
if (!priv->use_afr_in_pump) {
@@ -1670,56 +1532,15 @@ pump_setxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
-
- AFR_LOCAL_ALLOC_OR_GOTO (local, out);
-
- ret = afr_local_init (local, priv, &op_errno);
- if (ret < 0) {
- afr_local_cleanup (local, this);
- goto out;
- }
-
- ret = pump_parse_command (frame, this,
- local, dict);
- if (ret >= 0) {
- ret = 0;
+ ret = pump_parse_command (frame, this, dict, &op_errno);
+ if (ret >= 0)
goto out;
- }
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory.");
- op_errno = ENOMEM;
- ret = -1;
- afr_local_cleanup (local, this);
- goto out;
- }
-
- transaction_frame->local = local;
-
- local->op_ret = -1;
-
- local->cont.setxattr.dict = dict_ref (dict);
- local->cont.setxattr.flags = flags;
- local->transaction.fop = afr_setxattr_wind;
- local->transaction.done = afr_setxattr_done;
- local->transaction.unwind = afr_setxattr_unwind;
-
- loc_copy (&local->loc, loc);
-
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
-
- afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
+ afr_setxattr (frame, this, loc, dict, flags, xdata);
ret = 0;
out:
if (ret < 0) {
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
}
@@ -2387,7 +2208,7 @@ int32_t
init (xlator_t *this)
{
afr_private_t * priv = NULL;
- pump_private_t *pump_priv = NULL;
+ pump_private_t *pump_priv = NULL;
int child_count = 0;
xlator_list_t * trav = NULL;
int i = 0;
@@ -2408,17 +2229,11 @@ init (xlator_t *this)
"Volume is dangling.");
}
- this->private = GF_CALLOC (1, sizeof (afr_private_t),
- gf_afr_mt_afr_private_t);
- if (!this->private)
+ priv = GF_CALLOC (1, sizeof (afr_private_t), gf_afr_mt_afr_private_t);
+ if (!priv)
goto out;
- priv = this->private;
LOCK_INIT (&priv->lock);
- LOCK_INIT (&priv->read_child_lock);
- //lock recovery is not done in afr
- pthread_mutex_init (&priv->mutex, NULL);
- INIT_LIST_HEAD (&priv->saved_fds);
child_count = xlator_subvolume_count (this);
if (child_count != 2) {
@@ -2437,14 +2252,13 @@ init (xlator_t *this)
priv->metadata_self_heal = 1;
priv->entry_self_heal = 1;
- priv->data_self_heal_algorithm = "";
-
priv->data_self_heal_window_size = 16;
priv->data_change_log = 1;
priv->metadata_change_log = 1;
priv->entry_change_log = 1;
priv->use_afr_in_pump = 1;
+ priv->sh_readdir_size = 65536;
/* Locking options */
@@ -2453,9 +2267,6 @@ init (xlator_t *this)
and the sink.
*/
- priv->strict_readdir = _gf_false;
-
- priv->wait_count = 1;
priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,
gf_afr_mt_char);
if (!priv->child_up) {
@@ -2489,7 +2300,8 @@ init (xlator_t *this)
while (i < child_count) {
priv->children[i] = trav->xlator;
- ret = gf_asprintf (&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
+ ret = gf_asprintf (&priv->pending_key[i], "%s.%s",
+ AFR_XATTR_PREFIX,
trav->xlator->name);
if (-1 == ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -2502,7 +2314,12 @@ init (xlator_t *this)
i++;
}
- priv->first_lookup = 1;
+ ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name);
+ if (-1 == ret) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
priv->root_inode = NULL;
priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event),
@@ -2527,13 +2344,12 @@ init (xlator_t *this)
pump_priv->resume_path = GF_CALLOC (1, PATH_MAX,
gf_afr_mt_char);
if (!pump_priv->resume_path) {
- gf_log (this->name, GF_LOG_ERROR,
- "Out of memory");
+ gf_log (this->name, GF_LOG_ERROR, "Out of memory");
ret = -1;
goto out;
}
- pump_priv->env = syncenv_new (0);
+ pump_priv->env = this->ctx->env;
if (!pump_priv->env) {
gf_log (this->name, GF_LOG_ERROR,
"Could not create new sync-environment");
@@ -2551,11 +2367,32 @@ init (xlator_t *this)
}
priv->pump_private = pump_priv;
+ pump_priv = NULL;
+
+ this->private = priv;
+ priv = NULL;
pump_change_state (this, PUMP_STATE_ABORT);
ret = 0;
out:
+
+ if (pump_priv) {
+ GF_FREE (pump_priv->resume_path);
+ LOCK_DESTROY (&pump_priv->resume_path_lock);
+ LOCK_DESTROY (&pump_priv->pump_state_lock);
+ GF_FREE (pump_priv);
+ }
+
+ if (priv) {
+ GF_FREE (priv->child_up);
+ GF_FREE (priv->children);
+ GF_FREE (priv->pending_key);
+ GF_FREE (priv->last_event);
+ LOCK_DESTROY (&priv->lock);
+ GF_FREE (priv);
+ }
+
return ret;
}
@@ -2574,9 +2411,6 @@ fini (xlator_t *this)
if (!pump_priv)
goto afr_priv;
- if (pump_priv->env)
- syncenv_destroy (pump_priv->env);
-
GF_FREE (pump_priv->resume_path);
LOCK_DESTROY (&pump_priv->resume_path_lock);
LOCK_DESTROY (&pump_priv->pump_state_lock);
diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h
index bc4c31a78..9d0b6db6a 100644
--- a/xlators/cluster/afr/src/pump.h
+++ b/xlators/cluster/afr/src/pump.h
@@ -75,4 +75,7 @@ pump_command_status (xlator_t *this, dict_t *dict);
int
pump_execute_status (call_frame_t *frame, xlator_t *this);
+int
+pump_command_reply (call_frame_t *frame, xlator_t *this);
+
#endif /* __PUMP_H__ */
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
index b78af1ff3..3fc29bf81 100644
--- a/xlators/cluster/dht/src/Makefile.am
+++ b/xlators/cluster/dht/src/Makefile.am
@@ -1,10 +1,9 @@
-
xlator_LTLIBRARIES = dht.la nufa.la switch.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \
dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \
- dht-common.c dht-inode-write.c dht-inode-read.c \
+ dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \
$(top_builddir)/xlators/lib/src/libxlator.c
dht_la_SOURCES = $(dht_common_source) dht.c
@@ -12,13 +11,13 @@ dht_la_SOURCES = $(dht_common_source) dht.c
nufa_la_SOURCES = $(dht_common_source) nufa.c
switch_la_SOURCES = $(dht_common_source) switch.c
-dht_la_LDFLAGS = -module -avoidversion
+dht_la_LDFLAGS = -module -avoid-version
dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-nufa_la_LDFLAGS = -module -avoidversion
+nufa_la_LDFLAGS = -module -avoid-version
nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-switch_la_LDFLAGS = -module -avoidversion
+switch_la_LDFLAGS = -module -avoid-version
switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
noinst_HEADERS = dht-common.h dht-mem-types.h \
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index f1f981890..3868fc38f 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -22,6 +22,7 @@
#include "dht-common.h"
#include "defaults.h"
#include "byte-order.h"
+#include "glusterfs-acl.h"
#include <sys/time.h>
#include <libgen.h>
@@ -62,6 +63,11 @@ dht_aggregate (dict_t *this, char *key, data_t *value, void *data)
}
*size = hton64 (ntoh64 (*size) + ntoh64 (*ptr));
+
+ } else if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) {
+ ret = gf_get_min_stime (THIS, dst, key, value);
+ if (ret < 0)
+ return ret;
} else {
/* compare user xattrs only */
if (!strncmp (key, "user.", strlen ("user."))) {
@@ -148,9 +154,11 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
int op_errno = 0;
int ret = -1;
dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
local = discover_frame->local;
layout = local->layout;
+ conf = this->private;
LOCK(&discover_frame->lock);
{
@@ -193,11 +201,14 @@ dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
"(overlaps/holes present: %s, "
"ENOENT errors: %d)", local->loc.path,
(ret < 0) ? "yes" : "no", (ret > 0) ? ret : 0);
- op_errno = EINVAL;
- goto out;
+ if ((ret > 0) && (ret == conf->subvolume_cnt)) {
+ op_errno = ESTALE;
+ goto out;
+ }
}
- dht_layout_set (this, local->inode, layout);
+ if (local->inode)
+ dht_layout_set (this, local->inode, layout);
}
DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno,
@@ -226,6 +237,7 @@ dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int is_dir = 0;
int is_linkfile = 0;
int attempt_unwind = 0;
+ dht_conf_t *conf = 0;
GF_VALIDATE_OR_GOTO ("dht", frame, out);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -235,6 +247,7 @@ dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
prev = cookie;
+ conf = this->private;
layout = local->layout;
@@ -269,7 +282,8 @@ dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto unlock;
}
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
is_dir = check_is_dir (inode, stbuf, xattr);
if (is_dir) {
@@ -328,23 +342,20 @@ dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc)
int i = 0;
call_frame_t *discover_frame = NULL;
-
conf = this->private;
local = frame->local;
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ ret = dict_set_uint32 (local->xattr_req, conf->xattr_name, 4 * 4);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set 'trusted.glusterfs.dht' key",
- loc->path);
+ "%s: failed to set '%s' key",
+ loc->path, conf->xattr_name);
- ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht.linkto", 256);
+ ret = dict_set_uint32 (local->xattr_req, conf->link_xattr_name, 256);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set 'trusted.glusterfs.dht.linkto' key",
- loc->path);
+ "%s: failed to set '%s' key",
+ loc->path, conf->link_xattr_name);
call_cnt = conf->subvolume_cnt;
local->call_cnt = call_cnt;
@@ -430,7 +441,7 @@ dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
op_ret, op_errno, xattr);
if (op_ret == -1) {
- local->op_errno = ENOENT;
+ local->op_errno = op_errno;
gf_log (this->name, GF_LOG_DEBUG,
"lookup of %s on %s returned error (%s)",
local->loc.path, prev->this->name,
@@ -525,6 +536,8 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int ret = -1;
int is_dir = 0;
int is_linkfile = 0;
+ call_frame_t *copy = NULL;
+ dht_local_t *copy_local = NULL;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, err);
@@ -583,7 +596,8 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
layout = local->layout;
is_dir = check_is_dir (inode, stbuf, xattr);
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
if (is_linkfile) {
gf_log (this->name, GF_LOG_INFO,
@@ -595,6 +609,23 @@ dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
if (is_dir) {
+ ret = dht_dir_has_layout (xattr, conf->xattr_name);
+ if (ret >= 0) {
+ if (is_greater_time(local->stbuf.ia_ctime,
+ local->stbuf.ia_ctime_nsec,
+ stbuf->ia_ctime,
+ stbuf->ia_ctime_nsec)) {
+ local->prebuf.ia_gid = stbuf->ia_gid;
+ local->prebuf.ia_uid = stbuf->ia_uid;
+ }
+ }
+ if (local->stbuf.ia_type != IA_INVAL)
+ {
+ if ((local->stbuf.ia_gid != stbuf->ia_gid) ||
+ (local->stbuf.ia_uid != stbuf->ia_uid)) {
+ local->need_selfheal = 1;
+ }
+ }
ret = dht_layout_dir_mismatch (this, layout,
prev->this, &local->loc,
xattr);
@@ -633,7 +664,28 @@ out:
&& (conf && conf->unhashed_sticky_bit)) {
local->stbuf.ia_prot.sticky = 1;
}
- if (local->layout_mismatch) {
+ if (local->need_selfheal) {
+ local->need_selfheal = 0;
+ uuid_copy (local->gfid, local->stbuf.ia_gfid);
+ local->stbuf.ia_gid = local->prebuf.ia_gid;
+ local->stbuf.ia_uid = local->prebuf.ia_uid;
+ copy = create_frame (this, this->ctx->pool);
+ if (copy) {
+ copy_local = dht_local_init (copy, &local->loc,
+ NULL, 0);
+ if (!copy_local)
+ goto cont;
+ copy_local->stbuf = local->stbuf;
+ copy->local = copy_local;
+ FRAME_SU_DO (copy, dht_local_t);
+ ret = synctask_new (this->ctx->env,
+ dht_dir_attr_heal,
+ dht_dir_attr_heal_done,
+ copy, copy);
+ }
+ }
+cont:
+ if (local->layout_mismatch) {
/* Found layout mismatch in the directory, need to
fix this in the inode context */
dht_layout_unref (this, local->layout);
@@ -661,7 +713,7 @@ out:
if (local->loc.parent) {
dht_inode_ctx_time_update (local->loc.parent, this,
- postparent, 1);
+ &local->postparent, 1);
}
DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
@@ -698,7 +750,7 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
cached_subvol = local->cached_subvol;
conf = this->private;
- ret = dht_layout_preset (this, local->cached_subvol, inode);
+ ret = dht_layout_preset (this, local->cached_subvol, local->loc.inode);
if (ret < 0) {
gf_log (this->name, GF_LOG_DEBUG,
"failed to set layout for subvolume %s",
@@ -714,12 +766,15 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
local->stbuf.ia_prot.sticky = 1;
}
-unwind:
if (local->loc.parent) {
dht_inode_ctx_time_update (local->loc.parent, this,
postparent, 1);
}
+unwind:
+ if (local->linked == _gf_true)
+ dht_linkfile_attr_heal (frame, this);
+
DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
local->inode, &local->stbuf, local->xattr,
@@ -843,7 +898,7 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
hashed_subvol->name);
ret = dht_linkfile_create (frame,
- dht_lookup_linkfile_create_cbk,
+ dht_lookup_linkfile_create_cbk, this,
cached_subvol, hashed_subvol, &local->loc);
return ret;
@@ -881,8 +936,9 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
xlator_t *subvol = NULL;
loc_t *loc = NULL;
xlator_t *link_subvol = NULL;
- int ret = -1;
- int32_t fd_count = 0;
+ int ret = -1;
+ int32_t fd_count = 0;
+ dht_conf_t *conf = NULL;
GF_VALIDATE_OR_GOTO ("dht", frame, out);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -892,6 +948,7 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
loc = &local->loc;
+ conf = this->private;
prev = cookie;
subvol = prev->this;
@@ -913,7 +970,8 @@ dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
loc->path, prev->this->name);
}
- is_linkfile = check_is_linkfile (inode, buf, xattr);
+ is_linkfile = check_is_linkfile (inode, buf, xattr,
+ conf->link_xattr_name);
is_dir = check_is_dir (inode, buf, xattr);
if (is_linkfile) {
@@ -1055,7 +1113,16 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
gf_log (this->name, GF_LOG_INFO,
"lookup of %s on %s (following linkfile) failed (%s)",
local->loc.path, subvol->name, strerror (op_errno));
- goto err;
+
+ /* If cached subvol returned ENOTCONN, do not do
+ lookup_everywhere. We need to make sure linkfile does not get
+ removed, which can take away the namespace, and subvol is
+ anyways down. */
+
+ if (op_errno != ENOTCONN)
+ goto err;
+ else
+ goto unwind;
}
if (check_is_dir (inode, stbuf, xattr)) {
@@ -1065,7 +1132,7 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
goto err;
}
- if (check_is_linkfile (inode, stbuf, xattr)) {
+ if (check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) {
gf_log (this->name, GF_LOG_INFO,
"lookup of %s on %s (following linkfile) reached link",
local->loc.path, subvol->name);
@@ -1093,12 +1160,12 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
op_errno = EINVAL;
}
-unwind:
if (local->loc.parent) {
dht_inode_ctx_time_update (local->loc.parent, this,
postparent, 1);
}
+unwind:
DHT_STRIP_PHASE1_FLAGS (stbuf);
DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
postparent);
@@ -1242,7 +1309,8 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
if (!is_linkfile) {
/* non-directory and not a linkfile */
@@ -1282,7 +1350,7 @@ out:
* from each of the subvolume. See dht_iatt_merge for reference.
*/
- if (local->loc.parent) {
+ if (!op_ret && local->loc.parent) {
dht_inode_ctx_time_update (local->loc.parent, this,
postparent, 1);
}
@@ -1347,7 +1415,6 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
conf = this->private;
if (!conf)
@@ -1422,7 +1489,7 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
* revalidates directly go to the cached-subvolume.
*/
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (IA_ISDIR (local->inode->ia_type)) {
local->call_cnt = call_cnt = conf->subvolume_cnt;
@@ -1457,10 +1524,10 @@ dht_lookup (call_frame_t *frame, xlator_t *this,
do_fresh_lookup:
/* TODO: remove the hard-coding */
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
ret = dict_set_uint32 (local->xattr_req,
- DHT_LINKFILE_KEY, 256);
+ conf->link_xattr_name, 256);
/* need it for self-healing linkfiles which is
'in-migration' state */
@@ -1568,7 +1635,8 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
LOCK (&frame->lock);
{
- if (op_ret == -1) {
+ if ((op_ret == -1) && !((op_errno == ENOENT) ||
+ (op_errno == ENOTCONN))) {
local->op_errno = op_errno;
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
@@ -1581,7 +1649,7 @@ dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
unlock:
UNLOCK (&frame->lock);
- if (op_ret == -1)
+ if (local->op_ret == -1)
goto err;
cached_subvol = dht_subvol_get_cached (this, local->loc.inode);
@@ -1605,41 +1673,6 @@ err:
return 0;
}
-static int
-dht_ufo_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
-
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_ret = -1;
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_DEBUG,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
- goto unlock;
- }
- }
-unlock:
- UNLOCK (&frame->lock);
-
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- DHT_STACK_UNWIND (setxattr, frame, local->op_ret,
- local->op_errno, NULL);
- }
-
- return 0;
-}
-
-
int
dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
@@ -1712,115 +1745,225 @@ dht_fill_pathinfo_xattr (xlator_t *this, dht_local_t *local,
}
int
-dht_vgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+dht_vgetxattr_alloc_and_fill (dht_local_t *local, dict_t *xattr, xlator_t *this,
+ int op_errno)
{
- dht_local_t *local = NULL;
- int ret = 0;
- int flag = 0;
- int this_call_cnt = 0;
- char *value_got = NULL;
- char layout_buf[8192] = {0,};
- char *xattr_buf = NULL;
- dict_t *dict = NULL;
- int32_t alloc_len = 0;
- int32_t plen = 0;
- call_frame_t *prev = NULL;
+ int ret = -1;
+ char *value = NULL;
+ int32_t plen = 0;
- local = frame->local;
- prev = cookie;
+ ret = dict_get_str (xattr, local->xsel, &value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Subvolume %s returned -1 (%s)", this->name,
+ strerror (op_errno));
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto out;
+ }
- if (op_ret >= 0) {
- ret = dict_get_str (xattr, local->xsel, &value_got);
- if (!ret) {
- alloc_len = strlen (value_got);
+ local->alloc_len += strlen(value);
- /**
- * allocate the buffer:- we allocate 10 bytes extra in
- * case we need to append ' Link: ' in the buffer for
- * another STACK_WIND
- */
+ if (!local->xattr_val) {
+ local->alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10);
+ local->xattr_val = GF_CALLOC (local->alloc_len, sizeof (char),
+ gf_common_mt_char);
+ if (!local->xattr_val) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ if (local->xattr_val) {
+ plen = strlen (local->xattr_val);
+ if (plen) {
+ /* extra byte(s) for \0 to be safe */
+ local->alloc_len += (plen + 2);
+ local->xattr_val = GF_REALLOC (local->xattr_val,
+ local->alloc_len);
if (!local->xattr_val) {
- alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10);
- local->xattr_val =
- GF_CALLOC (alloc_len,
- sizeof (char),
- gf_common_mt_char);
+ ret = -1;
+ goto out;
}
+ }
- if (local->xattr_val) {
- plen = strlen (local->xattr_val);
- if (plen) {
- void *p;
- /* extra byte(s) for \0 to be safe */
- alloc_len += (plen + 2);
- p = GF_REALLOC (local->xattr_val,
- alloc_len);
- if (!p)
- goto out;
- local->xattr_val = p;
- }
+ (void) strcat (local->xattr_val, value);
+ (void) strcat (local->xattr_val, " ");
+ local->op_ret = 0;
+ }
- strcat (local->xattr_val, value_got);
- }
- local->op_ret = 0;
- }
+ ret = 0;
+
+ out:
+ return ret;
+}
+
+int
+dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this,
+ gf_boolean_t flag)
+{
+ int ret = -1;
+ char *xattr_buf = NULL;
+ char layout_buf[8192] = {0,};
+
+ if (flag)
+ fill_layout_info (local->layout, layout_buf);
+
+ *dict = dict_new ();
+ if (!*dict)
+ goto out;
+
+ local->xattr_val[strlen (local->xattr_val) - 1] = '\0';
+
+ /* we would need max this many bytes to create xattr string
+ * extra 40 bytes is just an estimated amount of additional
+ * space required as we include translator name and some
+ * spaces, brackets etc. when forming the pathinfo string.
+ *
+ * For node-uuid we just don't have all the pretty formatting,
+ * but since this is a generic routine for pathinfo & node-uuid
+ * we dont have conditional space allocation and try to be
+ * generic
+ */
+ local->alloc_len += (2 * strlen (this->name))
+ + strlen (layout_buf)
+ + 40;
+ xattr_buf = GF_CALLOC (local->alloc_len, sizeof (char),
+ gf_common_mt_char);
+ if (!xattr_buf)
+ goto out;
+
+ if (XATTR_IS_PATHINFO (local->xsel)) {
+ (void) dht_fill_pathinfo_xattr (this, local, xattr_buf,
+ local->alloc_len, flag,
+ layout_buf);
+ } else if (XATTR_IS_NODE_UUID (local->xsel)) {
+ (void) snprintf (xattr_buf, local->alloc_len, "%s",
+ local->xattr_val);
} else {
- local->op_ret = -1;
- local->op_errno = op_errno;
- gf_log (this->name, GF_LOG_ERROR, "Subvolume %s returned -1 "
- "(%s)", prev->this->name, strerror (op_errno));
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unknown local->xsel (%s)", local->xsel);
+ GF_FREE (xattr_buf);
+ goto out;
}
+ ret = dict_set_dynstr (*dict, local->xsel, xattr_buf);
+ if (ret)
+ GF_FREE (xattr_buf);
+ GF_FREE (local->xattr_val);
+
out:
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
+ return ret;
+}
- if (local->op_ret == -1) {
- goto unwind;
- }
+int
+dht_vgetxattr_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ int ret = 0;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ dict_t *dict = NULL;
- if (local->layout->cnt > 1) {
- /* Set it for directory */
- fill_layout_info (local->layout, layout_buf);
- flag = 1;
- }
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (frame->local, out);
- dict = dict_new ();
-
- /* we would need max this many bytes to create xattr string */
- alloc_len += (2 * strlen (this->name))
- + strlen (layout_buf)
- + 40;
- xattr_buf = GF_CALLOC (alloc_len,
- sizeof (char), gf_common_mt_char);
-
- if (XATTR_IS_PATHINFO (local->xsel)) {
- (void) dht_fill_pathinfo_xattr (this, local, xattr_buf,
- alloc_len, flag,
- layout_buf);
- } else if (XATTR_IS_NODE_UUID (local->xsel)) {
- (void) snprintf (xattr_buf, alloc_len, "%s",
- local->xattr_val);
- } else
- gf_log (this->name, GF_LOG_WARNING,
- "Unknown local->xsel (%s)", local->xsel);
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ this_call_cnt = --local->call_cnt;
+ if (op_ret < 0) {
+ if (op_errno != ENOTCONN) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "getxattr err (%s) for dir",
+ strerror (op_errno));
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ }
- ret = dict_set_dynstr (dict, local->xsel, xattr_buf);
+ goto unlock;
+ }
- GF_FREE (local->xattr_val);
+ ret = dht_vgetxattr_alloc_and_fill (local, xattr, this,
+ op_errno);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "alloc or fill failure");
+ }
+ unlock:
+ UNLOCK (&frame->lock);
- DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict,
- xdata);
+ if (!is_last_call (this_call_cnt))
+ goto out;
- if (dict)
- dict_unref (dict);
+ /* -- last call: do patch ups -- */
- return 0;
+ if (local->op_ret == -1) {
+ goto unwind;
}
-unwind:
+ ret = dht_vgetxattr_fill_and_set (local, &dict, this, _gf_true);
+ if (ret)
+ goto unwind;
+
+ DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata);
+ goto cleanup;
+
+ unwind:
DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, NULL);
+ cleanup:
+ if (dict)
+ dict_unref (dict);
+ out:
+ return 0;
+}
+
+int
+dht_vgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int ret = 0;
+ dict_t *dict = NULL;
+ call_frame_t *prev = NULL;
+ gf_boolean_t flag = _gf_true;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ gf_log (this->name, GF_LOG_ERROR, "Subvolume %s returned -1 "
+ "(%s)", prev->this->name, strerror (op_errno));
+ goto unwind;
+ }
+
+ ret = dht_vgetxattr_alloc_and_fill (local, xattr, this,
+ op_errno);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "alloc or fill failure");
+ goto unwind;
+ }
+
+ flag = (local->layout->cnt > 1) ? _gf_true : _gf_false;
+
+ ret = dht_vgetxattr_fill_and_set (local, &dict, this, flag);
+ if (ret)
+ goto unwind;
+
+ DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata);
+ goto cleanup;
+
+ unwind:
+ DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno,
+ NULL, NULL);
+ cleanup:
+ if (dict)
+ dict_unref (dict);
+
return 0;
}
@@ -1853,10 +1996,13 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
int this_call_cnt = 0;
dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (frame->local, out);
+ VALIDATE_OR_GOTO (this->private, out);
+ conf = this->private;
local = frame->local;
this_call_cnt = dht_frame_return (frame);
@@ -1864,21 +2010,21 @@ dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (!xattr || (op_ret == -1))
goto out;
- if (dict_get (xattr, "trusted.glusterfs.dht")) {
- dict_del (xattr, "trusted.glusterfs.dht");
+ if (dict_get (xattr, conf->xattr_name)) {
+ dict_del (xattr, conf->xattr_name);
+ }
+
+ if (frame->root->pid >= 0 ) {
+ GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
+ GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
}
+
local->op_ret = 0;
if (!local->xattr) {
local->xattr = dict_copy_with_ref (xattr, NULL);
} else {
- /* first aggregate everything into xattr and then copy into
- * local->xattr. This is required as we want to have
- * 'local->xattr' as the proper final dictionary passed above
- * distribute xlator.
- */
- dht_aggregate_xattr (xattr, local->xattr);
- local->xattr = dict_copy (xattr, local->xattr);
+ dht_aggregate_xattr (local->xattr, xattr);
}
out:
if (is_last_call (this_call_cnt)) {
@@ -1898,9 +2044,72 @@ dht_getxattr_unwind (call_frame_t *frame,
int
+dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
+{
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+
+
+ local = frame->local;
+
+ if (op_ret != -1) {
+ if (local->xattr)
+ dict_unref (local->xattr);
+ local->xattr = dict_ref (xattr);
+
+ if (local->xattr_req)
+ dict_unref (local->xattr_req);
+ local->xattr_req = dict_ref (xdata);
+ }
+
+ this_call_cnt = dht_frame_return (frame);
+ if (is_last_call (this_call_cnt)) {
+ DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno,
+ local->xattr, local->xattr_req);
+ }
+
+ return 0;
+}
+
+
+int
+dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *key, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int i = 0;
+ dht_layout_t *layout = NULL;
+ int cnt = 0;
+ xlator_t *subvol = NULL;
+
+
+ local = frame->local;
+ layout = local->layout;
+
+ cnt = local->call_cnt = layout->cnt;
+
+ local->op_ret = -1;
+ local->op_errno = ENODATA;
+
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND (frame, dht_getxattr_get_real_filename_cbk,
+ subvol, subvol->fops->getxattr,
+ loc, key, xdata);
+ }
+
+ return 0;
+}
+
+
+int
dht_getxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *key, dict_t *xdata)
+#define DHT_IS_DIR(layout) (layout->cnt > 1)
{
+
xlator_t *subvol = NULL;
xlator_t *hashed_subvol = NULL;
xlator_t *cached_subvol = NULL;
@@ -1916,7 +2125,6 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
VALIDATE_OR_GOTO (this->private, err);
conf = this->private;
@@ -1944,8 +2152,40 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
}
}
- if (key && ((strcmp (key, GF_XATTR_PATHINFO_KEY) == 0)
- || strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0)) {
+ if (key &&
+ (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY,
+ strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)
+ && DHT_IS_DIR(layout)) {
+ dht_getxattr_get_real_filename (frame, this, loc, key, xdata);
+ return 0;
+ }
+
+ /* for file use cached subvolume (obviously!): see if {}
+ * below
+ * for directory:
+ * wind to all subvolumes and exclude subvolumes which
+ * return ENOTCONN (in callback)
+ *
+ * NOTE: Don't trust inode here, as that may not be valid
+ * (until inode_link() happens)
+ */
+ if (key && DHT_IS_DIR(layout) &&
+ (XATTR_IS_PATHINFO (key)
+ || (strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0))) {
+ (void) strncpy (local->xsel, key, 256);
+ cnt = local->call_cnt = layout->cnt;
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND (frame, dht_vgetxattr_dir_cbk,
+ subvol, subvol->fops->getxattr,
+ loc, key, NULL);
+ }
+ return 0;
+ }
+
+ /* node-uuid or pathinfo for files */
+ if (key && ((strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0)
+ || XATTR_IS_PATHINFO (key))) {
cached_subvol = local->cached_subvol;
(void) strncpy (local->xsel, key, 256);
@@ -1955,6 +2195,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
+
if (key && (strcmp (key, GF_XATTR_LINKINFO_KEY) == 0)) {
hashed_subvol = dht_subvol_get_hashed (this, loc);
if (!hashed_subvol) {
@@ -1987,13 +2228,13 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
}
if (key && (!strcmp (GF_XATTR_MARKER_KEY, key))
- && (-1 == frame->root->pid)) {
-
- if (loc->inode-> ia_type == IA_IFDIR) {
+ && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
+ if (DHT_IS_DIR(layout)) {
cnt = layout->cnt;
} else {
cnt = 1;
}
+
sub_volumes = alloca ( cnt * sizeof (xlator_t *));
for (i = 0; i < cnt; i++)
*(sub_volumes + i) = layout->list[i].xlator;
@@ -2001,7 +2242,8 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
if (cluster_getmarkerattr (frame, this, loc, key,
local, dht_getxattr_unwind,
sub_volumes, cnt,
- MARKER_UUID_TYPE, conf->vol_uuid)) {
+ MARKER_UUID_TYPE, marker_uuid_default_gauge,
+ conf->vol_uuid)) {
op_errno = EINVAL;
goto err;
}
@@ -2009,10 +2251,22 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
+ if (key && !strcmp (GF_XATTR_QUOTA_LIMIT_LIST, key)) {
+ /* quota hardlimit and aggregated size of a directory is stored
+ * in inode contexts of each brick. Hence its good enough that
+ * we send getxattr for this key to any brick.
+ */
+ local->call_cnt = 1;
+ subvol = dht_first_up_subvol (this);
+ STACK_WIND (frame, dht_getxattr_cbk, subvol,
+ subvol->fops->getxattr, loc, key, xdata);
+ return 0;
+ }
+
if (key && *conf->vol_uuid) {
if ((match_uuid_local (key, conf->vol_uuid) == 0) &&
- (-1 == frame->root->pid)) {
- if (loc->inode-> ia_type == IA_IFDIR) {
+ (GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
+ if (DHT_IS_DIR(layout)) {
cnt = layout->cnt;
} else {
cnt = 1;
@@ -2025,6 +2279,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
local, dht_getxattr_unwind,
sub_volumes, cnt,
MARKER_XTIME_TYPE,
+ marker_xtime_default_gauge,
conf->vol_uuid)) {
op_errno = EINVAL;
goto err;
@@ -2034,7 +2289,7 @@ dht_getxattr (call_frame_t *frame, xlator_t *this,
}
}
- if (loc->inode-> ia_type == IA_IFDIR) {
+ if (DHT_IS_DIR(layout)) {
cnt = local->call_cnt = layout->cnt;
} else {
cnt = local->call_cnt = 1;
@@ -2054,6 +2309,7 @@ err:
return 0;
}
+#undef DHT_IS_DIR
int
dht_fgetxattr (call_frame_t *frame, xlator_t *this,
@@ -2095,7 +2351,10 @@ dht_fgetxattr (call_frame_t *frame, xlator_t *this,
}
}
- if (fd->inode->ia_type == IA_IFDIR) {
+ if ((fd->inode->ia_type == IA_IFDIR)
+ && key
+ && (strncmp (key, GF_XATTR_LOCKINFO_KEY,
+ strlen (GF_XATTR_LOCKINFO_KEY) != 0))) {
cnt = local->call_cnt = layout->cnt;
} else {
cnt = local->call_cnt = 1;
@@ -2123,13 +2382,17 @@ dht_fsetxattr (call_frame_t *frame, xlator_t *this,
xlator_t *subvol = NULL;
dht_local_t *local = NULL;
int op_errno = EINVAL;
+ dht_conf_t *conf = NULL;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (fd, err);
VALIDATE_OR_GOTO (fd->inode, err);
+ VALIDATE_OR_GOTO (this->private, err);
+
+ conf = this->private;
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.dht*", xattr,
+ GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
op_errno, err);
local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR);
@@ -2233,12 +2496,12 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.dht*", xattr,
+ conf = this->private;
+
+ GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
op_errno, err);
- conf = this->private;
local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR);
if (!local) {
op_errno = ENOMEM;
@@ -2263,25 +2526,6 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
local->call_cnt = call_cnt = layout->cnt;
- /* This key is sent by Unified File and Object storage
- * to test xattr support in backend.
- */
- tmp = dict_get (xattr, "user.ufo-test");
- if (tmp) {
- if (IA_ISREG (loc->inode->ia_type)) {
- op_errno = ENOTSUP;
- goto err;
- }
- local->op_ret = 0;
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_ufo_xattr_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->setxattr,
- loc, xattr, flags, NULL);
- }
- return 0;
- }
-
tmp = dict_get (xattr, "distribute.migrate-data");
if (tmp) {
if (IA_ISDIR (loc->inode->ia_type)) {
@@ -2356,9 +2600,13 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
gf_log (this->name, GF_LOG_INFO,
"fixing the layout of %s", loc->path);
- dht_fix_directory_layout (frame, dht_common_setxattr_cbk,
- layout);
- return 0;
+ ret = dht_fix_directory_layout (frame, dht_common_setxattr_cbk,
+ layout);
+ if (ret) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ return ret;
}
tmp = dict_get (xattr, "distribute.directory-spread-count");
@@ -2370,10 +2618,14 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
(dir_spread > 0))) {
layout->spread_cnt = dir_spread;
- dht_fix_directory_layout (frame,
- dht_common_setxattr_cbk,
- layout);
- return 0;
+ ret = dht_fix_directory_layout (frame,
+ dht_common_setxattr_cbk,
+ layout);
+ if (ret) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ return ret;
}
gf_log (this->name, GF_LOG_ERROR,
"wrong 'directory-spread-count' value (%s)", value);
@@ -2443,18 +2695,20 @@ dht_removexattr (call_frame_t *frame, xlator_t *this,
dht_local_t *local = NULL;
dht_layout_t *layout = NULL;
int call_cnt = 0;
+ dht_conf_t *conf = NULL;
int i;
VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (this->private, err);
- GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.dht*",
- key, op_errno, err);
+ conf = this->private;
+
+ GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err);
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
local = dht_local_init (frame, loc, NULL, GF_FOP_REMOVEXATTR);
if (!local) {
@@ -2506,13 +2760,16 @@ dht_fremovexattr (call_frame_t *frame, xlator_t *this,
dht_local_t *local = NULL;
dht_layout_t *layout = NULL;
int call_cnt = 0;
+ dht_conf_t *conf = 0;
int i;
VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (this->private, err);
- GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.dht*",
- key, op_errno, err);
+ conf = this->private;
+
+ GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err);
VALIDATE_OR_GOTO (frame, err);
@@ -2620,11 +2877,16 @@ int
dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct statvfs *statvfs, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- int bsize = 0;
- int frsize = 0;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ int bsize = 0;
+ int frsize = 0;
+ int8_t quota_deem_statfs = 0;
+ GF_UNUSED int ret = 0;
+ unsigned long new_usage = 0;
+ unsigned long cur_usage = 0;
+ ret = dict_get_int8 (xdata, "quota-deem-statfs", &quota_deem_statfs);
local = frame->local;
@@ -2634,8 +2896,22 @@ dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->op_errno = op_errno;
goto unlock;
}
+ if (!statvfs) {
+ op_errno = EINVAL;
+ local->op_ret = -1;
+ goto unlock;
+ }
local->op_ret = 0;
+ if (quota_deem_statfs) {
+ new_usage = statvfs->f_blocks - statvfs->f_bfree;
+ cur_usage = local->statvfs.f_blocks - local->statvfs.f_bfree;
+ /* We take the maximux of the usage from the subvols */
+ if (new_usage >= cur_usage)
+ local->statvfs = *statvfs;
+ goto unlock;
+ }
+
if (local->statvfs.f_bsize != 0) {
bsize = max(local->statvfs.f_bsize, statvfs->f_bsize);
frsize = max(local->statvfs.f_frsize, statvfs->f_frsize);
@@ -2656,6 +2932,7 @@ dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->statvfs.f_flag = statvfs->f_flag;
local->statvfs.f_namemax = statvfs->f_namemax;
+
}
unlock:
UNLOCK (&frame->lock);
@@ -2683,7 +2960,6 @@ dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
VALIDATE_OR_GOTO (this->private, err);
conf = this->private;
@@ -2803,10 +3079,13 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
list_for_each_entry (orig_entry, (&orig_entries->list), list) {
next_offset = orig_entry->d_off;
- if ((check_is_dir (NULL, (&orig_entry->d_stat), NULL) &&
- (prev->this != dht_first_up_subvol (this))) ||
- check_is_linkfile (NULL, (&orig_entry->d_stat),
- orig_entry->dict)) {
+ if (check_is_dir (NULL, (&orig_entry->d_stat), NULL) &&
+ (prev->this != local->first_up_subvol)) {
+ continue;
+ }
+ if (check_is_linkfile (NULL, (&orig_entry->d_stat),
+ orig_entry->dict,
+ conf->link_xattr_name)) {
continue;
}
@@ -2841,7 +3120,7 @@ dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
/* making sure we set the inode ctx right with layout,
currently possible only for non-directories, so for
directories don't set entry inodes */
- if (!IA_ISDIR(entry->d_stat.ia_type)) {
+ if (!IA_ISDIR(entry->d_stat.ia_type) && orig_entry->inode) {
ret = dht_layout_preset (this, prev->this,
orig_entry->inode);
if (ret)
@@ -2883,13 +3162,16 @@ done:
}
if (conf->readdir_optimize == _gf_true) {
- if (next_subvol != dht_first_up_subvol (this)) {
+ if (next_subvol != local->first_up_subvol) {
ret = dict_set_int32 (local->xattr,
GF_READDIR_SKIP_DIRS, 1);
if (ret)
gf_log (this->name, GF_LOG_ERROR,
"dict set failed");
- }
+ } else {
+ dict_del (local->xattr,
+ GF_READDIR_SKIP_DIRS);
+ }
}
STACK_WIND (frame, dht_readdirp_cbk,
@@ -3022,6 +3304,7 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (fd, err);
+ VALIDATE_OR_GOTO (this->private, err);
conf = this->private;
@@ -3034,6 +3317,7 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
local->fd = fd_ref (fd);
local->size = size;
local->xattr_req = (dict)? dict_ref (dict) : NULL;
+ local->first_up_subvol = dht_first_up_subvol (this);
dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff);
@@ -3046,20 +3330,22 @@ dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
if (local->xattr) {
ret = dict_set_uint32 (local->xattr,
- "trusted.glusterfs.dht.linkto",
- 256);
+ conf->link_xattr_name, 256);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
- "failed to set 'glusterfs.dht.linkto'"
- " key");
+ "failed to set '%s' key",
+ conf->link_xattr_name);
if (conf->readdir_optimize == _gf_true) {
- if (xvol != dht_first_up_subvol (this)) {
+ if (xvol != local->first_up_subvol) {
ret = dict_set_int32 (local->xattr,
GF_READDIR_SKIP_DIRS, 1);
if (ret)
gf_log (this->name,
GF_LOG_ERROR,
"Dict set failed");
+ } else {
+ dict_del (local->xattr,
+ GF_READDIR_SKIP_DIRS);
}
}
}
@@ -3195,7 +3481,7 @@ dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_t *inode, struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- call_frame_t *prev = NULL;
+ xlator_t *prev = NULL;
int ret = -1;
dht_local_t *local = NULL;
@@ -3220,15 +3506,17 @@ dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
postparent, 1);
}
- ret = dht_layout_preset (this, prev->this, inode);
+ ret = dht_layout_preset (this, prev, inode);
if (ret < 0) {
gf_log (this->name, GF_LOG_DEBUG,
"could not set pre-set layout for subvolume %s",
- prev->this->name);
+ prev? prev->name: NULL);
op_ret = -1;
op_errno = EINVAL;
goto out;
}
+ if (local->linked == _gf_true)
+ dht_linkfile_attr_heal (frame, this);
out:
/*
* FIXME: ia_size and st_blocks of preparent and postparent do not have
@@ -3237,7 +3525,6 @@ out:
* corresponding values from each of the subvolume.
* See dht_iatt_merge for reference.
*/
-
DHT_STRIP_PHASE1_FLAGS (stbuf);
DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, stbuf,
preparent, postparent, xdata);
@@ -3259,12 +3546,17 @@ dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie,
goto err;
local = frame->local;
+ if (!local || !local->cached_subvol) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
cached_subvol = local->cached_subvol;
- STACK_WIND (frame, dht_newfile_cbk,
- cached_subvol, cached_subvol->fops->mknod,
- &local->loc, local->mode, local->rdev, local->umask,
- local->params);
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)cached_subvol,
+ cached_subvol, cached_subvol->fops->mknod,
+ &local->loc, local->mode, local->rdev, local->umask,
+ local->params);
return 0;
err:
@@ -3307,11 +3599,13 @@ dht_mknod (call_frame_t *frame, xlator_t *this,
gf_log (this->name, GF_LOG_TRACE,
"creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->mknod,
- loc, mode, rdev, umask, params);
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol,
+ subvol, subvol->fops->mknod, loc, mode,
+ rdev, umask, params);
} else {
- avail_subvol = dht_free_disk_available_subvol (this, subvol);
+
+ avail_subvol = dht_free_disk_available_subvol (this, subvol,
+ local);
if (avail_subvol != subvol) {
/* Choose the minimum filled volume, and create the
files there */
@@ -3323,14 +3617,15 @@ dht_mknod (call_frame_t *frame, xlator_t *this,
local->umask = umask;
dht_linkfile_create (frame,
dht_mknod_linkfile_create_cbk,
- avail_subvol, subvol, loc);
+ this, avail_subvol, subvol, loc);
} else {
gf_log (this->name, GF_LOG_TRACE,
"creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->mknod,
- loc, mode, rdev, umask, params);
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk,
+ (void *)subvol, subvol,
+ subvol->fops->mknod, loc, mode,
+ rdev, umask, params);
}
}
@@ -3375,9 +3670,9 @@ dht_symlink (call_frame_t *frame, xlator_t *this,
gf_log (this->name, GF_LOG_TRACE,
"creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->symlink,
- linkname, loc, umask, params);
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol,
+ subvol->fops->symlink, linkname, loc, umask,
+ params);
return 0;
@@ -3422,12 +3717,12 @@ dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
}
hashed_subvol = dht_subvol_get_hashed (this, loc);
+ /* Dont fail unlink if hashed_subvol is NULL which can be the result
+ * of layout anomaly */
if (!hashed_subvol) {
gf_log (this->name, GF_LOG_DEBUG,
"no subvolume in layout for path=%s",
loc->path);
- op_errno = EINVAL;
- goto err;
}
cached_subvol = local->cached_subvol;
@@ -3439,7 +3734,7 @@ dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
}
local->flags = xflag;
- if (hashed_subvol != cached_subvol) {
+ if (hashed_subvol && hashed_subvol != cached_subvol) {
STACK_WIND (frame, dht_unlink_linkfile_cbk,
hashed_subvol, hashed_subvol->fops->unlink, loc,
xflag, xdata);
@@ -3491,7 +3786,10 @@ dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_inode_ctx_time_update (local->loc.parent, this,
postparent, 1);
}
-
+ if (local->linked == _gf_true) {
+ local->stbuf = *stbuf;
+ dht_linkfile_attr_heal (frame, this);
+ }
out:
DHT_STRIP_PHASE1_FLAGS (stbuf);
DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent,
@@ -3578,7 +3876,7 @@ dht_link (call_frame_t *frame, xlator_t *this,
if (hashed_subvol != cached_subvol) {
uuid_copy (local->gfid, oldloc->inode->gfid);
- dht_linkfile_create (frame, dht_link_linkfile_cbk,
+ dht_linkfile_create (frame, dht_link_linkfile_cbk, this,
cached_subvol, hashed_subvol, newloc);
} else {
STACK_WIND (frame, dht_link_cbk,
@@ -3635,7 +3933,10 @@ dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
op_errno = EINVAL;
goto out;
}
-
+ if (local->linked == _gf_true) {
+ local->stbuf = *stbuf;
+ dht_linkfile_attr_heal (frame, this);
+ }
out:
DHT_STRIP_PHASE1_FLAGS (stbuf);
DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, stbuf, preparent,
@@ -3725,7 +4026,7 @@ dht_create (call_frame_t *frame, xlator_t *this,
}
/* Choose the minimum filled volume, and create the
files there */
- avail_subvol = dht_free_disk_available_subvol (this, subvol);
+ avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
if (avail_subvol != subvol) {
local->params = dict_ref (params);
local->flags = flags;
@@ -3736,9 +4037,8 @@ dht_create (call_frame_t *frame, xlator_t *this,
gf_log (this->name, GF_LOG_TRACE,
"creating %s on %s (link at %s)", loc->path,
avail_subvol->name, subvol->name);
- dht_linkfile_create (frame,
- dht_create_linkfile_create_cbk,
- avail_subvol, subvol, loc);
+ dht_linkfile_create (frame, dht_create_linkfile_create_cbk,
+ this, avail_subvol, subvol, loc);
goto done;
}
gf_log (this->name, GF_LOG_TRACE,
@@ -3811,6 +4111,15 @@ dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
ret = dht_layout_merge (this, layout, prev->this,
-1, ENOSPC, NULL);
} else {
+ if (op_ret == -1 && op_errno == EEXIST)
+ /* Very likely just a race between mkdir and
+ self-heal (from lookup of a concurrent mkdir
+ attempt).
+ Ignore error for now. layout setting will
+ anyways fail if this was a different (old)
+ pre-existing different directory.
+ */
+ op_ret = 0;
ret = dht_layout_merge (this, layout, prev->this,
op_ret, op_errno, NULL);
}
@@ -4268,6 +4577,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
call_frame_t *main_frame = NULL;
dht_local_t *main_local = NULL;
int this_call_cnt = 0;
+ dht_conf_t *conf = this->private;
local = frame->local;
prev = cookie;
@@ -4279,7 +4589,7 @@ dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret != 0)
goto err;
- if (check_is_linkfile (inode, stbuf, xattr) == 0) {
+ if (!check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) {
main_local->op_ret = -1;
main_local->op_errno = ENOTEMPTY;
@@ -4304,16 +4614,85 @@ err:
int
+dht_rmdir_cached_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr,
+ struct iatt *parent)
+{
+ dht_local_t *local = NULL;
+ xlator_t *src = NULL;
+ call_frame_t *main_frame = NULL;
+ dht_local_t *main_local = NULL;
+ int this_call_cnt = 0;
+ dht_conf_t *conf = this->private;
+ dict_t *xattrs = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ src = local->hashed_subvol;
+
+ main_frame = local->main_frame;
+ main_local = main_frame->local;
+
+ if (op_ret == 0) {
+ main_local->op_ret = -1;
+ main_local->op_errno = ENOTEMPTY;
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s found on cached subvol %s",
+ local->loc.path, src->name);
+ goto err;
+ } else if (op_errno != ENOENT) {
+ main_local->op_ret = -1;
+ main_local->op_errno = op_errno;
+ goto err;
+ }
+
+ xattrs = dict_new ();
+ if (!xattrs) {
+ gf_log (this->name, GF_LOG_ERROR, "dict_new failed");
+ goto err;
+ }
+
+ ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to set linkto key"
+ " in dict");
+ if (xattrs)
+ dict_unref (xattrs);
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_rmdir_lookup_cbk,
+ src, src->fops->lookup, &local->loc, xattrs);
+ if (xattrs)
+ dict_unref (xattrs);
+
+ return 0;
+err:
+
+ this_call_cnt = dht_frame_return (main_frame);
+ if (is_last_call (this_call_cnt))
+ dht_rmdir_do (main_frame, this);
+
+ DHT_STACK_DESTROY (frame);
+ return 0;
+}
+
+
+int
dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
gf_dirent_t *entries, xlator_t *src)
{
- int ret = 0;
- int build_ret = 0;
- gf_dirent_t *trav = NULL;
+ int ret = 0;
+ int build_ret = 0;
+ gf_dirent_t *trav = NULL;
call_frame_t *lookup_frame = NULL;
dht_local_t *lookup_local = NULL;
- dht_local_t *local = NULL;
- dict_t *xattrs = NULL;
+ dht_local_t *local = NULL;
+ dict_t *xattrs = NULL;
+ dht_conf_t *conf = this->private;
+ xlator_t *subvol = NULL;
local = frame->local;
@@ -4322,7 +4701,8 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
continue;
if (strcmp (trav->d_name, "..") == 0)
continue;
- if (check_is_linkfile (NULL, (&trav->d_stat), trav->dict)) {
+ if (check_is_linkfile (NULL, (&trav->d_stat), trav->dict,
+ conf->link_xattr_name)) {
ret++;
continue;
}
@@ -4340,7 +4720,7 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
return -1;
}
- ret = dict_set_uint32 (xattrs, DHT_LINKFILE_KEY, 256);
+ ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "failed to set linkto key"
" in dict");
@@ -4372,6 +4752,7 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
lookup_frame->local = lookup_local;
lookup_local->main_frame = frame;
+ lookup_local->hashed_subvol = src;
build_ret = dht_build_child_loc (this, &lookup_local->loc,
&local->loc, trav->d_name);
@@ -4390,9 +4771,20 @@ dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
}
UNLOCK (&frame->lock);
- STACK_WIND (lookup_frame, dht_rmdir_lookup_cbk,
- src, src->fops->lookup,
- &lookup_local->loc, xattrs);
+ subvol = dht_linkfile_subvol (this, NULL, &trav->d_stat,
+ trav->dict);
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_INFO,
+ "linkfile not having link subvolume. path=%s",
+ lookup_local->loc.path);
+ STACK_WIND (lookup_frame, dht_rmdir_lookup_cbk,
+ src, src->fops->lookup,
+ &lookup_local->loc, xattrs);
+ } else {
+ STACK_WIND (lookup_frame, dht_rmdir_cached_lookup_cbk,
+ subvol, subvol->fops->lookup,
+ &lookup_local->loc, xattrs);
+ }
ret++;
}
@@ -4458,15 +4850,18 @@ int
dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
{
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
int this_call_cnt = -1;
- call_frame_t *prev = NULL;
- dict_t *dict = NULL;
- int ret = 0;
+ call_frame_t *prev = NULL;
+ dict_t *dict = NULL;
+ int ret = 0;
+ dht_conf_t *conf = this->private;
+ int i = 0;
local = frame->local;
prev = cookie;
+ this_call_cnt = dht_frame_return (frame);
if (op_ret == -1) {
gf_log (this->name, GF_LOG_DEBUG,
"opendir on %s for %s failed (%s)",
@@ -4479,6 +4874,12 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto err;
}
+ if (!is_last_call (this_call_cnt))
+ return 0;
+
+ if (local->op_ret == -1)
+ goto err;
+
dict = dict_new ();
if (!dict) {
local->op_ret = -1;
@@ -4486,16 +4887,19 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto err;
}
- ret = dict_set_uint32 (dict,
- "trusted.glusterfs.dht.linkto", 256);
+ ret = dict_set_uint32 (dict, conf->link_xattr_name, 256);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set 'trusted.glusterfs.dht.linkto' key",
- local->loc.path);
+ "%s: failed to set '%s' key",
+ local->loc.path, conf->link_xattr_name);
- STACK_WIND (frame, dht_rmdir_readdirp_cbk,
- prev->this, prev->this->fops->readdirp,
- local->fd, 4096, 0, dict);
+ local->call_cnt = conf->subvolume_cnt;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND (frame, dht_rmdir_readdirp_cbk,
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->readdirp,
+ local->fd, 4096, 0, dict);
+ }
if (dict)
dict_unref (dict);
@@ -4503,8 +4907,6 @@ dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
err:
- this_call_cnt = dht_frame_return (frame);
-
if (is_last_call (this_call_cnt)) {
dht_rmdir_do (frame, this);
}
@@ -4590,7 +4992,6 @@ dht_entrylk (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
local = dht_local_init (frame, loc, NULL, GF_FOP_ENTRYLK);
if (!local) {
@@ -4770,7 +5171,8 @@ dht_notify (xlator_t *this, int event, void *data, ...)
gf_log (this->name, GF_LOG_WARNING,
"Received CHILD_DOWN. Exiting");
if (conf->defrag) {
- gf_defrag_stop (conf->defrag, NULL);
+ gf_defrag_stop (conf->defrag,
+ GF_DEFRAG_STATUS_FAILED, NULL);
} else {
kill (getpid(), SIGTERM);
}
@@ -4846,7 +5248,8 @@ dht_notify (xlator_t *this, int event, void *data, ...)
if (cmd == GF_DEFRAG_CMD_STATUS)
gf_defrag_status_get (defrag, output);
else if (cmd == GF_DEFRAG_CMD_STOP)
- gf_defrag_stop (defrag, output);
+ gf_defrag_stop (defrag,
+ GF_DEFRAG_STATUS_STOPPED, output);
}
unlock:
UNLOCK (&defrag->lock);
@@ -4871,15 +5274,7 @@ unlock:
or wait for anything else. Just propagate blindly */
if (have_heard_from_all) {
propagate = 1;
- if (conf->defrag) {
- ret = pthread_create (&conf->defrag->th, NULL,
- gf_defrag_start, this);
- if (ret) {
- conf->defrag = NULL;
- GF_FREE (conf->defrag);
- kill (getpid(), SIGTERM);
- }
- }
+
}
@@ -4901,6 +5296,19 @@ unlock:
/* continue to check other events for CHILD_UP */
}
}
+
+ /* rebalance is started with assert_no_child_down. So we do
+ * not need to handle CHILD_DOWN event here.
+ */
+ if (conf->defrag) {
+ ret = gf_thread_create (&conf->defrag->th, NULL,
+ gf_defrag_start, this);
+ if (ret) {
+ conf->defrag = NULL;
+ GF_FREE (conf->defrag);
+ kill (getpid(), SIGTERM);
+ }
+ }
}
ret = 0;
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index be4dea90c..2ece28a61 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -13,6 +13,9 @@
#include "config.h"
#endif
+#include <regex.h>
+#include <signal.h>
+
#include "dht-mem-types.h"
#include "libxlator.h"
#include "syncop.h"
@@ -44,7 +47,7 @@ struct dht_layout {
int gen;
int type;
int ref; /* use with dht_conf_t->layout_lock */
- int search_unhashed;
+ gf_boolean_t search_unhashed;
struct {
int err; /* 0 = normal
-1 = dir exists and no xattr
@@ -53,7 +56,7 @@ struct dht_layout {
uint32_t start;
uint32_t stop;
xlator_t *xlator;
- } list[0];
+ } list[];
};
typedef struct dht_layout dht_layout_t;
@@ -140,7 +143,6 @@ struct dht_local {
struct {
uint32_t hole_cnt;
uint32_t overlaps_cnt;
- uint32_t missing;
uint32_t down;
uint32_t misc;
dht_selfheal_dir_cbk_t dir_cbk;
@@ -161,6 +163,7 @@ struct dht_local {
/* which xattr request? */
char xsel[256];
+ int32_t alloc_len;
char *newpath;
@@ -177,7 +180,11 @@ struct dht_local {
glusterfs_fop_t fop;
+ gf_boolean_t linked;
+ xlator_t *link_subvol;
+
struct dht_rebalance_ rebalance;
+ xlator_t *first_up_subvol;
};
typedef struct dht_local dht_local_t;
@@ -206,15 +213,27 @@ enum gf_defrag_status_t {
GF_DEFRAG_STATUS_STOPPED,
GF_DEFRAG_STATUS_COMPLETE,
GF_DEFRAG_STATUS_FAILED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED,
};
typedef enum gf_defrag_status_t gf_defrag_status_t;
+typedef struct gf_defrag_pattern_list gf_defrag_pattern_list_t;
+
+struct gf_defrag_pattern_list {
+ char path_pattern[256];
+ uint64_t size;
+ gf_defrag_pattern_list_t *next;
+};
struct gf_defrag_info_ {
uint64_t total_files;
uint64_t total_data;
uint64_t num_files_lookedup;
uint64_t total_failures;
+ uint64_t skipped;
gf_lock_t lock;
int cmd;
pthread_t th;
@@ -227,7 +246,7 @@ struct gf_defrag_info_ {
uuid_t node_uuid;
struct timeval start_time;
gf_boolean_t stats;
-
+ gf_defrag_pattern_list_t *defrag_pattern;
};
typedef struct gf_defrag_info_ gf_defrag_info_t;
@@ -243,8 +262,8 @@ struct dht_conf {
gf_boolean_t search_unhashed;
int gen;
dht_du_t *du_stats;
- uint64_t min_free_disk;
- uint32_t min_free_inodes;
+ double min_free_disk;
+ double min_free_inodes;
char disk_unit;
int32_t refresh_interval;
gf_boolean_t unhashed_sticky_bit;
@@ -264,6 +283,7 @@ struct dht_conf {
/* to keep track of nodes which are decomissioned */
xlator_t **decommissioned_bricks;
int decommission_in_progress;
+ int decommission_subvols_cnt;
/* defrag related */
gf_defrag_info_t *defrag;
@@ -271,6 +291,17 @@ struct dht_conf {
/* Request to filter directory entries in readdir request */
gf_boolean_t readdir_optimize;
+
+ /* Support regex-based name reinterpretation. */
+ regex_t rsync_regex;
+ gf_boolean_t rsync_regex_valid;
+ regex_t extra_regex;
+ gf_boolean_t extra_regex_valid;
+
+ /* Support variable xattr names. */
+ char *xattr_name;
+ char *link_xattr_name;
+ char *wild_xattr_name;
};
typedef struct dht_conf dht_conf_t;
@@ -301,13 +332,7 @@ typedef enum {
#define DHT_MIGRATION_IN_PROGRESS 1
#define DHT_MIGRATION_COMPLETED 2
-#define DHT_LINKFILE_KEY "trusted.glusterfs.dht.linkto"
-#define DHT_LINKFILE_MODE (S_ISVTX)
-
-#define check_is_linkfile(i,s,x) ( \
- ((st_mode_from_ia ((s)->ia_prot, (s)->ia_type) & ~S_IFMT) \
- == DHT_LINKFILE_MODE) && \
- dict_get (x, DHT_LINKFILE_KEY))
+#define check_is_linkfile(i,s,x,n) (IS_DHT_LINKFILE_MODE (s) && dict_get (x, n))
#define IS_DHT_MIGRATION_PHASE2(buf) ( \
IA_ISREG ((buf)->ia_type) && \
@@ -326,6 +351,8 @@ typedef enum {
} \
} while (0)
+#define dht_inode_missing(op_errno) (op_errno == ENOENT || op_errno == ESTALE)
+
#define check_is_dir(i,s,x) (IA_ISDIR(s->ia_type))
#define layout_is_sane(layout) ((layout) && (layout->cnt > 0))
@@ -370,6 +397,7 @@ typedef enum {
UNLOCK (&inode->lock); \
} while (0)
+#define is_greater_time(a, an, b, bn) (((a) < (b)) || (((a) == (b)) && ((an) < (bn))))
dht_layout_t *dht_layout_new (xlator_t *this, int cnt);
dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode);
dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol);
@@ -379,7 +407,7 @@ int dht_layout_normalize (xlator_t *this, l
int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
uint32_t *holes_p, uint32_t *overlaps_p,
uint32_t *missing_p, uint32_t *down_p,
- uint32_t *misc_p);
+ uint32_t *misc_p, uint32_t *no_space_p);
int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout,
xlator_t *subvol, loc_t *loc, dict_t *xattr);
@@ -413,12 +441,14 @@ int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt
xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc);
xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode);
xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev);
+xlator_t *dht_subvol_next_available (xlator_t *this, xlator_t *prev);
int dht_subvol_cnt (xlator_t *this, xlator_t *subvol);
-int dht_hash_compute (int type, const char *name, uint32_t *hash_p);
+int dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p);
int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
- xlator_t *tovol, xlator_t *fromvol, loc_t *loc);
+ xlator_t *this, xlator_t *tovol,
+ xlator_t *fromvol, loc_t *loc);
int dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc);
int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc);
int
@@ -436,7 +466,8 @@ dht_layout_sort_volname (dht_layout_t *layout);
int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc);
gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol);
-xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol);
+xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
+ dht_local_t *layout);
int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx);
int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode);
@@ -658,7 +689,16 @@ int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
struct iatt *stbuf, int32_t valid, dict_t *xdata);
int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iatt *stbuf, int32_t valid, dict_t *xdata);
-
+int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t mode, off_t offset, size_t len, dict_t *xdata);
+int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, size_t len, dict_t *xdata);
+int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, off_t len, dict_t *xdata);
+
+int32_t dht_init (xlator_t *this);
+void dht_fini (xlator_t *this);
+int dht_reconfigure (xlator_t *this, dict_t *options);
int32_t dht_notify (xlator_t *this, int32_t event, void *data, ...);
/* definitions for nufa/switch */
@@ -692,7 +732,8 @@ int
gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict);
int
-gf_defrag_stop (gf_defrag_info_t *defrag, dict_t *output);
+gf_defrag_stop (gf_defrag_info_t *defrag, gf_defrag_status_t status,
+ dict_t *output);
void*
gf_defrag_start (void *this);
@@ -712,7 +753,35 @@ dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this,
int
dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat,
int32_t update_ctx);
+void dht_inode_ctx_time_set (inode_t *inode, xlator_t *this, struct iatt *stat);
int dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx);
int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx);
+int
+dht_dir_attr_heal (void *data);
+int
+dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data);
+int
+dht_dir_has_layout (dict_t *xattr, char *name);
+gf_boolean_t
+dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator);
+xlator_t *
+dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout);
+xlator_t *
+dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout);
+int
+dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this);
+
+void
+dht_layout_dump (dht_layout_t *layout, const char *prefix);
+int32_t
+dht_priv_dump (xlator_t *this);
+int32_t
+dht_inodectx_dump (xlator_t *this, inode_t *inode);
+
+int
+dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol);
+
#endif/* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 52ea3a32a..fe3955ecb 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -248,50 +248,167 @@ dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)
return is_subvol_filled;
}
+
+/*Get the best subvolume to create the file in*/
xlator_t *
-dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol)
+dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
+ dht_local_t *local)
{
- int i = 0;
- double max = 0;
- double max_inodes = 0;
xlator_t *avail_subvol = NULL;
dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ loc_t *loc = NULL;
conf = this->private;
-
- LOCK (&conf->subvolume_lock);
+ if (!local)
+ goto out;
+ loc = &local->loc;
+ if (!local->layout) {
+ layout = dht_layout_get (this, loc->parent);
+
+ if (!layout) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "layout missing path=%s parent=%s",
+ loc->path, uuid_utoa (loc->parent->gfid));
+ goto out;
+ }
+ } else {
+ layout = dht_layout_ref (this, local->layout);
+ }
+
+ LOCK (&conf->subvolume_lock);
{
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->disk_unit == 'p') {
- if ((conf->du_stats[i].avail_percent > max)
- && (conf->du_stats[i].avail_inodes > max_inodes)) {
- max = conf->du_stats[i].avail_percent;
- max_inodes = conf->du_stats[i].avail_inodes;
- avail_subvol = conf->subvolumes[i];
- }
- } else {
- if ((conf->du_stats[i].avail_space > max)
- && (conf->du_stats[i].avail_inodes > max_inodes)) {
- max = conf->du_stats[i].avail_space;
- max_inodes = conf->du_stats[i].avail_inodes;
- avail_subvol = conf->subvolumes[i];
- }
+ avail_subvol = dht_subvol_with_free_space_inodes(this, subvol,
+ layout);
+ if(!avail_subvol)
+ {
+ avail_subvol = dht_subvol_maxspace_nonzeroinode(this,
+ subvol,
+ layout);
+ }
- }
- }
}
UNLOCK (&conf->subvolume_lock);
-
+out:
if (!avail_subvol) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no subvolume has enough free space and inodes to create");
+ gf_log (this->name,
+ GF_LOG_DEBUG,
+ "no subvolume has enough free space and/or inodes\
+ to create");
+ avail_subvol = subvol;
}
- if ((max < conf->min_free_disk) && (max_inodes < conf->min_free_inodes))
- avail_subvol = subvol;
+ if (layout)
+ dht_layout_unref (this, layout);
+ return avail_subvol;
+}
+
+static inline
+int32_t dht_subvol_has_err (xlator_t *this, dht_layout_t *layout)
+{
+ int ret = -1;
+ int i = 0;
+
+ if (!this || !layout)
+ goto out;
+
+ /* check if subvol has layout errors, before selecting it */
+ for (i = 0; i < layout->cnt; i++) {
+ if (!strcmp (layout->list[i].xlator->name, this->name) &&
+ (layout->list[i].err != 0)) {
+ ret = -1;
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+/*Get subvolume which has both space and inodes more than the min criteria*/
+xlator_t *
+dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout)
+{
+ int i = 0;
+ double max = 0;
+ double max_inodes = 0;
+ int ignore_subvol = 0;
+
+ xlator_t *avail_subvol = NULL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ for(i=0; i < conf->subvolume_cnt; i++) {
+ /* check if subvol has layout errors, before selecting it */
+ ignore_subvol = dht_subvol_has_err (conf->subvolumes[i],
+ layout);
+ if (ignore_subvol)
+ continue;
+
+ if ((conf->disk_unit == 'p') &&
+ (conf->du_stats[i].avail_percent > conf->min_free_disk) &&
+ (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
+ if ((conf->du_stats[i].avail_inodes > max_inodes) ||
+ (conf->du_stats[i].avail_percent > max)) {
+ max = conf->du_stats[i].avail_percent;
+ max_inodes = conf->du_stats[i].avail_inodes;
+ avail_subvol = conf->subvolumes[i];
+ }
+ }
+
+ if ((conf->disk_unit != 'p') &&
+ (conf->du_stats[i].avail_space > conf->min_free_disk) &&
+ (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
+ if ((conf->du_stats[i].avail_inodes > max_inodes) ||
+ (conf->du_stats[i].avail_space > max)) {
+ max = conf->du_stats[i].avail_space;
+ max_inodes = conf->du_stats[i].avail_inodes;
+ avail_subvol = conf->subvolumes[i];
+ }
+ }
+ }
+
+ return avail_subvol;
+}
- if (!avail_subvol)
- avail_subvol = subvol;
- return avail_subvol;
+/* Get subvol which has atleast one inode and maximum space */
+xlator_t *
+dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout)
+{
+ int i = 0;
+ double max = 0;
+ int ignore_subvol = 0;
+
+ xlator_t *avail_subvol = NULL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ /* check if subvol has layout errors, before selecting it */
+ ignore_subvol = dht_subvol_has_err (conf->subvolumes[i],
+ layout);
+ if (ignore_subvol)
+ continue;
+
+ if (conf->disk_unit == 'p') {
+ if ((conf->du_stats[i].avail_percent > max)
+ && (conf->du_stats[i].avail_inodes > 0 )) {
+ max = conf->du_stats[i].avail_percent;
+ avail_subvol = conf->subvolumes[i];
+ }
+ } else {
+ if ((conf->du_stats[i].avail_space > max)
+ && (conf->du_stats[i].avail_inodes > 0)) {
+ max = conf->du_stats[i].avail_space;
+ avail_subvol = conf->subvolumes[i];
+ }
+ }
+ }
+
+ return avail_subvol;
}
diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c
index 97eb1f05f..656cf23a0 100644
--- a/xlators/cluster/dht/src/dht-hashfn.c
+++ b/xlators/cluster/dht/src/dht-hashfn.c
@@ -44,30 +44,68 @@ dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p)
}
-#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do { \
- rsync_frndly_name = (char *) name; \
- if (name[0] == '.') { \
- char *dot = 0; \
- int namelen = 0; \
- \
- dot = strrchr (name, '.'); \
- if (dot && dot > (name + 1) && *(dot + 1)) { \
- namelen = (dot - name); \
- rsync_frndly_name = alloca (namelen); \
- strncpy (rsync_frndly_name, name + 1, \
- namelen); \
- rsync_frndly_name[namelen - 1] = 0; \
- } \
- } \
- } while (0);
+static inline
+gf_boolean_t
+dht_munge_name (const char *original, char *modified, size_t len, regex_t *re)
+{
+ regmatch_t matches[2];
+ size_t new_len;
+ if (regexec(re,original,2,matches,0) != REG_NOMATCH) {
+ if (matches[1].rm_so != -1) {
+ new_len = matches[1].rm_eo - matches[1].rm_so;
+ /* Equal would fail due to the NUL at the end. */
+ if (new_len < len) {
+ memcpy (modified,original+matches[1].rm_so,
+ new_len);
+ modified[new_len] = '\0';
+ return _gf_true;
+ }
+ }
+ }
+
+ /* This is guaranteed safe because of how the dest was allocated. */
+ strcpy(modified,original);
+ return _gf_false;
+}
int
-dht_hash_compute (int type, const char *name, uint32_t *hash_p)
+dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p)
{
- char *rsync_friendly_name = NULL;
+ char *rsync_friendly_name = NULL;
+ dht_conf_t *priv = this->private;
+ size_t len = 0;
+ gf_boolean_t munged = _gf_false;
+
+ /*
+ * It wouldn't be safe to use alloca in an inline function that doesn't
+ * actually get inlined, and it wouldn't be efficient to do a real
+ * allocation, so we use alloca here (if needed) and pass that to the
+ * inline.
+ */
- MAKE_RSYNC_FRIENDLY_NAME (rsync_friendly_name, name);
+ if (priv->extra_regex_valid) {
+ len = strlen(name) + 1;
+ rsync_friendly_name = alloca(len);
+ munged = dht_munge_name (name, rsync_friendly_name, len,
+ &priv->extra_regex);
+ }
+
+ if (!munged && priv->rsync_regex_valid) {
+ len = strlen(name) + 1;
+ rsync_friendly_name = alloca(len);
+ gf_log (this->name, GF_LOG_TRACE, "trying regex for %s", name);
+ munged = dht_munge_name (name, rsync_friendly_name, len,
+ &priv->rsync_regex);
+ if (munged) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "munged down to %s", rsync_friendly_name);
+ }
+ }
+
+ if (!munged) {
+ rsync_friendly_name = (char *)name;
+ }
return dht_hash_compute_internal (type, rsync_friendly_name, hash_p);
}
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index 055ea7cb0..f1dc5072f 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -18,6 +18,28 @@
#include "xlator.h"
#include "dht-common.h"
+static inline int
+dht_inode_ctx_set1 (xlator_t *this, inode_t *inode, xlator_t *subvol)
+{
+ uint64_t tmp_subvol = 0;
+
+ tmp_subvol = (long)subvol;
+ return inode_ctx_set1 (inode, this, &tmp_subvol);
+}
+
+int
+dht_inode_ctx_get1 (xlator_t *this, inode_t *inode, xlator_t **subvol)
+{
+ int ret = -1;
+ uint64_t tmp_subvol = 0;
+
+ ret = inode_ctx_get1 (inode, this, &tmp_subvol);
+ if (tmp_subvol && subvol)
+ *subvol = (xlator_t *)tmp_subvol;
+
+ return ret;
+}
+
int
dht_frame_return (call_frame_t *frame)
@@ -40,6 +62,43 @@ dht_frame_return (call_frame_t *frame)
}
+static uint64_t
+dht_bits_for (uint64_t num)
+{
+ uint64_t bits = 0, ctrl = 1;
+
+ while (ctrl < num) {
+ ctrl *= 2;
+ bits ++;
+ }
+
+ return bits;
+}
+
+/*
+ * A slightly "updated" version of the algorithm described in the commit log
+ * is used here.
+ *
+ * The only enhancement is that:
+ *
+ * - The number of bits used by the backend filesystem for HUGE d_off which
+ * is described as 63, and
+ * - The number of bits used by the d_off presented by the transformation
+ * upwards which is described as 64, are both made "configurable."
+ */
+
+
+#define BACKEND_D_OFF_BITS 63
+#define PRESENT_D_OFF_BITS 63
+
+#define ONE 1ULL
+#define MASK (~0ULL)
+#define PRESENT_MASK (MASK >> (64 - PRESENT_D_OFF_BITS))
+#define BACKEND_MASK (MASK >> (64 - BACKEND_D_OFF_BITS))
+
+#define TOP_BIT (ONE << (PRESENT_D_OFF_BITS - 1))
+#define SHIFT_BITS (max (0, (BACKEND_D_OFF_BITS - PRESENT_D_OFF_BITS + 1)))
+
int
dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
{
@@ -47,6 +106,9 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
int cnt = 0;
int max = 0;
uint64_t y = 0;
+ uint64_t hi_mask = 0;
+ uint64_t off_mask = 0;
+ int max_bits = 0;
if (x == ((uint64_t) -1)) {
y = (uint64_t) -1;
@@ -60,7 +122,23 @@ dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
max = conf->subvolume_cnt;
cnt = dht_subvol_cnt (this, subvol);
- y = ((x * max) + cnt);
+ if (max == 1) {
+ y = x;
+ goto out;
+ }
+
+ max_bits = dht_bits_for (max);
+
+ hi_mask = ~(PRESENT_MASK >> (max_bits + 1));
+
+ if (x & hi_mask) {
+ /* HUGE d_off */
+ off_mask = MASK << max_bits;
+ y = TOP_BIT | ((x >> SHIFT_BITS) & off_mask) | cnt;
+ } else {
+ /* small d_off */
+ y = ((x * max) + cnt);
+ }
out:
if (y_p)
@@ -135,16 +213,38 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p,
int max = 0;
uint64_t x = 0;
xlator_t *subvol = 0;
+ int max_bits = 0;
+ uint64_t off_mask = 0;
+ uint64_t host_mask = 0;
if (!this->private)
- goto out;
+ return -1;
conf = this->private;
max = conf->subvolume_cnt;
- cnt = y % max;
- x = y / max;
+ if (max == 1) {
+ x = y;
+ cnt = 0;
+ goto out;
+ }
+
+ if (y & TOP_BIT) {
+ /* HUGE d_off */
+ max_bits = dht_bits_for (max);
+ off_mask = (MASK << max_bits);
+ host_mask = ~(off_mask);
+
+ x = ((y & ~TOP_BIT) & off_mask) << SHIFT_BITS;
+
+ cnt = y & host_mask;
+ } else {
+ /* small d_off */
+ cnt = y % max;
+ x = y / max;
+ }
+out:
subvol = conf->subvolumes[cnt];
if (subvol_p)
@@ -153,7 +253,6 @@ dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p,
if (x_p)
*x_p = x;
-out:
return 0;
}
@@ -263,20 +362,6 @@ out:
return local;
}
-
-char *
-basestr (const char *str)
-{
- char *basestr = NULL;
-
- basestr = strrchr (str, '/');
- if (basestr)
- basestr ++;
-
- return basestr;
-}
-
-
xlator_t *
dht_first_up_subvol (xlator_t *this)
{
@@ -428,7 +513,36 @@ out:
return next;
}
+/* This func wraps around, if prev is actually the last subvol.
+ */
+xlator_t *
+dht_subvol_next_available (xlator_t *this, xlator_t *prev)
+{
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *next = NULL;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == prev) {
+ /* if prev is last in conf->subvolumes, then wrap
+ * around.
+ */
+ if ((i + 1) < conf->subvolume_cnt) {
+ next = conf->subvolumes[i + 1];
+ } else {
+ next = conf->subvolumes[0];
+ }
+ break;
+ }
+ }
+
+out:
+ return next;
+}
int
dht_subvol_cnt (xlator_t *this, xlator_t *subvol)
{
@@ -620,36 +734,61 @@ dht_migration_complete_check_task (void *data)
call_frame_t *frame = NULL;
loc_t tmp_loc = {0,};
char *path = NULL;
+ dht_conf_t *conf = NULL;
+ inode_t *inode = NULL;
+ fd_t *iter_fd = NULL;
+ uint64_t tmp_subvol = 0;
+ int open_failed = 0;
this = THIS;
frame = data;
local = frame->local;
+ conf = this->private;
src_node = local->cached_subvol;
- /* getxattr on cached_subvol for 'linkto' value */
- if (!local->loc.inode)
+ if (!local->loc.inode && !local->fd) {
+ local->op_errno = EINVAL;
+ goto out;
+ }
+
+ inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
+ /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
+ * as root:root. If a fd is already open, access check wont be done*/
+
+ if (!local->loc.inode) {
ret = syncop_fgetxattr (src_node, local->fd, &dict,
- DHT_LINKFILE_KEY);
- else
+ conf->link_xattr_name);
+ } else {
+ SYNCTASK_SETID (0, 0);
ret = syncop_getxattr (src_node, &local->loc, &dict,
- DHT_LINKFILE_KEY);
+ conf->link_xattr_name);
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+ }
if (!ret)
dst_node = dht_linkfile_subvol (this, NULL, NULL, dict);
if (ret) {
- if ((errno != ENOENT) || (!local->loc.inode)) {
+ if (!dht_inode_missing(-ret) || (!local->loc.inode)) {
+ local->op_errno = -ret;
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to get the 'linkto' xattr %s",
- local->loc.path, strerror (errno));
+ local->loc.path, strerror (-ret));
+ ret = -1;
goto out;
}
+
/* Need to do lookup on hashed subvol, then get the file */
ret = syncop_lookup (this, &local->loc, NULL, &stbuf, NULL,
NULL);
- if (ret)
+ if (ret) {
+ local->op_errno = -ret;
+ ret = -1;
goto out;
+ }
+
dst_node = dht_subvol_get_cached (this, local->loc.inode);
}
@@ -658,17 +797,21 @@ dht_migration_complete_check_task (void *data)
"%s: failed to get the destination node",
local->loc.path);
ret = -1;
+ local->op_errno = EINVAL;
goto out;
}
/* lookup on dst */
if (local->loc.inode) {
- ret = syncop_lookup (dst_node, &local->loc, NULL, &stbuf, NULL, NULL);
+ ret = syncop_lookup (dst_node, &local->loc, NULL, &stbuf, NULL,
+ NULL);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to lookup the file on %s",
local->loc.path, dst_node->name);
+ local->op_errno = -ret;
+ ret = -1;
goto out;
}
@@ -677,6 +820,7 @@ dht_migration_complete_check_task (void *data)
"%s: gfid different on the target file on %s",
local->loc.path, dst_node->name);
ret = -1;
+ local->op_errno = EIO;
goto out;
}
}
@@ -684,15 +828,13 @@ dht_migration_complete_check_task (void *data)
/* update inode ctx (the layout) */
dht_layout_unref (this, local->layout);
- if (!local->loc.inode)
- ret = dht_layout_preset (this, dst_node, local->fd->inode);
- else
- ret = dht_layout_preset (this, dst_node, local->loc.inode);
+ ret = dht_layout_preset (this, dst_node, inode);
if (ret != 0) {
gf_log (this->name, GF_LOG_DEBUG,
"%s: could not set preset layout for subvol %s",
local->loc.path, dst_node->name);
ret = -1;
+ local->op_errno = EINVAL;
goto out;
}
@@ -702,59 +844,69 @@ dht_migration_complete_check_task (void *data)
"%s: no pre-set layout for subvolume %s",
local->loc.path, dst_node ? dst_node->name : "<nil>");
ret = -1;
+ local->op_errno = EINVAL;
goto out;
}
- if (!local->loc.inode)
- ret = dht_layout_set (this, local->fd->inode, layout);
- else
- ret = dht_layout_set (this, local->loc.inode, layout);
+ ret = dht_layout_set (this, inode, layout);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to set the new layout",
local->loc.path);
+ local->op_errno = EINVAL;
goto out;
}
local->cached_subvol = dst_node;
ret = 0;
- if (!local->fd)
+ /* once we detect the migration complete, the inode-ctx2 is no more
+ required.. delete the ctx and also, it means, open() already
+ done on all the fd of inode */
+ ret = inode_ctx_reset1 (inode, this, &tmp_subvol);
+ if (tmp_subvol)
goto out;
- /* once we detect the migration complete, the fd-ctx is no more
- required.. delete the ctx, and do one extra 'fd_unref' for open fd */
- ret = fd_ctx_del (local->fd, this, NULL);
- if (!ret) {
- fd_unref (local->fd);
- ret = 0;
+ if (list_empty (&inode->fd_list))
goto out;
- }
- /* if 'local->fd' (ie, fd based operation), send a 'open()' on
- destination if not already done */
- if (local->loc.inode) {
- ret = syncop_open (dst_node, &local->loc,
- local->fd->flags, local->fd);
- } else {
- tmp_loc.inode = local->fd->inode;
- inode_path (local->fd->inode, NULL, &path);
- if (path)
- tmp_loc.path = path;
+ /* perform open as root:root. There is window between linkfile
+ * creation(root:root) and setattr with the correct uid/gid
+ */
+ SYNCTASK_SETID(0, 0);
+
+ /* perform 'open()' on all the fd's present on the inode */
+ tmp_loc.inode = inode;
+ inode_path (inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ if (fd_is_anonymous (iter_fd))
+ continue;
+
+ /* flags for open are stripped down to allow following the
+ * new location of the file, otherwise we can get EEXIST or
+ * truncate the file again as rebalance is moving the data */
ret = syncop_open (dst_node, &tmp_loc,
- local->fd->flags, local->fd);
- GF_FREE (path);
-
+ (iter_fd->flags &
+ ~(O_CREAT | O_EXCL | O_TRUNC)), iter_fd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to open "
+ "the fd (%p, flags=0%o) on file %s @ %s",
+ iter_fd, iter_fd->flags, path, dst_node->name);
+ open_failed = 1;
+ local->op_errno = -ret;
+ ret = -1;
+ }
}
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to send open() on target file at %s",
- local->loc.path, dst_node->name);
+ GF_FREE (path);
+
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+
+ if (open_failed) {
+ ret = -1;
goto out;
}
-
- /* need this unref for the fd on src_node */
- fd_unref (local->fd);
ret = 0;
out:
@@ -798,26 +950,41 @@ dht_rebalance_inprogress_task (void *data)
char *path = NULL;
struct iatt stbuf = {0,};
loc_t tmp_loc = {0,};
+ dht_conf_t *conf = NULL;
+ inode_t *inode = NULL;
+ fd_t *iter_fd = NULL;
+ int open_failed = 0;
this = THIS;
frame = data;
local = frame->local;
+ conf = this->private;
src_node = local->cached_subvol;
- /* getxattr on cached_subvol for 'linkto' value */
- if (local->loc.inode)
+ if (!local->loc.inode && !local->fd)
+ goto out;
+
+ inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
+ /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
+ * as root:root. If a fd is already open, access check wont be done*/
+ if (local->loc.inode) {
+ SYNCTASK_SETID (0, 0);
ret = syncop_getxattr (src_node, &local->loc, &dict,
- DHT_LINKFILE_KEY);
- else
+ conf->link_xattr_name);
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+ } else {
ret = syncop_fgetxattr (src_node, local->fd, &dict,
- DHT_LINKFILE_KEY);
+ conf->link_xattr_name);
+ }
- if (ret) {
+ if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to get the 'linkto' xattr %s",
- local->loc.path, strerror (errno));
- goto out;
+ local->loc.path, strerror (-ret));
+ ret = -1;
+ goto out;
}
dst_node = dht_linkfile_subvol (this, NULL, NULL, dict);
@@ -839,6 +1006,7 @@ dht_rebalance_inprogress_task (void *data)
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to lookup the file on %s",
local->loc.path, dst_node->name);
+ ret = -1;
goto out;
}
@@ -853,33 +1021,51 @@ dht_rebalance_inprogress_task (void *data)
ret = 0;
- if (!local->fd)
- goto out;
+ if (list_empty (&inode->fd_list))
+ goto done;
- if (local->loc.inode) {
- ret = syncop_open (dst_node, &local->loc,
- local->fd->flags, local->fd);
- } else {
- tmp_loc.inode = local->fd->inode;
- inode_path (local->fd->inode, NULL, &path);
- if (path)
- tmp_loc.path = path;
+ /* perform open as root:root. There is window between linkfile
+ * creation(root:root) and setattr with the correct uid/gid
+ */
+ SYNCTASK_SETID (0, 0);
+
+ tmp_loc.inode = inode;
+ inode_path (inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ if (fd_is_anonymous (iter_fd))
+ continue;
+
+ /* flags for open are stripped down to allow following the
+ * new location of the file, otherwise we can get EEXIST or
+ * truncate the file again as rebalance is moving the data */
ret = syncop_open (dst_node, &tmp_loc,
- local->fd->flags, local->fd);
- GF_FREE (path);
+ (iter_fd->flags &
+ ~(O_CREAT | O_EXCL | O_TRUNC)), iter_fd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to send open "
+ "the fd (%p, flags=0%o) on file %s @ %s",
+ iter_fd, iter_fd->flags, path, dst_node->name);
+ ret = -1;
+ open_failed = 1;
+ }
}
+ GF_FREE (path);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to send open() on target file at %s",
- local->loc.path, dst_node->name);
+ SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+
+ if (open_failed) {
+ ret = -1;
goto out;
}
- ret = fd_ctx_set (local->fd, this, (uint64_t)(long)dst_node);
+done:
+ ret = dht_inode_ctx_set1 (this, inode, dst_node);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to set fd-ctx target file at %s",
+ "%s: failed to set inode-ctx target file at %s",
local->loc.path, dst_node->name);
goto out;
}
@@ -923,6 +1109,34 @@ dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this,
return ret;
}
+
+void
+dht_inode_ctx_time_set (inode_t *inode, xlator_t *this, struct iatt *stat)
+{
+ dht_inode_ctx_t *ctx = NULL;
+ dht_stat_time_t *time = 0;
+ int ret = -1;
+
+ ret = dht_inode_ctx_get (inode, this, &ctx);
+
+ if (ret)
+ return;
+
+ time = &ctx->time;
+
+ time->mtime = stat->ia_mtime;
+ time->mtime_nsec = stat->ia_mtime_nsec;
+
+ time->ctime = stat->ia_ctime;
+ time->ctime_nsec = stat->ia_ctime_nsec;
+
+ time->atime = stat->ia_atime;
+ time->atime_nsec = stat->ia_atime_nsec;
+
+ return;
+}
+
+
int
dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat,
int32_t post)
@@ -931,6 +1145,9 @@ dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat,
dht_stat_time_t *time = 0;
int ret = -1;
+ GF_VALIDATE_OR_GOTO (this->name, stat, out);
+ GF_VALIDATE_OR_GOTO (this->name, inode, out);
+
ret = dht_inode_ctx_get (inode, this, &ctx);
if (ret) {
@@ -948,6 +1165,8 @@ dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat,
DHT_UPDATE_TIME(time->atime, time->atime_nsec,
stat->ia_atime, stat->ia_atime_nsec, inode, post);
+ ret = dht_inode_ctx_set (inode, this, ctx);
+out:
return 0;
}
diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c
index 7e29a7a8c..e8a9a7196 100644
--- a/xlators/cluster/dht/src/dht-inode-read.c
+++ b/xlators/cluster/dht/src/dht-inode-read.c
@@ -35,7 +35,7 @@ dht_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
prev = cookie;
local->op_errno = op_errno;
- if ((op_ret == -1) && (op_errno != ENOENT)) {
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
prev->this->name, strerror (op_errno));
@@ -130,10 +130,11 @@ int
dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata)
{
- uint64_t tmp_subvol = 0;
+ xlator_t *subvol = 0;
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ inode_t *inode = NULL;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -143,7 +144,7 @@ dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
prev = cookie;
- if ((op_ret == -1) && (op_errno != ENOENT)) {
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
local->op_errno = op_errno;
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
@@ -154,21 +155,23 @@ dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (local->call_cnt != 1)
goto out;
+ local->op_errno = op_errno;
/* Check if the rebalance phase2 is true */
if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) {
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (ret) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+ ret = dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
/* Phase 2 of migration */
local->rebalance.target_op_fn = dht_attr2;
ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
} else {
/* value is already set in fd_ctx, that means no need
to check for whether its complete or not. */
dht_attr2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
}
out:
@@ -381,6 +384,8 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
dht_local_t *local = NULL;
int ret = 0;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
if (!local) {
@@ -393,22 +398,24 @@ dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (local->call_cnt != 1)
goto out;
- if ((op_ret == -1) && (op_errno != ENOENT))
+ if ((op_ret == -1) && !dht_inode_missing(op_errno))
goto out;
+ local->op_errno = op_errno;
if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) {
/* File would be migrated to other node */
- ret = fd_ctx_get (local->fd, this, NULL);
- if (ret) {
+ ret = dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
local->rebalance.target_op_fn = dht_readv2;
ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
} else {
/* value is already set in fd_ctx, that means no need
to check for whether its complete or not. */
dht_readv2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
}
out:
@@ -498,14 +505,35 @@ dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
int ret = -1;
dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ call_frame_t *prev = NULL;
local = frame->local;
+ prev = cookie;
+ if (!prev || !prev->this)
+ goto out;
if (local->call_cnt != 1)
goto out;
+ if ((op_ret == -1) && (op_errno == ENOTCONN) &&
+ IA_ISDIR(local->loc.inode->ia_type)) {
+
+ subvol = dht_subvol_next_available (this, prev->this);
+ if (!subvol)
+ goto out;
- if ((op_ret == -1) && (op_errno == ENOENT)) {
+ /* check if we are done with visiting every node */
+ if (subvol == local->cached_subvol) {
+ goto out;
+ }
+
+ STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access,
+ &local->loc, local->rebalance.flags, NULL);
+ return 0;
+ }
+ if ((op_ret == -1) && dht_inode_missing(op_errno)) {
/* File would be migrated to other node */
+ local->op_errno = op_errno;
local->rebalance.target_op_fn = dht_access2;
ret = dht_rebalance_complete_check (frame->this, frame);
if (!ret)
@@ -593,8 +621,9 @@ int
dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int ret = -1;
+ dht_local_t *local = NULL;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
@@ -604,8 +633,8 @@ dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
/* If context is set, then send flush() it to the destination */
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (subvol) {
dht_flush2 (this, frame, 0);
return 0;
}
@@ -621,14 +650,10 @@ dht_flush2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -690,12 +715,14 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ inode_t *inode = NULL;
+ xlator_t *subvol = 0;
local = frame->local;
prev = cookie;
local->op_errno = op_errno;
- if (op_ret == -1) {
+ if (op_ret == -1 && !dht_inode_missing(op_errno)) {
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
prev->this->name, strerror (op_errno));
@@ -710,8 +737,9 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
goto out;
}
- ret = fd_ctx_get (local->fd, this, NULL);
- if (ret) {
+ local->op_errno = op_errno;
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (!subvol) {
local->rebalance.target_op_fn = dht_fsync2;
/* Check if the rebalance phase1 is true */
@@ -726,11 +754,12 @@ dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
if (IS_DHT_MIGRATION_PHASE2 (postbuf)) {
ret = dht_rebalance_complete_check (this, frame);
}
+ if (!ret)
+ return 0;
} else {
dht_fsync2 (this, frame, 0);
- }
- if (!ret)
return 0;
+ }
out:
DHT_STRIP_PHASE1_FLAGS (postbuf);
@@ -746,15 +775,10 @@ dht_fsync2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
-
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c
index d4a3ecc39..576f007e5 100644
--- a/xlators/cluster/dht/src/dht-inode-write.c
+++ b/xlators/cluster/dht/src/dht-inode-write.c
@@ -19,6 +19,9 @@
int dht_writev2 (xlator_t *this, call_frame_t *frame, int ret);
int dht_truncate2 (xlator_t *this, call_frame_t *frame, int ret);
int dht_setattr2 (xlator_t *this, call_frame_t *frame, int ret);
+int dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret);
+int dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret);
+int dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret);
int
dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -27,8 +30,9 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
dht_local_t *local = NULL;
int ret = -1;
+ xlator_t *subvol = NULL;
- if (op_ret == -1) {
+ if (op_ret == -1 && !dht_inode_missing(op_errno)) {
goto out;
}
@@ -50,6 +54,7 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->rebalance.target_op_fn = dht_writev2;
+ local->op_errno = op_errno;
/* Phase 2 of migration */
if (IS_DHT_MIGRATION_PHASE2 (postbuf)) {
ret = dht_rebalance_complete_check (this, frame);
@@ -62,8 +67,8 @@ dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ ret = dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+ if (subvol) {
dht_writev2 (this, frame, 0);
return 0;
}
@@ -87,14 +92,10 @@ dht_writev2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
local = frame->local;
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -169,6 +170,8 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_local_t *local = NULL;
call_frame_t *prev = NULL;
int ret = -1;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
GF_VALIDATE_OR_GOTO ("dht", frame, err);
GF_VALIDATE_OR_GOTO ("dht", this, out);
@@ -178,7 +181,7 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = frame->local;
prev = cookie;
- if ((op_ret == -1) && (op_errno != ENOENT)) {
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
local->op_errno = op_errno;
local->op_ret = -1;
gf_log (this->name, GF_LOG_DEBUG,
@@ -198,6 +201,7 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->rebalance.target_op_fn = dht_truncate2;
+ local->op_errno = op_errno;
/* Phase 2 of migration */
if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
ret = dht_rebalance_complete_check (this, frame);
@@ -209,8 +213,9 @@ dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
- ret = fd_ctx_get (local->fd, this, NULL);
- if (!ret) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+ dht_inode_ctx_get1 (this, inode, &subvol);
+ if (subvol) {
dht_truncate2 (this, frame, 0);
return 0;
}
@@ -234,16 +239,13 @@ dht_truncate2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
+ inode_t *inode = NULL;
local = frame->local;
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ inode = local->fd ? local->fd->inode : local->loc.inode;
+ dht_inode_ctx_get1 (this, inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -346,6 +348,407 @@ err:
return 0;
}
+
+int
+dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ int ret = -1;
+ xlator_t *subvol = NULL;
+
+ GF_VALIDATE_OR_GOTO ("dht", frame, err);
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+ dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+ }
+ goto out;
+ }
+ local->rebalance.target_op_fn = dht_fallocate2;
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+ ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+ dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+ dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+ if (subvol) {
+ dht_fallocate2 (this, frame, 0);
+ return 0;
+ }
+ ret = dht_rebalance_in_progress_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS (postbuf);
+ DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STACK_UNWIND (fallocate, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+int
+dht_fallocate2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+
+ if (!subvol)
+ subvol = local->cached_subvol;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND(frame, dht_fallocate_cbk, subvol, subvol->fops->fallocate,
+ local->fd, local->rebalance.flags, local->rebalance.offset,
+ local->rebalance.size, NULL);
+
+ return 0;
+}
+
+int
+dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ local = dht_local_init (frame, NULL, fd, GF_FOP_FALLOCATE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.flags = mode;
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_fallocate_cbk,
+ subvol, subvol->fops->fallocate,
+ fd, mode, offset, len, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+int
+dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ int ret = -1;
+ xlator_t *subvol = NULL;
+
+ GF_VALIDATE_OR_GOTO ("dht", frame, err);
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+ dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+ }
+ goto out;
+ }
+ local->rebalance.target_op_fn = dht_discard2;
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+ ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+ dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+ dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+ if (subvol) {
+ dht_discard2 (this, frame, 0);
+ return 0;
+ }
+ ret = dht_rebalance_in_progress_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS (postbuf);
+ DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STACK_UNWIND (discard, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+int
+dht_discard2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ dht_inode_ctx_get1 (this, local->fd->inode, &subvol);
+
+ if (!subvol)
+ subvol = local->cached_subvol;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND(frame, dht_discard_cbk, subvol, subvol->fops->discard,
+ local->fd, local->rebalance.offset, local->rebalance.size,
+ NULL);
+
+ return 0;
+}
+
+int
+dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ local = dht_local_init (frame, NULL, fd, GF_FOP_DISCARD);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_discard_cbk, subvol, subvol->fops->discard,
+ fd, offset, len, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("dht", frame, err);
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "subvolume %s returned -1 (%s)",
+ prev->this->name, strerror (op_errno));
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
+ dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
+ }
+ goto out;
+ }
+ local->rebalance.target_op_fn = dht_zerofill2;
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+ ret = dht_rebalance_complete_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+ dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
+ dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+ ret = fd_ctx_get (local->fd, this, NULL);
+ if (!ret) {
+ dht_zerofill2 (this, frame, 0);
+ return 0;
+ }
+ ret = dht_rebalance_in_progress_check (this, frame);
+ if (!ret)
+ return 0;
+ }
+
+out:
+ DHT_STRIP_PHASE1_FLAGS (postbuf);
+ DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+err:
+ return 0;
+}
+
+int
+dht_zerofill2(xlator_t *this, call_frame_t *frame, int op_ret)
+{
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ uint64_t tmp_subvol = 0;
+ int ret = -1;
+
+ local = frame->local;
+
+ if (local->fd)
+ ret = fd_ctx_get (local->fd, this, &tmp_subvol);
+ if (!ret)
+ subvol = (xlator_t *)(long)tmp_subvol;
+
+ if (!subvol)
+ subvol = local->cached_subvol;
+
+ local->call_cnt = 2; /* This is the second attempt */
+
+ STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill,
+ local->fd, local->rebalance.offset, local->rebalance.size,
+ NULL);
+
+ return 0;
+}
+
+int
+dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND (frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill,
+ fd, offset, len, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+
/* handle cases of migration here for 'setattr()' calls */
int
dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -360,7 +763,7 @@ dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
prev = cookie;
local->op_errno = op_errno;
- if ((op_ret == -1) && (op_errno != ENOENT)) {
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
gf_log (this->name, GF_LOG_DEBUG,
"subvolume %s returned -1 (%s)",
prev->this->name, strerror (op_errno));
@@ -397,15 +800,13 @@ dht_setattr2 (xlator_t *this, call_frame_t *frame, int op_ret)
{
dht_local_t *local = NULL;
xlator_t *subvol = NULL;
- uint64_t tmp_subvol = 0;
- int ret = -1;
+ inode_t *inode = NULL;
local = frame->local;
- if (local->fd)
- ret = fd_ctx_get (local->fd, this, &tmp_subvol);
- if (!ret)
- subvol = (xlator_t *)(long)tmp_subvol;
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+ dht_inode_ctx_get1 (this, inode, &subvol);
if (!subvol)
subvol = local->cached_subvol;
@@ -461,9 +862,13 @@ unlock:
UNLOCK (&frame->lock);
this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
+ if (is_last_call (this_call_cnt)) {
+ if (local->op_ret == 0)
+ dht_inode_ctx_time_set (local->loc.inode, this,
+ &local->stbuf);
DHT_STACK_UNWIND (setattr, frame, local->op_ret, local->op_errno,
&local->prebuf, &local->stbuf, xdata);
+ }
return 0;
}
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
index 198076077..e1a37b77c 100644
--- a/xlators/cluster/dht/src/dht-layout.c
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -25,14 +25,12 @@
#define layout_size(cnt) (layout_base_size + (cnt * layout_entry_size))
-
dht_layout_t *
dht_layout_new (xlator_t *this, int cnt)
{
dht_layout_t *layout = NULL;
dht_conf_t *conf = NULL;
-
conf = this->private;
layout = GF_CALLOC (1, layout_size (cnt),
@@ -50,6 +48,7 @@ dht_layout_new (xlator_t *this, int cnt)
}
layout->ref = 1;
+
out:
return layout;
}
@@ -114,7 +113,7 @@ dht_layout_unref (xlator_t *this, dht_layout_t *layout)
dht_conf_t *conf = NULL;
int ref = 0;
- if (layout->preset || !this->private)
+ if (!layout || layout->preset || !this->private)
return;
conf = this->private;
@@ -158,7 +157,7 @@ dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name)
int ret = 0;
- ret = dht_hash_compute (layout->type, name, &hash);
+ ret = dht_hash_compute (this, layout->type, name, &hash);
if (ret != 0) {
gf_log (this->name, GF_LOG_WARNING,
"hash computation failed for type=%d name=%s",
@@ -335,11 +334,12 @@ int
dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
int op_ret, int op_errno, dict_t *xattr)
{
- int i = 0;
- int ret = -1;
- int err = -1;
- void *disk_layout_raw = NULL;
- int disk_layout_len = 0;
+ int i = 0;
+ int ret = -1;
+ int err = -1;
+ void *disk_layout_raw = NULL;
+ int disk_layout_len = 0;
+ dht_conf_t *conf = this->private;
if (op_ret != 0) {
err = op_errno;
@@ -360,12 +360,12 @@ dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
if (xattr) {
/* during lookup and not mkdir */
- ret = dict_get_ptr_and_len (xattr, "trusted.glusterfs.dht",
+ ret = dict_get_ptr_and_len (xattr, conf->xattr_name,
&disk_layout_raw, &disk_layout_len);
}
if (ret != 0) {
- layout->list[i].err = -1;
+ layout->list[i].err = 0;
gf_log (this->name, GF_LOG_TRACE,
"missing disk layout on %s. err = %d",
subvol->name, err);
@@ -412,6 +412,22 @@ dht_layout_entry_swap (dht_layout_t *layout, int i, int j)
layout->list[j].err = err_swap;
}
+void
+dht_layout_range_swap (dht_layout_t *layout, int i, int j)
+{
+ uint32_t start_swap = 0;
+ uint32_t stop_swap = 0;
+
+ start_swap = layout->list[i].start;
+ stop_swap = layout->list[i].stop;
+
+ layout->list[i].start = layout->list[j].start;
+ layout->list[i].stop = layout->list[j].stop;
+
+ layout->list[j].start = start_swap;
+ layout->list[j].stop = stop_swap;
+}
+
int64_t
dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j)
{
@@ -419,17 +435,39 @@ dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j)
layout->list[j].xlator->name));
}
+
+gf_boolean_t
+dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator)
+{
+ int i = 0;
+
+ for (i = 0; i < layout->cnt; i++) {
+ /* Check if xlator is already part of layout, and layout is
+ * non-zero. */
+ if (!strcmp (layout->list[i].xlator->name, xlator->name)) {
+ if (layout->list[i].start != layout->list[i].stop)
+ return _gf_true;
+ break;
+ }
+ }
+ return _gf_false;
+}
+
int64_t
dht_layout_entry_cmp (dht_layout_t *layout, int i, int j)
{
int64_t diff = 0;
- if (layout->list[i].err || layout->list[j].err)
- diff = layout->list[i].err - layout->list[j].err;
- else
- diff = (int64_t) layout->list[i].start
- - (int64_t) layout->list[j].start;
+ /* swap zero'ed out layouts to front, if needed */
+ if (!layout->list[j].start && !layout->list[j].stop) {
+ diff = (int64_t) layout->list[i].stop
+ - (int64_t) layout->list[j].stop;
+ goto out;
+ }
+ diff = (int64_t) layout->list[i].start
+ - (int64_t) layout->list[j].start;
+out:
return diff;
}
@@ -478,7 +516,8 @@ dht_layout_sort_volname (dht_layout_t *layout)
int
dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
uint32_t *holes_p, uint32_t *overlaps_p,
- uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p)
+ uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p,
+ uint32_t *no_space_p)
{
uint32_t overlaps = 0;
uint32_t missing = 0;
@@ -491,30 +530,51 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
uint32_t prev_stop = 0;
uint32_t last_stop = 0;
char is_virgin = 1;
-
- /* TODO: explain what is happening */
-
+ uint32_t no_space = 0;
+
+ /* This funtion scans through the layout spread of a directory to
+ check if there are any anomalies. Prior to calling this function
+ the layout entries should be sorted in the ascending order.
+
+ If the layout entry has err != 0
+ then increment the corresponding anomaly.
+ else
+ if (start of the current layout entry > stop + 1 of previous
+ non erroneous layout entry)
+ then it indicates a hole in the layout
+ if (start of the current layout entry < stop + 1 of previous
+ non erroneous layout entry)
+ then it indicates an overlap in the layout
+ */
last_stop = layout->list[0].start - 1;
prev_stop = last_stop;
for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].err) {
- switch (layout->list[i].err) {
- case -1:
- case ENOENT:
- missing++;
- break;
- case ENOTCONN:
- down++;
- break;
- case ENOSPC:
- down++;
- break;
- default:
- misc++;
+ switch (layout->list[i].err) {
+ case -1:
+ case ENOENT:
+ case ESTALE:
+ missing++;
+ continue;
+ case ENOTCONN:
+ down++;
+ continue;
+ case ENOSPC:
+ no_space++;
+ continue;
+ case 0:
+ /* if err == 0 and start == stop, then it is a non misc++;
+ * participating subvolume(spread-cnt). Then, do not
+ * check for anomalies. If start != stop, then treat it
+ * as misc err */
+ if (layout->list[i].start == layout->list[i].stop) {
+ continue;
}
+ break;
+ default:
+ misc++;
continue;
- }
+ }
is_virgin = 0;
@@ -547,6 +607,9 @@ dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
if (misc_p)
*misc_p = misc;
+ if (no_space_p)
+ *no_space_p = no_space;
+
return ret;
}
@@ -571,7 +634,7 @@ dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout)
ret = dht_layout_anomalies (this, loc, layout,
&holes, &overlaps,
- &missing, &down, &misc);
+ &missing, &down, &misc, NULL);
if (ret == -1) {
gf_log (this->name, GF_LOG_WARNING,
"error while finding anomalies in %s -- not good news",
@@ -615,21 +678,30 @@ out:
return ret;
}
+int
+dht_dir_has_layout (dict_t *xattr, char *name)
+{
+
+ void *disk_layout_raw = NULL;
+
+ return dict_get_ptr (xattr, name, &disk_layout_raw);
+}
int
dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
loc_t *loc, dict_t *xattr)
{
- int idx = 0;
- int pos = -1;
- int ret = 0;
- int err = 0;
- int dict_ret = 0;
- int32_t disk_layout[4];
- void *disk_layout_raw = NULL;
- int32_t count = -1;
- uint32_t start_off = -1;
- uint32_t stop_off = -1;
+ int idx = 0;
+ int pos = -1;
+ int ret = 0;
+ int err = 0;
+ int dict_ret = 0;
+ int32_t disk_layout[4];
+ void *disk_layout_raw = NULL;
+ int32_t count = -1;
+ uint32_t start_off = -1;
+ uint32_t stop_off = -1;
+ dht_conf_t *conf = this->private;
for (idx = 0; idx < layout->cnt; idx++) {
@@ -659,11 +731,11 @@ dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
goto out;
}
- dict_ret = dict_get_ptr (xattr, "trusted.glusterfs.dht",
+ dict_ret = dict_get_ptr (xattr, conf->xattr_name,
&disk_layout_raw);
if (dict_ret < 0) {
- if (err == 0) {
+ if (err == 0 && layout->list[pos].stop) {
gf_log (this->name, GF_LOG_INFO,
"%s - disk layout missing", loc->path);
ret = -1;
diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c
index 803f344af..dbc9d0b3c 100644
--- a/xlators/cluster/dht/src/dht-linkfile.c
+++ b/xlators/cluster/dht/src/dht-linkfile.c
@@ -19,8 +19,37 @@
#include "compat.h"
#include "dht-common.h"
+int
+dht_linkfile_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ char is_linkfile = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ if (op_ret)
+ goto out;
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
+ if (!is_linkfile)
+ gf_log (this->name, GF_LOG_WARNING, "got non-linkfile %s:%s",
+ prev->this->name, local->loc.path);
+out:
+ local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
+ inode, stbuf, postparent, postparent,
+ xattr);
+ return 0;
+}
+#define is_equal(a, b) (a == b)
int
dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, inode_t *inode,
@@ -28,29 +57,68 @@ dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postparent, dict_t *xdata)
{
dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ call_frame_t *prev = NULL;
+ dict_t *xattrs = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
local = frame->local;
+ if (!op_ret)
+ local->linked = _gf_true;
+
+ FRAME_SU_UNDO (frame, dht_local_t);
+
+ if (op_ret && (op_errno == EEXIST)) {
+ conf = this->private;
+ prev = cookie;
+ subvol = prev->this;
+ if (!subvol)
+ goto out;
+ xattrs = dict_new ();
+ if (!xattrs)
+ goto out;
+ ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set linkto key");
+ goto out;
+ }
+
+ STACK_WIND (frame, dht_linkfile_lookup_cbk, subvol,
+ subvol->fops->lookup, &local->loc, xattrs);
+ if (xattrs)
+ dict_unref (xattrs);
+ return 0;
+ }
+out:
local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
inode, stbuf, preparent, postparent,
xdata);
+ if (xattrs)
+ dict_unref (xattrs);
return 0;
}
int
dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
+ xlator_t *this,
xlator_t *tovol, xlator_t *fromvol, loc_t *loc)
{
dht_local_t *local = NULL;
dict_t *dict = NULL;
int need_unref = 0;
int ret = 0;
+ dht_conf_t *conf = this->private;
local = frame->local;
local->linkfile.linkfile_cbk = linkfile_cbk;
local->linkfile.srcvol = tovol;
+ local->linked = _gf_false;
+
dict = local->params;
if (!dict) {
dict = dict_new ();
@@ -71,8 +139,7 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
gf_log ("dht-linkfile", GF_LOG_INFO,
"%s: internal-fop set failed", loc->path);
- ret = dict_set_str (dict, "trusted.glusterfs.dht.linkto",
- tovol->name);
+ ret = dict_set_str (dict, conf->link_xattr_name, tovol->name);
if (ret < 0) {
gf_log (frame->this->name, GF_LOG_INFO,
@@ -81,6 +148,10 @@ dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
goto out;
}
+ local->link_subvol = fromvol;
+ /* Always create as root:root. dht_linkfile_attr_heal fixes the
+ * ownsership */
+ FRAME_SU_DO (frame, dht_local_t);
STACK_WIND (frame, dht_linkfile_create_cbk,
fromvol, fromvol->fops->mknod, loc,
S_IFREG | DHT_LINKFILE_MODE, 0, 0, dict);
@@ -173,7 +244,7 @@ dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf,
if (!xattr)
goto out;
- ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname);
+ ret = dict_get_ptr (xattr, conf->link_xattr_name, &volname);
if ((-1 == ret) || !volname)
goto out;
@@ -188,3 +259,70 @@ dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf,
out:
return subvol;
}
+
+int
+dht_linkfile_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *statpre,
+ struct iatt *statpost, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ loc_t *loc = NULL;
+
+ local = frame->local;
+ loc = &local->loc;
+
+ if (op_ret)
+ gf_log (this->name, GF_LOG_ERROR, "setattr of uid/gid on %s"
+ " :<gfid:%s> failed (%s)",
+ (loc->path? loc->path: "NULL"),
+ uuid_utoa(local->gfid), strerror(op_errno));
+
+ DHT_STACK_DESTROY (frame);
+
+ return 0;
+}
+
+int
+dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this)
+{
+ int ret = -1;
+ call_frame_t *copy = NULL;
+ dht_local_t *local = NULL;
+ dht_local_t *copy_local = NULL;
+ xlator_t *subvol = NULL;
+ struct iatt stbuf = {0,};
+
+ local = frame->local;
+
+ GF_VALIDATE_OR_GOTO ("dht", local, out);
+ GF_VALIDATE_OR_GOTO ("dht", local->link_subvol, out);
+
+ if (local->stbuf.ia_type == IA_INVAL)
+ return 0;
+
+ uuid_copy (local->loc.gfid, local->stbuf.ia_gfid);
+
+ copy = copy_frame (frame);
+
+ if (!copy)
+ goto out;
+
+ copy_local = dht_local_init (copy, &local->loc, NULL, 0);
+
+ if (!copy_local)
+ goto out;
+
+ stbuf = local->stbuf;
+ subvol = local->link_subvol;
+
+ copy->local = copy_local;
+
+ FRAME_SU_DO (copy, dht_local_t);
+
+ STACK_WIND (copy, dht_linkfile_setattr_cbk, subvol,
+ subvol->fops->setattr, &copy_local->loc,
+ &stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL);
+ ret = 0;
+out:
+ return ret;
+}
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index d31f8a012..4f78f5203 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -16,6 +16,8 @@
#include "dht-common.h"
#include "xlator.h"
+#include <signal.h>
+#include <fnmatch.h>
#define GF_DISK_SECTOR_SIZE 512
#define DHT_REBALANCE_PID 4242 /* Change it if required */
@@ -56,7 +58,8 @@ dht_write_with_holes (xlator_t *to, fd_t *fd, struct iovec *vec, int count,
if (ret < 0) {
gf_log (THIS->name, GF_LOG_WARNING,
"failed to write (%s)",
- strerror (errno));
+ strerror (-ret));
+ ret = -1;
goto out;
}
@@ -74,7 +77,8 @@ dht_write_with_holes (xlator_t *to, fd_t *fd, struct iovec *vec, int count,
/* 'path' will be logged in calling function */
gf_log (THIS->name, GF_LOG_WARNING,
"failed to write (%s)",
- strerror (errno));
+ strerror (-ret));
+ ret = -1;
goto out;
}
}
@@ -90,6 +94,41 @@ out:
}
+/*
+ return values:
+ -1 : failure
+ -2 : success
+
+Hard link migration is carried out in three stages.
+
+(Say there are n hardlinks)
+Stage 1: Setting the new hashed subvol information on the 1st hardlink
+ encountered (linkto setxattr)
+
+Stage 2: Creating hardlinks on new hashed subvol for the 2nd to (n-1)th
+ hardlink
+
+Stage 3: Physical migration of the data file for nth hardlink
+
+Why to deem "-2" as success and not "0":
+
+ dht_migrate_file expects return value "0" from _is_file_migratable if
+the file has to be migrated.
+
+ _is_file_migratable returns zero only when it is called with the
+flag "GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS".
+
+ gf_defrag_handle_hardlink calls dht_migrate_file for physical migration
+of the data file with the flag "GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS"
+
+Hence, gf_defrag_handle_hardlink returning "0" for success will force
+"dht_migrate_file" to migrate each of the hardlink which is not intended.
+
+For each of the three stage mentioned above "-2" will be returned and will
+be converted to "0" in dht_migrate_file.
+
+*/
+
int32_t
gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
struct iatt *stbuf)
@@ -101,12 +140,16 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
data_t *data = NULL;
struct iatt iatt = {0,};
int32_t op_errno = 0;
+ dht_conf_t *conf = NULL;
GF_VALIDATE_OR_GOTO ("defrag", loc, out);
GF_VALIDATE_OR_GOTO ("defrag", loc->name, out);
GF_VALIDATE_OR_GOTO ("defrag", stbuf, out);
GF_VALIDATE_OR_GOTO ("defrag", this, out);
GF_VALIDATE_OR_GOTO ("defrag", xattrs, out);
+ GF_VALIDATE_OR_GOTO ("defrag", this->private, out);
+
+ conf = this->private;
if (uuid_is_null (loc->pargfid)) {
gf_log ("", GF_LOG_ERROR, "loc->pargfid is NULL for "
@@ -137,10 +180,10 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
gf_log (this->name, GF_LOG_INFO, "Attempting to migrate hardlink %s "
"with gfid %s from %s -> %s", loc->name, uuid_utoa (loc->gfid),
cached_subvol->name, hashed_subvol->name);
- data = dict_get (xattrs, DHT_LINKFILE_KEY);
+ data = dict_get (xattrs, conf->link_xattr_name);
/* set linkto on cached -> hashed if not present, else link it */
if (!data) {
- ret = dict_set_str (xattrs, DHT_LINKFILE_KEY,
+ ret = dict_set_str (xattrs, conf->link_xattr_name,
hashed_subvol->name);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Failed to set "
@@ -152,9 +195,11 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Linkto setxattr "
"failed %s -> %s (%s)", cached_subvol->name,
- loc->name, strerror (errno));
+ loc->name, strerror (-ret));
+ ret = -1;
goto out;
}
+ ret = -2;
goto out;
} else {
linkto_subvol = dht_linkfile_subvol (this, NULL, NULL, xattrs);
@@ -167,7 +212,8 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
ret = syncop_link (hashed_subvol, loc, loc);
if (ret) {
- op_errno = errno;
+ op_errno = -ret;
+ ret = -1;
gf_log (this->name, GF_LOG_ERROR, "link of %s -> %s"
" failed on subvol %s (%s)", loc->name,
uuid_utoa(loc->gfid),
@@ -179,7 +225,8 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
ret = syncop_lookup (hashed_subvol, loc, NULL, &iatt, NULL, NULL);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Failed lookup %s on %s (%s)"
- , loc->name, hashed_subvol->name, strerror (errno));
+ , loc->name, hashed_subvol->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -189,12 +236,19 @@ gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
if (ret)
goto out;
}
- ret = 0;
+ ret = -2;
out:
return ret;
}
-
+/*
+ return values
+ 0 : File will be migrated
+ -2 : File will not be migrated
+ (This is the return value from gf_defrag_handle_hardlink. Checkout
+ gf_defrag_handle_hardlink for description of "returning -2")
+ -1 : failure
+*/
static inline int
__is_file_migratable (xlator_t *this, loc_t *loc,
struct iatt *stbuf, dict_t *xattrs, int flags)
@@ -217,7 +271,12 @@ __is_file_migratable (xlator_t *this, loc_t *loc,
if (flags == GF_DHT_MIGRATE_HARDLINK) {
ret = gf_defrag_handle_hardlink (this, loc,
xattrs, stbuf);
- if (ret) {
+
+ /*
+ Returning zero will force the file to be remigrated.
+ Checkout gf_defrag_handle_hardlink for more information.
+ */
+ if (ret && ret != -2) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to migrate file with link",
loc->path);
@@ -225,8 +284,8 @@ __is_file_migratable (xlator_t *this, loc_t *loc,
} else {
gf_log (this->name, GF_LOG_WARNING,
"%s: file has hardlinks", loc->path);
+ ret = -ENOTSUP;
}
- ret = ENOTSUP;
goto out;
}
@@ -238,14 +297,16 @@ out:
static inline int
__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf,
- dict_t *dict, fd_t **dst_fd)
+ dict_t *dict, fd_t **dst_fd, dict_t *xattr)
{
- xlator_t *this = NULL;
- int ret = -1;
- fd_t *fd = NULL;
- struct iatt new_stbuf = {0,};
+ xlator_t *this = NULL;
+ int ret = -1;
+ fd_t *fd = NULL;
+ struct iatt new_stbuf = {0,};
+ dht_conf_t *conf = NULL;
this = THIS;
+ conf = this->private;
ret = dict_set_static_bin (dict, "gfid-req", stbuf->ia_gfid, 16);
if (ret) {
@@ -254,7 +315,7 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
goto out;
}
- ret = dict_set_str (dict, DHT_LINKFILE_KEY, from->name);
+ ret = dict_set_str (dict, conf->link_xattr_name, from->name);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to set gfid in dict for create", loc->path);
@@ -281,32 +342,46 @@ __dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struc
goto out;
}
}
- if ((ret == -1) && (errno != ENOENT)) {
+ if ((ret < 0) && (-ret != ENOENT)) {
/* File exists in destination, but not accessible */
gf_log (THIS->name, GF_LOG_WARNING,
"%s: failed to lookup file (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
goto out;
}
/* Create the destination with LINKFILE mode, and linkto xattr,
if the linkfile already exists, it will just open the file */
ret = syncop_create (to, loc, O_RDWR, DHT_LINKFILE_MODE, fd,
- dict);
+ dict, &new_stbuf);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"failed to create %s on %s (%s)",
- loc->path, to->name, strerror (errno));
+ loc->path, to->name, strerror (-ret));
+ ret = -1;
goto out;
}
+ ret = syncop_fsetxattr (to, fd, xattr, 0);
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to set xattr on %s (%s)",
+ loc->path, to->name, strerror (-ret));
+
+ ret = syncop_ftruncate (to, fd, stbuf->ia_size);
+ if (ret < 0)
+ gf_log (this->name, GF_LOG_ERROR,
+ "ftruncate failed for %s on %s (%s)",
+ loc->path, to->name, strerror (-ret));
+
ret = syncop_fsetattr (to, fd, stbuf,
(GF_SET_ATTR_UID | GF_SET_ATTR_GID),
NULL, NULL);
if (ret < 0)
gf_log (this->name, GF_LOG_ERROR,
"chown failed for %s on %s (%s)",
- loc->path, to->name, strerror (errno));
+ loc->path, to->name, strerror (-ret));
if (dst_fd)
*dst_fd = fd;
@@ -327,13 +402,17 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
int ret = -1;
xlator_t *this = NULL;
+ uint64_t src_statfs_blocks = 1;
+ uint64_t dst_statfs_blocks = 1;
+
this = THIS;
ret = syncop_statfs (from, loc, &src_statfs);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"failed to get statfs of %s on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -341,7 +420,8 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"failed to get statfs of %s on %s (%s)",
- loc->path, to->name, strerror (errno));
+ loc->path, to->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -350,22 +430,34 @@ __dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
if (flag != GF_DHT_MIGRATE_DATA)
goto check_avail_space;
- if (((dst_statfs.f_bavail *
- dst_statfs.f_bsize) / GF_DISK_SECTOR_SIZE) <
- (((src_statfs.f_bavail * src_statfs.f_bsize) /
- GF_DISK_SECTOR_SIZE) - stbuf->ia_blocks)) {
- gf_log (this->name, GF_LOG_WARNING,
- "data movement attempted from node (%s) with"
- " higher disk space to a node (%s) with "
- "lesser disk space (%s)", from->name,
- to->name, loc->path);
-
- /* this is not a 'failure', but we don't want to
- consider this as 'success' too :-/ */
- ret = 1;
- goto out;
+ /* Check:
+ During rebalance `migrate-data` - Destination subvol experiences
+ a `reduction` in 'blocks' of free space, at the same time source
+ subvol gains certain 'blocks' of free space. A valid check is
+ necessary here to avoid errorneous move to destination where
+ the space could be scantily available.
+ */
+ if (stbuf) {
+ dst_statfs_blocks = ((dst_statfs.f_bavail *
+ dst_statfs.f_bsize) /
+ GF_DISK_SECTOR_SIZE);
+ src_statfs_blocks = ((src_statfs.f_bavail *
+ src_statfs.f_bsize) /
+ GF_DISK_SECTOR_SIZE);
+ if ((dst_statfs_blocks - stbuf->ia_blocks) <
+ (src_statfs_blocks + stbuf->ia_blocks)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "data movement attempted from node (%s) with"
+ " higher disk space to a node (%s) with "
+ "lesser disk space (%s)", from->name,
+ to->name, loc->path);
+
+ /* this is not a 'failure', but we don't want to
+ consider this as 'success' too :-/ */
+ ret = 1;
+ goto out;
+ }
}
-
check_avail_space:
if (((dst_statfs.f_bavail * dst_statfs.f_bsize) /
GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) {
@@ -428,6 +520,8 @@ __dht_rebalance_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst
if (ret >= 0)
ret = 0;
+ else
+ ret = -1;
return ret;
}
@@ -442,8 +536,10 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc,
dict_t *dict = NULL;
xlator_t *this = NULL;
struct iatt iatt = {0,};
+ dht_conf_t *conf = NULL;
this = THIS;
+ conf = this->private;
fd = fd_create (loc->inode, DHT_REBALANCE_PID);
if (!fd) {
@@ -454,10 +550,11 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc,
}
ret = syncop_open (from, loc, O_RDWR, fd);
- if (ret == -1) {
+ if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"failed to open file %s on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -466,7 +563,7 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc,
if (!dict)
goto out;
- ret = dict_set_str (dict, DHT_LINKFILE_KEY, to->name);
+ ret = dict_set_str (dict, conf->link_xattr_name, to->name);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"failed to set xattr in dict for %s (linkto:%s)",
@@ -480,7 +577,8 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"failed to set xattr on %s in %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -494,7 +592,8 @@ __dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"failed to set mode on %s in %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -519,12 +618,13 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
dict_t *dict = NULL;
char *link = NULL;
struct iatt stbuf = {0,};
+ dht_conf_t *conf = this->private;
dict = dict_new ();
if (!dict)
goto out;
- ret = dict_set_int32 (dict, DHT_LINKFILE_KEY, 256);
+ ret = dict_set_int32 (dict, conf->link_xattr_name, 256);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to set 'linkto' key in dict", loc->path);
@@ -533,19 +633,21 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
/* check in the destination if the file is link file */
ret = syncop_lookup (to, loc, dict, &stbuf, &rsp_dict, NULL);
- if ((ret == -1) && (errno != ENOENT)) {
+ if ((ret < 0) && (-ret != ENOENT)) {
gf_log (this->name, GF_LOG_WARNING, "%s: lookup failed (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
goto out;
}
/* we no more require this key */
- dict_del (dict, DHT_LINKFILE_KEY);
+ dict_del (dict, conf->link_xattr_name);
/* file exists in target node, only if it is 'linkfile' its valid,
otherwise, error out */
if (!ret) {
- if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict)) {
+ if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict,
+ conf->link_xattr_name)) {
gf_log (this->name, GF_LOG_WARNING,
"%s: file exists in destination", loc->path);
ret = -1;
@@ -557,7 +659,8 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to delete the linkfile (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
goto out;
}
}
@@ -577,15 +680,17 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
if (ret < 0) {
gf_log (this->name, GF_LOG_WARNING,
"%s: readlink on symlink failed (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
goto out;
}
- ret = syncop_symlink (to, loc, link, dict);
+ ret = syncop_symlink (to, loc, link, dict, 0);
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: creating symlink failed (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -595,18 +700,31 @@ migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
ret = syncop_mknod (to, loc, st_mode_from_ia (buf->ia_prot,
buf->ia_type),
makedev (ia_major (buf->ia_rdev),
- ia_minor (buf->ia_rdev)), dict);
+ ia_minor (buf->ia_rdev)), dict, 0);
if (ret) {
gf_log (this->name, GF_LOG_WARNING, "%s: mknod failed (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
goto out;
}
done:
+ ret = syncop_setattr (to, loc, buf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID |
+ GF_SET_ATTR_MODE), NULL, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform setattr on %s (%s)",
+ loc->path, to->name, strerror (-ret));
+ ret = -1;
+ }
+
ret = syncop_unlink (from, loc);
- if (ret)
+ if (ret) {
gf_log (this->name, GF_LOG_WARNING, "%s: unlink failed (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
+ }
out:
if (dict)
@@ -640,6 +758,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
dict_t *xattr = NULL;
dict_t *xattr_rsp = NULL;
int file_has_holes = 0;
+ dht_conf_t *conf = this->private;
gf_log (this->name, GF_LOG_INFO, "%s: attempting to move from %s to %s",
loc->path, from->name, to->name);
@@ -648,7 +767,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (!dict)
goto out;
- ret = dict_set_int32 (dict, DHT_LINKFILE_KEY, 256);
+ ret = dict_set_int32 (dict, conf->link_xattr_name, 256);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to set 'linkto' key in dict", loc->path);
@@ -659,21 +778,24 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
ret = syncop_lookup (from, loc, dict, &stbuf, &xattr_rsp, NULL);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "%s: lookup failed on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
/* we no more require this key */
- dict_del (dict, DHT_LINKFILE_KEY);
+ dict_del (dict, conf->link_xattr_name);
/* preserve source mode, so set the same to the destination */
src_ia_prot = stbuf.ia_prot;
/* Check if file can be migrated */
ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag);
- if (ret)
+ if (ret) {
+ if (ret == -2)
+ ret = 0;
goto out;
-
+ }
/* Take care of the special files */
if (!IA_ISREG (stbuf.ia_type)) {
/* Special files */
@@ -681,9 +803,18 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
+ /* TODO: move all xattr related operations to fd based operations */
+ ret = syncop_listxattr (from, loc, &xattr);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to get xattr from %s (%s)",
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
+ }
+
/* create the destination, with required modes/xattr */
ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf,
- dict, &dst_fd);
+ dict, &dst_fd, xattr);
if (ret)
goto out;
@@ -700,10 +831,12 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
goto out;
}
+
ret = syncop_fstat (from, src_fd, &stbuf);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "failed to lookup %s on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -722,33 +855,22 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"%s: failed to reset target size back to 0 (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
}
ret = -1;
goto out;
}
- /* TODO: move all xattr related operations to fd based operations */
- ret = syncop_listxattr (from, loc, &xattr);
- if (ret == -1)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to get xattr from %s (%s)",
- loc->path, from->name, strerror (errno));
-
- ret = syncop_setxattr (to, loc, xattr, 0);
- if (ret == -1)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set xattr on %s (%s)",
- loc->path, to->name, strerror (errno));
-
/* TODO: Sync the locks */
ret = syncop_fsync (to, dst_fd, 0);
- if (ret)
+ if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to fsync on %s (%s)",
- loc->path, to->name, strerror (errno));
+ loc->path, to->name, strerror (-ret));
+ ret = -1;
+ }
/* Phase 2 - Data-Migration Complete, Housekeeping updates pending */
@@ -758,7 +880,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
/* Failed to get the stat info */
gf_log (this->name, GF_LOG_ERROR,
"failed to fstat file %s on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -780,7 +903,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to perform setattr on %s (%s)",
- loc->path, to->name, strerror (errno));
+ loc->path, to->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -791,7 +915,8 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to perform setattr on %s (%s)",
- loc->path, to->name, strerror (errno));
+ loc->path, to->name, strerror (-ret));
+ ret = -1;
}
/* Make the source as a linkfile first before deleting it */
@@ -801,16 +926,37 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (ret) {
gf_log (this->name, GF_LOG_WARNING, \
"%s: failed to perform setattr on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
+ /* Free up the data blocks on the source node, as the whole
+ file is migrated */
+ ret = syncop_ftruncate (from, src_fd, 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform truncate on %s (%s)",
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
+ }
+
+ /* remove the 'linkto' xattr from the destination */
+ ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name, 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to perform removexattr on %s (%s)",
+ loc->path, to->name, strerror (-ret));
+ ret = -1;
+ }
+
/* Do a stat and check the gfid before unlink */
ret = syncop_stat (from, loc, &empty_iatt);
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to do a stat on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
@@ -820,33 +966,18 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"%s: failed to perform unlink on %s (%s)",
- loc->path, from->name, strerror (errno));
+ loc->path, from->name, strerror (-ret));
+ ret = -1;
goto out;
}
}
- /* Free up the data blocks on the source node, as the whole
- file is migrated */
- ret = syncop_ftruncate (from, src_fd, 0);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to perform truncate on %s (%s)",
- loc->path, from->name, strerror (errno));
- }
-
- /* remove the 'linkto' xattr from the destination */
- ret = syncop_fremovexattr (to, dst_fd, DHT_LINKFILE_KEY);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to perform removexattr on %s (%s)",
- loc->path, to->name, strerror (errno));
- }
-
ret = syncop_lookup (this, loc, NULL, NULL, NULL, NULL);
if (ret) {
gf_log (this->name, GF_LOG_DEBUG,
"%s: failed to lookup the file on subvolumes (%s)",
- loc->path, strerror (errno));
+ loc->path, strerror (-ret));
+ ret = -1;
}
gf_log (this->name, GF_LOG_INFO,
@@ -1009,10 +1140,10 @@ gf_defrag_handle_migrate_error (int32_t op_errno, gf_defrag_info_t *defrag)
{
/* if errno is not ENOSPC or ENOTCONN, we can still continue
with rebalance process */
- if ((errno != ENOSPC) || (errno != ENOTCONN))
+ if ((op_errno != ENOSPC) || (op_errno != ENOTCONN))
return 1;
- if (errno == ENOTCONN) {
+ if (op_errno == ENOTCONN) {
/* Most probably mount point went missing (mostly due
to a brick down), say rebalance failure to user,
let him restart it if everything is fine */
@@ -1020,7 +1151,7 @@ gf_defrag_handle_migrate_error (int32_t op_errno, gf_defrag_info_t *defrag)
return -1;
}
- if (errno == ENOSPC) {
+ if (op_errno == ENOSPC) {
/* rebalance process itself failed, may be
remote brick went down, or write failed due to
disk full etc etc.. */
@@ -1031,6 +1162,31 @@ gf_defrag_handle_migrate_error (int32_t op_errno, gf_defrag_info_t *defrag)
return 0;
}
+static gf_boolean_t
+gf_defrag_pattern_match (gf_defrag_info_t *defrag, char *name, uint64_t size)
+{
+ gf_defrag_pattern_list_t *trav = NULL;
+ gf_boolean_t match = _gf_false;
+ gf_boolean_t ret = _gf_false;
+
+ GF_VALIDATE_OR_GOTO ("dht", defrag, out);
+
+ trav = defrag->defrag_pattern;
+ while (trav) {
+ if (!fnmatch (trav->path_pattern, name, FNM_NOESCAPE)) {
+ match = _gf_true;
+ break;
+ }
+ trav = trav->next;
+ }
+
+ if ((match == _gf_true) && (size >= trav->size))
+ ret = _gf_true;
+
+ out:
+ return ret;
+}
+
/* We do a depth first traversal of directories. But before we move into
* subdirs, we complete the data migration of those directories whose layouts
* have been fixed
@@ -1053,11 +1209,12 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
int32_t op_errno = 0;
char *uuid_str = NULL;
uuid_t node_uuid = {0,};
- int readdir_operrno = 0;
struct timeval dir_start = {0,};
struct timeval end = {0,};
double elapsed = {0,};
struct timeval start = {0,};
+ int32_t err = 0;
+ int loglevel = GF_LOG_TRACE;
gf_log (this->name, GF_LOG_INFO, "migrate data called on %s",
loc->path);
@@ -1073,6 +1230,7 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s",
loc->path);
+ ret = -1;
goto out;
}
@@ -1080,17 +1238,21 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL,
&entries)) != 0) {
- if (ret < 0)
- break;
- /* Need to keep track of ENOENT errno, that means, there is no
- need to send more readdirp() */
- readdir_operrno = errno;
+ if (ret < 0) {
- free_entries = _gf_true;
+ gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s."
+ " Aborting migrate-data",
+ strerror(-ret));
+ ret = -1;
+ goto out;
+ }
if (list_empty (&entries.list))
break;
+
+ free_entries = _gf_true;
+
list_for_each_entry_safe (entry, tmp, &entries.list, list) {
if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
ret = 1;
@@ -1110,6 +1272,12 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (defrag->stats == _gf_true) {
gettimeofday (&start, NULL);
}
+ if (defrag->defrag_pattern &&
+ (gf_defrag_pattern_match (defrag, entry->d_name,
+ entry->d_stat.ia_size)
+ == _gf_false)) {
+ continue;
+ }
loc_wipe (&entry_loc);
ret =dht_build_child_loc (this, &entry_loc, loc,
entry->d_name);
@@ -1144,6 +1312,7 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "%s"
" lookup failed", entry_loc.path);
+ ret = -1;
continue;
}
@@ -1152,6 +1321,7 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if(ret < 0) {
gf_log (this->name, GF_LOG_ERROR, "Failed to "
"get node-uuid for %s", entry_loc.path);
+ ret = -1;
continue;
}
@@ -1161,6 +1331,7 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
gf_log (this->name, GF_LOG_ERROR, "Failed to "
"get node-uuid from dict for %s",
entry_loc.path);
+ ret = -1;
continue;
}
@@ -1185,30 +1356,50 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
/* if distribute is present, it will honor this key.
- * -1 is returned if distribute is not present or file
- * doesn't have a link-file. If file has link-file, the
- * path of link-file will be the value, and also that
- * guarantees that file has to be mostly migrated */
+ * -1, ENODATA is returned if distribute is not present
+ * or file doesn't have a link-file. If file has
+ * link-file, the path of link-file will be the value,
+ * and also that guarantees that file has to be mostly
+ * migrated */
ret = syncop_getxattr (this, &entry_loc, &dict,
GF_XATTR_LINKINFO_KEY);
if (ret < 0) {
- gf_log (this->name, GF_LOG_TRACE, "failed to "
- "get link-to key for %s",
- entry_loc.path);
+ if (-ret != ENODATA) {
+ loglevel = GF_LOG_ERROR;
+ defrag->total_failures += 1;
+ } else {
+ loglevel = GF_LOG_TRACE;
+ }
+ gf_log (this->name, loglevel, "%s: failed to "
+ "get "GF_XATTR_LINKINFO_KEY" key - %s",
+ entry_loc.path, strerror (-ret));
+ ret = -1;
continue;
}
ret = syncop_setxattr (this, &entry_loc, migrate_data,
0);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "migrate-data"
- " failed for %s", entry_loc.path);
- defrag->total_failures +=1;
+ err = op_errno;
+ /* errno is overloaded. See
+ * rebalance_task_completion () */
+ if (err != ENOSPC) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "migrate-data skipped for %s"
+ " due to space constraints",
+ entry_loc.path);
+ defrag->skipped +=1;
+ } else{
+ gf_log (this->name, GF_LOG_ERROR,
+ "migrate-data failed for %s",
+ entry_loc.path);
+ defrag->total_failures +=1;
+ }
}
- if (ret == -1) {
- op_errno = errno;
+ if (ret < 0) {
+ op_errno = -ret;
ret = gf_defrag_handle_migrate_error (op_errno,
defrag);
@@ -1243,9 +1434,6 @@ gf_defrag_migrate_data (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
gf_dirent_free (&entries);
free_entries = _gf_false;
INIT_LIST_HEAD (&entries.list);
-
- if (readdir_operrno == ENOENT)
- break;
}
gettimeofday (&end, NULL);
@@ -1269,7 +1457,6 @@ out:
}
-
int
gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
dict_t *fix_layout, dict_t *migrate_data)
@@ -1289,6 +1476,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Lookup failed on %s",
loc->path);
+ ret = -1;
goto out;
}
@@ -1319,12 +1507,19 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
while ((ret = syncop_readdirp (this, fd, 131072, offset, NULL,
&entries)) != 0)
{
- if ((ret < 0) || (ret && (errno == ENOENT)))
- break;
- free_entries = _gf_true;
+
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s"
+ ". Aborting fix-layout",strerror(-ret));
+ ret = -1;
+ goto out;
+ }
if (list_empty (&entries.list))
break;
+
+ free_entries = _gf_true;
+
list_for_each_entry_safe (entry, tmp, &entries.list, list) {
if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
ret = 1;
@@ -1351,7 +1546,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (uuid_is_null (entry->d_stat.ia_gfid)) {
gf_log (this->name, GF_LOG_ERROR, "%s/%s"
- "gfid not present", loc->path,
+ " gfid not present", loc->path,
entry->d_name);
continue;
}
@@ -1361,7 +1556,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid);
if (uuid_is_null (loc->gfid)) {
gf_log (this->name, GF_LOG_ERROR, "%s/%s"
- "gfid not present", loc->path,
+ " gfid not present", loc->path,
entry->d_name);
continue;
}
@@ -1373,6 +1568,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "%s"
" lookup failed", entry_loc.path);
+ ret = -1;
continue;
}
@@ -1383,6 +1579,8 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
"failed for %s", entry_loc.path);
defrag->defrag_status =
GF_DEFRAG_STATUS_FAILED;
+ defrag->total_failures ++;
+ ret = -1;
goto out;
}
ret = gf_defrag_fix_layout (this, defrag, &entry_loc,
@@ -1391,6 +1589,7 @@ gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Fix layout "
"failed for %s", entry_loc.path);
+ defrag->total_failures++;
goto out;
}
@@ -1462,6 +1661,7 @@ gf_defrag_start_crawl (void *data)
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "look up on / failed");
+ ret = -1;
goto out;
}
@@ -1481,6 +1681,8 @@ gf_defrag_start_crawl (void *data)
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed",
loc.path);
+ defrag->total_failures++;
+ ret = -1;
goto out;
}
@@ -1587,6 +1789,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
uint64_t size = 0;
uint64_t lookup = 0;
uint64_t failures = 0;
+ uint64_t skipped = 0;
char *status = "";
double elapsed = 0;
struct timeval end = {0,};
@@ -1603,6 +1806,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
size = defrag->total_data;
lookup = defrag->num_files_lookedup;
failures = defrag->total_failures;
+ skipped = defrag->skipped;
gettimeofday (&end, NULL);
@@ -1626,6 +1830,7 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
gf_log (THIS->name, GF_LOG_WARNING,
"failed to set lookedup file count");
+
ret = dict_set_int32 (dict, "status", defrag->defrag_status);
if (ret)
gf_log (THIS->name, GF_LOG_WARNING,
@@ -1638,6 +1843,14 @@ gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
}
ret = dict_set_uint64 (dict, "failures", failures);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to set failure count");
+
+ ret = dict_set_uint64 (dict, "skipped", skipped);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to set skipped file count");
log:
switch (defrag->defrag_status) {
case GF_DEFRAG_STATUS_NOT_STARTED:
@@ -1655,13 +1868,15 @@ log:
case GF_DEFRAG_STATUS_FAILED:
status = "failed";
break;
+ default:
+ break;
}
gf_log (THIS->name, GF_LOG_INFO, "Rebalance is %s. Time taken is %.2f "
"secs", status, elapsed);
gf_log (THIS->name, GF_LOG_INFO, "Files migrated: %"PRIu64", size: %"
- PRIu64", lookups: %"PRIu64", failures: %"PRIu64, files, size,
- lookup, failures);
+ PRIu64", lookups: %"PRIu64", failures: %"PRIu64", skipped: "
+ "%"PRIu64, files, size, lookup, failures, skipped);
out:
@@ -1669,7 +1884,8 @@ out:
}
int
-gf_defrag_stop (gf_defrag_info_t *defrag, dict_t *output)
+gf_defrag_stop (gf_defrag_info_t *defrag, gf_defrag_status_t status,
+ dict_t *output)
{
/* TODO: set a variable 'stop_defrag' here, it should be checked
in defrag loop */
@@ -1681,7 +1897,7 @@ gf_defrag_stop (gf_defrag_info_t *defrag, dict_t *output)
}
gf_log ("", GF_LOG_INFO, "Received stop command on rebalance");
- defrag->defrag_status = GF_DEFRAG_STATUS_STOPPED;
+ defrag->defrag_status = status;
if (output)
gf_defrag_status_get (defrag, output);
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c
index f1a4364df..925538cc8 100644
--- a/xlators/cluster/dht/src/dht-rename.c
+++ b/xlators/cluster/dht/src/dht-rename.c
@@ -306,7 +306,54 @@ err:
NULL, NULL);
return 0;
}
+#define DHT_MARK_FOP_INTERNAL(xattr) do { \
+ int tmp = -1; \
+ if (!xattr) { \
+ xattr = dict_new (); \
+ if (!xattr) \
+ break; \
+ } \
+ tmp = dict_set_str (xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \
+ if (tmp) { \
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set" \
+ " internal dict key for %s", local->loc.path); \
+ } \
+ }while (0)
+
+#define DHT_MARKER_DONT_ACCOUNT(xattr) do { \
+ int tmp = -1; \
+ if (!xattr) { \
+ xattr = dict_new (); \
+ if (!xattr) \
+ break; \
+ } \
+ tmp = dict_set_str (xattr, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, \
+ "yes"); \
+ if (tmp) { \
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set" \
+ " marker dont account key for %s", local->loc.path); \
+ } \
+ }while (0)
+int
+dht_rename_done (call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (local->linked == _gf_true) {
+ local->linked = _gf_false;
+ dht_linkfile_attr_heal (frame, this);
+ }
+ DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
+ DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
+ &local->stbuf, &local->preoldparent,
+ &local->postoldparent, &local->preparent,
+ &local->postparent, NULL);
+
+ return 0;
+}
int
dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -340,11 +387,7 @@ dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
WIPE (&local->postparent);
if (is_last_call (this_call_cnt)) {
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent, &local->preparent,
- &local->postparent, NULL);
+ dht_rename_done (frame, this);
}
out:
@@ -362,7 +405,7 @@ dht_rename_cleanup (call_frame_t *frame)
xlator_t *dst_hashed = NULL;
xlator_t *dst_cached = NULL;
int call_cnt = 0;
-
+ dict_t *xattr = NULL;
local = frame->local;
this = frame->this;
@@ -386,24 +429,53 @@ dht_rename_cleanup (call_frame_t *frame)
if (!call_cnt)
goto nolinks;
+ DHT_MARK_FOP_INTERNAL (xattr);
+
if (dst_hashed != src_hashed && dst_hashed != src_cached) {
+ dict_t *xattr_new = NULL;
+
gf_log (this->name, GF_LOG_TRACE,
"unlinking linkfile %s @ %s => %s",
local->loc.path, dst_hashed->name, src_cached->name);
+
+ xattr_new = dict_copy_with_ref (xattr, NULL);
+
+
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+
STACK_WIND (frame, dht_rename_unlink_cbk,
dst_hashed, dst_hashed->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr_new);
+
+ dict_unref (xattr_new);
+ xattr_new = NULL;
}
if (src_cached != dst_hashed) {
+ dict_t *xattr_new = NULL;
+
gf_log (this->name, GF_LOG_TRACE,
"unlinking link %s => %s (%s)", local->loc.path,
local->loc2.path, src_cached->name);
+
+ xattr_new = dict_copy_with_ref (xattr, NULL);
+
+ if (uuid_compare (local->loc.pargfid,
+ local->loc2.pargfid) == 0) {
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+ }
+
STACK_WIND (frame, dht_rename_unlink_cbk,
src_cached, src_cached->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr_new);
+
+ dict_unref (xattr_new);
+ xattr_new = NULL;
}
+ if (xattr)
+ dict_unref (xattr);
+
return 0;
nolinks:
@@ -441,6 +513,10 @@ dht_rename_links_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->loc.path, prev->this->name, strerror (op_errno));
}
+ if (local->linked == _gf_true) {
+ local->linked = _gf_false;
+ dht_linkfile_attr_heal (frame, this);
+ }
DHT_STACK_DESTROY (frame);
return 0;
@@ -463,6 +539,7 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
xlator_t *rename_subvol = NULL;
call_frame_t *link_frame = NULL;
dht_local_t *link_local = NULL;
+ dict_t *xattr = NULL;
local = frame->local;
prev = cookie;
@@ -472,6 +549,8 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dst_hashed = local->dst_hashed;
dst_cached = local->dst_cached;
+ if (local->linked == _gf_true)
+ FRAME_SU_UNDO (frame, dht_local_t);
if (op_ret == -1) {
gf_log (this->name, GF_LOG_WARNING,
"%s: rename on %s failed (%s)", local->loc.path,
@@ -501,15 +580,25 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
uuid_copy (link_local->gfid, local->loc.inode->gfid);
dht_linkfile_create (link_frame, dht_rename_links_create_cbk,
- src_cached, dst_hashed, &link_local->loc);
+ this, src_cached, dst_hashed,
+ &link_local->loc);
}
err:
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->preoldparent, preoldparent, prev->this);
- dht_iatt_merge (this, &local->postoldparent, postoldparent, prev->this);
- dht_iatt_merge (this, &local->preparent, prenewparent, prev->this);
- dht_iatt_merge (this, &local->postparent, postnewparent, prev->this);
+ /* Merge attrs only from src_cached. In case there of src_cached !=
+ * dst_hashed, this ignores linkfile attrs. */
+ if (prev->this == src_cached) {
+ dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
+ dht_iatt_merge (this, &local->preoldparent, preoldparent,
+ prev->this);
+ dht_iatt_merge (this, &local->postoldparent, postoldparent,
+ prev->this);
+ dht_iatt_merge (this, &local->preparent, prenewparent,
+ prev->this);
+ dht_iatt_merge (this, &local->postparent, postnewparent,
+ prev->this);
+ }
+
/* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk
* is called. since rename has already happened on rename_subvol,
@@ -534,24 +623,47 @@ err:
if (local->call_cnt == 0)
goto unwind;
+ DHT_MARK_FOP_INTERNAL (xattr);
+
if (src_cached != dst_hashed && src_cached != dst_cached) {
+ dict_t *xattr_new = NULL;
+
+ xattr_new = dict_copy_with_ref (xattr, NULL);
+
gf_log (this->name, GF_LOG_TRACE,
"deleting old src datafile %s @ %s",
local->loc.path, src_cached->name);
+ if (uuid_compare (local->loc.pargfid,
+ local->loc2.pargfid) == 0) {
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+ }
+
STACK_WIND (frame, dht_rename_unlink_cbk,
src_cached, src_cached->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr_new);
+
+ dict_unref (xattr_new);
+ xattr_new = NULL;
}
if (src_hashed != rename_subvol && src_hashed != src_cached) {
+ dict_t *xattr_new = NULL;
+
+ xattr_new = dict_copy_with_ref (xattr, NULL);
+
gf_log (this->name, GF_LOG_TRACE,
"deleting old src linkfile %s @ %s",
local->loc.path, src_hashed->name);
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+
STACK_WIND (frame, dht_rename_unlink_cbk,
src_hashed, src_hashed->fops->unlink,
- &local->loc, 0, NULL);
+ &local->loc, 0, xattr_new);
+
+ dict_unref (xattr_new);
+ xattr_new = NULL;
}
if (dst_cached
@@ -563,8 +675,10 @@ err:
STACK_WIND (frame, dht_rename_unlink_cbk,
dst_cached, dst_cached->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr);
}
+ if (xattr)
+ dict_unref (xattr);
return 0;
unwind:
@@ -572,16 +686,16 @@ unwind:
WIPE (&local->postoldparent);
WIPE (&local->preparent);
WIPE (&local->postparent);
+ if (xattr)
+ dict_unref (xattr);
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent, &local->preparent,
- &local->postparent, NULL);
+ dht_rename_done (frame, this);
return 0;
cleanup:
+ if (xattr)
+ dict_unref (xattr);
dht_rename_cleanup (frame);
return 0;
@@ -591,12 +705,13 @@ cleanup:
int
dht_do_rename (call_frame_t *frame)
{
- dht_local_t *local = NULL;
- xlator_t *dst_hashed = NULL;
- xlator_t *src_cached = NULL;
- xlator_t *dst_cached = NULL;
- xlator_t *this = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *dst_hashed = NULL;
+ xlator_t *src_cached = NULL;
+ xlator_t *dst_cached = NULL;
+ xlator_t *this = NULL;
xlator_t *rename_subvol = NULL;
+ dict_t *dict = NULL;
local = frame->local;
@@ -611,13 +726,19 @@ dht_do_rename (call_frame_t *frame)
else
rename_subvol = dst_hashed;
+ if ((src_cached != dst_hashed) && (rename_subvol == dst_hashed)) {
+ DHT_MARKER_DONT_ACCOUNT(dict);
+ }
+
gf_log (this->name, GF_LOG_TRACE,
"renaming %s => %s (%s)",
local->loc.path, local->loc2.path, rename_subvol->name);
+ if (local->linked == _gf_true)
+ FRAME_SU_DO (frame, dht_local_t);
STACK_WIND (frame, dht_rename_cbk,
rename_subvol, rename_subvol->fops->rename,
- &local->loc, &local->loc2, NULL);
+ &local->loc, &local->loc2, dict);
return 0;
}
@@ -645,6 +766,9 @@ dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local->op_ret = -1;
if (op_errno != ENOENT)
local->op_errno = op_errno;
+ } else if (local->src_cached == prev->this) {
+ /* merge of attr returned only from linkfile creation */
+ dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
}
this_call_cnt = dht_frame_return (frame);
@@ -710,6 +834,7 @@ dht_rename_create_links (call_frame_t *frame)
xlator_t *dst_hashed = NULL;
xlator_t *dst_cached = NULL;
int call_cnt = 0;
+ dict_t *xattr = NULL;
local = frame->local;
@@ -720,18 +845,27 @@ dht_rename_create_links (call_frame_t *frame)
dst_hashed = local->dst_hashed;
dst_cached = local->dst_cached;
+ DHT_MARK_FOP_INTERNAL (xattr);
if (src_cached == dst_cached) {
+ dict_t *xattr_new = NULL;
+
if (dst_hashed == dst_cached)
goto nolinks;
+ xattr_new = dict_copy_with_ref (xattr, NULL);
+
gf_log (this->name, GF_LOG_TRACE,
"unlinking dst linkfile %s @ %s",
local->loc2.path, dst_hashed->name);
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+
STACK_WIND (frame, dht_rename_unlink_links_cbk,
dst_hashed, dst_hashed->fops->unlink,
- &local->loc2, 0, NULL);
+ &local->loc2, 0, xattr_new);
+
+ dict_unref (xattr_new);
return 0;
}
@@ -748,17 +882,28 @@ dht_rename_create_links (call_frame_t *frame)
"linkfile %s @ %s => %s",
local->loc.path, dst_hashed->name, src_cached->name);
memcpy (local->gfid, local->loc.inode->gfid, 16);
- dht_linkfile_create (frame, dht_rename_links_cbk,
+ dht_linkfile_create (frame, dht_rename_links_cbk, this,
src_cached, dst_hashed, &local->loc);
}
if (src_cached != dst_hashed) {
+ dict_t *xattr_new = NULL;
+
+ xattr_new = dict_copy_with_ref (xattr, NULL);
+
gf_log (this->name, GF_LOG_TRACE,
"link %s => %s (%s)", local->loc.path,
local->loc2.path, src_cached->name);
+ if (uuid_compare (local->loc.pargfid,
+ local->loc2.pargfid) == 0) {
+ DHT_MARKER_DONT_ACCOUNT(xattr_new);
+ }
+
STACK_WIND (frame, dht_rename_links_cbk,
src_cached, src_cached->fops->link,
- &local->loc, &local->loc2, NULL);
+ &local->loc, &local->loc2, xattr_new);
+
+ dict_unref (xattr_new);
}
nolinks:
@@ -766,6 +911,8 @@ nolinks:
/* skip to next step */
dht_do_rename (frame);
}
+ if (xattr)
+ dict_unref (xattr);
return 0;
}
diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c
index 8ac5a9e9e..0e6527544 100644
--- a/xlators/cluster/dht/src/dht-selfheal.c
+++ b/xlators/cluster/dht/src/dht-selfheal.c
@@ -17,6 +17,7 @@
#include "glusterfs.h"
#include "xlator.h"
#include "dht-common.h"
+#include "glusterfs-acl.h"
#define DHT_SET_LAYOUT_RANGE(layout,i,srt,chunk,cnt,path) do { \
layout->list[i].start = srt; \
@@ -28,6 +29,13 @@
layout->list[i].xlator->name, path); \
} while (0)
+#define DHT_RESET_LAYOUT_RANGE(layout) do { \
+ int cnt = 0; \
+ for (cnt = 0; cnt < layout->cnt; cnt++ ) { \
+ layout->list[cnt].start = 0; \
+ layout->list[cnt].stop = 0; \
+ } \
+ } while (0)
static uint32_t
dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n)
@@ -38,12 +46,20 @@ dht_overlap_calc (dht_layout_t *old, int o, dht_layout_t *new, int n)
if (old->list[o].err > 0 || new->list[n].err > 0)
return 0;
+ if (old->list[o].start == old->list[o].stop) {
+ return 0;
+ }
+
+ if (new->list[n].start == new->list[n].stop) {
+ return 0;
+ }
+
if ((old->list[o].start > new->list[n].stop) ||
(old->list[o].stop < new->list[n].start))
return 0;
- return max (old->list[o].start, new->list[n].start) -
- min (old->list[o].stop, new->list[n].stop);
+ return min (old->list[o].stop, new->list[n].stop) -
+ max (old->list[o].start, new->list[n].start) + 1;
}
@@ -101,7 +117,8 @@ dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int
dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
- dht_layout_t *layout, int i)
+ dht_layout_t *layout, int i,
+ xlator_t *req_subvol)
{
xlator_t *subvol = NULL;
dict_t *xattr = NULL;
@@ -109,16 +126,23 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
xlator_t *this = NULL;
int32_t *disk_layout = NULL;
dht_local_t *local = NULL;
-
+ dht_conf_t *conf = NULL;
+ data_t *data = NULL;
local = frame->local;
- subvol = layout->list[i].xlator;
+ if (req_subvol)
+ subvol = req_subvol;
+ else
+ subvol = layout->list[i].xlator;
this = frame->this;
GF_VALIDATE_OR_GOTO ("", this, err);
GF_VALIDATE_OR_GOTO (this->name, layout, err);
GF_VALIDATE_OR_GOTO (this->name, local, err);
GF_VALIDATE_OR_GOTO (this->name, subvol, err);
+ VALIDATE_OR_GOTO (this->private, err);
+
+ conf = this->private;
xattr = get_new_dict ();
if (!xattr) {
@@ -133,8 +157,7 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
goto err;
}
- ret = dict_set_bin (xattr, "trusted.glusterfs.dht",
- disk_layout, 4 * 4);
+ ret = dict_set_bin (xattr, conf->xattr_name, disk_layout, 4 * 4);
if (ret == -1) {
gf_log (this->name, GF_LOG_WARNING,
"%s: (subvol %s) failed to set xattr dictionary",
@@ -149,7 +172,16 @@ dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc,
layout->type, subvol->name, loc->path);
dict_ref (xattr);
-
+ if (local->xattr) {
+ data = dict_get (local->xattr, QUOTA_LIMIT_KEY);
+ if (data) {
+ ret = dict_add (xattr, QUOTA_LIMIT_KEY, data);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "set quota limit key on %s",loc->path);
+ }
+ }
+ }
if (!uuid_is_null (local->gfid))
uuid_copy (loc->gfid, local->gfid);
@@ -179,21 +211,42 @@ dht_fix_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
int i = 0;
int count = 0;
xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *dummy = NULL;
local = frame->local;
this = frame->this;
+ conf = this->private;
gf_log (this->name, GF_LOG_DEBUG,
"writing the new range for all subvolumes");
- local->call_cnt = count = layout->cnt;
+ local->call_cnt = count = conf->subvolume_cnt;
for (i = 0; i < layout->cnt; i++) {
- dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i);
+ dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL);
if (--count == 0)
- break;
+ goto out;
+ }
+ /* if we are here, subvolcount > layout_count. subvols-per-directory
+ * option might be set here. We need to clear out layout from the
+ * non-participating subvolumes, else it will result in overlaps */
+ dummy = dht_layout_new (this, 1);
+ if (!dummy)
+ goto out;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (_gf_false ==
+ dht_is_subvol_in_layout (layout, conf->subvolumes[i])) {
+ dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0,
+ conf->subvolumes[i]);
+ if (--count == 0)
+ break;
+ }
}
+
+ dht_layout_unref (this, dummy);
+out:
return 0;
}
@@ -204,9 +257,12 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
int missing_xattr = 0;
int i = 0;
xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *dummy = NULL;
local = frame->local;
this = frame->this;
+ conf = this->private;
for (i = 0; i < layout->cnt; i++) {
if (layout->list[i].err != -1 || !layout->list[i].stop) {
@@ -219,7 +275,14 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
}
missing_xattr++;
}
-
+ /* Also account for subvolumes with no-layout. Used for zero'ing out
+ * the layouts and for setting quota key's if present */
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (_gf_false ==
+ dht_is_subvol_in_layout (layout, conf->subvolumes[i])) {
+ missing_xattr++;
+ }
+ }
gf_log (this->name, GF_LOG_TRACE,
"%d subvolumes missing xattr for %s",
missing_xattr, loc->path);
@@ -230,16 +293,29 @@ dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout)
}
local->call_cnt = missing_xattr;
-
for (i = 0; i < layout->cnt; i++) {
if (layout->list[i].err != -1 || !layout->list[i].stop)
continue;
- dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i);
+ dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i, NULL);
if (--missing_xattr == 0)
break;
}
+ dummy = dht_layout_new (this, 1);
+ if (!dummy)
+ goto out;
+ for (i = 0; i < conf->subvolume_cnt && missing_xattr; i++) {
+ if (_gf_false ==
+ dht_is_subvol_in_layout (layout, conf->subvolumes[i])) {
+ dht_selfheal_dir_xattr_persubvol (frame, loc, dummy, 0,
+ conf->subvolumes[i]);
+ missing_xattr--;
+ }
+ }
+
+ dht_layout_unref (this, dummy);
+out:
return 0;
}
@@ -476,7 +552,7 @@ dht_selfheal_layout_alloc_start (xlator_t *this, loc_t *loc,
uint32_t hashval = 0;
int ret = 0;
- ret = dht_hash_compute (layout->type, loc->path, &hashval);
+ ret = dht_hash_compute (this, layout->type, loc->path, &hashval);
if (ret == 0) {
start = (hashval % layout->cnt);
}
@@ -507,9 +583,33 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
for (i = 0; i < layout->cnt; i++) {
err = layout->list[i].err;
- if (err == -1 || err == 0) {
- layout->list[i].err = -1;
+ if (err == -1 || err == 0 || err == ENOENT) {
+ /* Setting list[i].err = -1 is an indication for
+ dht_selfheal_layout_new_directory() to assign
+ a range. We set it to -1 based on any one of
+ the three criteria:
+
+ - err == -1 already, which means directory
+ existed but layout was not set on it.
+
+ - err == 0, which means directory exists and
+ has an old layout piece which will be
+ overwritten now.
+
+ - err == ENOENT, which means directory does
+ not exist (possibly racing with mkdir or
+ finishing half done mkdir). The missing
+ directory will be attempted to be recreated.
+
+ It is important to note that it is safe
+ to race with mkdir() as self-heal and
+ mkdir are idempotent operations. Both will
+ strive to set the directory and layouts to
+ the same final state.
+ */
count++;
+ if (!err)
+ layout->list[i].err = -1;
}
}
@@ -524,8 +624,12 @@ dht_get_layout_count (xlator_t *this, dht_layout_t *layout, int new_layout)
}
}
- count = ((layout->spread_cnt) ? layout->spread_cnt :
- ((count) ? count : 1));
+ /* if layout->spread_cnt is set, check if it is <= available
+ * subvolumes (down brick and decommissioned bricks are considered
+ * un-availbale). Else return count (available up bricks) */
+ count = ((layout->spread_cnt &&
+ (layout->spread_cnt <= count)) ?
+ layout->spread_cnt : ((count) ? count : 1));
return count;
}
@@ -535,18 +639,25 @@ void dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
dht_layout_t *new_layout);
void dht_layout_entry_swap (dht_layout_t *layout, int i, int j);
+void dht_layout_range_swap (dht_layout_t *layout, int i, int j);
+
+/*
+ * It's a bit icky using local variables in a macro, but it makes the rest
+ * of the code a lot clearer.
+ */
+#define OV_ENTRY(x,y) table[x*new->cnt+y]
void
dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc,
dht_layout_t *new, dht_layout_t *old)
{
int i = 0;
- int k = 0;
+ int j = 0;
uint32_t curr_overlap = 0;
uint32_t max_overlap = 0;
int max_overlap_idx = -1;
uint32_t overlap = 0;
-
+ uint32_t *table = NULL;
dht_layout_sort_volname (old);
/* Now both old_layout->list[] and new_layout->list[]
@@ -555,31 +666,61 @@ dht_selfheal_layout_maximize_overlap (call_frame_t *frame, loc_t *loc,
to the same subvolumes
*/
+ /* Build a table of overlaps between new[i] and old[j]. */
+ table = alloca(sizeof(overlap)*old->cnt*new->cnt);
+ if (!table) {
+ return;
+ }
+ memset(table,0,sizeof(overlap)*old->cnt*new->cnt);
+ for (i = 0; i < new->cnt; ++i) {
+ for (j = 0; j < old->cnt; ++j) {
+ OV_ENTRY(i,j) = dht_overlap_calc(old,j,new,i);
+ }
+ }
+
for (i = 0; i < new->cnt; i++) {
- if (new->list[i].err > 0)
+ if (new->list[i].err > 0) {
/* Subvol might be marked for decommission
with EINVAL, or some other serious error
marked with positive errno.
*/
continue;
+ }
+
+ max_overlap = 0;
+ max_overlap_idx = i;
+ for (j = (i + 1); j < new->cnt; ++j) {
+ if (new->list[j].err > 0) {
+ /* Subvol might be marked for decommission
+ with EINVAL, or some other serious error
+ marked with positive errno.
+ */
+ continue;
+ }
+ /* Calculate the overlap now. */
+ curr_overlap = OV_ENTRY(i,i) + OV_ENTRY(j,j);
+ /* Calculate the overlap after the proposed swap. */
+ overlap = OV_ENTRY(i,j) + OV_ENTRY(j,i);
+ /* Are we better than status quo? */
+ if (overlap > curr_overlap) {
+ overlap -= curr_overlap;
+ /* Are we better than the previous choice? */
+ if (overlap > max_overlap) {
+ max_overlap = overlap;
+ max_overlap_idx = j;
+ }
+ }
+ }
- curr_overlap = dht_overlap_calc (old, i, new, i);
- max_overlap = curr_overlap;
- max_overlap_idx = i;
-
- /* Subvols up to `i' have already found their best
- matches. Search only from the rest.
- */
- for (k = (i + 1); k < new->cnt; k++) {
- overlap = dht_overlap_calc (old, i, new, k);
- if (overlap > max_overlap) {
- max_overlap_idx = k;
- max_overlap = overlap;
- }
- }
-
- if (max_overlap > curr_overlap)
- dht_layout_entry_swap (new, i, max_overlap_idx);
+ if (max_overlap_idx != i) {
+ dht_layout_range_swap (new, i, max_overlap_idx);
+ /* Need to swap the table values too. */
+ for (j = 0; j < old->cnt; ++j) {
+ overlap = OV_ENTRY(i,j);
+ OV_ENTRY(i,j) = OV_ENTRY(max_overlap_idx,j);
+ OV_ENTRY(max_overlap_idx,j) = overlap;
+ }
+ }
}
}
@@ -593,6 +734,8 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,
dht_layout_t *new_layout = NULL;
dht_conf_t *priv = NULL;
dht_local_t *local = NULL;
+ uint32_t subvol_down = 0;
+ int ret = 0;
this = frame->this;
priv = this->private;
@@ -608,6 +751,17 @@ dht_fix_layout_of_directory (call_frame_t *frame, loc_t *loc,
if (!new_layout)
goto done;
+ /* If a subvolume is down, do not re-write the layout. */
+ ret = dht_layout_anomalies (this, loc, layout, NULL, NULL, NULL,
+ &subvol_down, NULL, NULL);
+
+ if (subvol_down || (ret == -1)) {
+ gf_log (this->name, GF_LOG_WARNING, "%u subvolume(s) are down"
+ ". Skipping fix layout.", subvol_down);
+ GF_FREE (new_layout);
+ return NULL;
+ }
+
for (i = 0; i < new_layout->cnt; i++) {
if (layout->list[i].err != ENOSPC)
new_layout->list[i].err = layout->list[i].err;
@@ -661,9 +815,11 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
start_subvol = dht_selfheal_layout_alloc_start (this, loc, layout);
+ /* clear out the range, as we are re-computing here */
+ DHT_RESET_LAYOUT_RANGE (layout);
for (i = start_subvol; i < layout->cnt; i++) {
err = layout->list[i].err;
- if (err == -1) {
+ if (err == -1 || err == ENOENT) {
DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
cnt, loc->path);
if (--cnt == 0) {
@@ -676,7 +832,7 @@ dht_selfheal_layout_new_directory (call_frame_t *frame, loc_t *loc,
for (i = 0; i < start_subvol; i++) {
err = layout->list[i].err;
- if (err == -1) {
+ if (err == -1 || err == ENOENT) {
DHT_SET_LAYOUT_RANGE(layout, i, start, chunk,
cnt, loc->path);
if (--cnt == 0) {
@@ -695,35 +851,17 @@ int
dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc,
dht_layout_t *layout)
{
- dht_conf_t *conf = NULL;
- xlator_t *this = NULL;
dht_local_t *local = NULL;
- int missing = -1;
- int down = -1;
- int holes = -1;
+ uint32_t holes = 0;
int ret = -1;
int i = -1;
- int overlaps = -1;
+ uint32_t overlaps = 0;
- this = frame->this;
- conf = this->private;
local = frame->local;
- missing = local->selfheal.missing;
- down = local->selfheal.down;
holes = local->selfheal.hole_cnt;
overlaps = local->selfheal.overlaps_cnt;
- if ((missing + down) == conf->subvolume_cnt) {
- dht_selfheal_layout_new_directory (frame, loc, layout);
- ret = 0;
- }
-
- if (holes <= down) {
- /* the down subvol might fill up the holes */
- ret = 0;
- }
-
if (holes || overlaps) {
dht_selfheal_layout_new_directory (frame, loc, layout);
ret = 0;
@@ -775,6 +913,9 @@ dht_fix_directory_layout (call_frame_t *frame,
/* No layout sorting required here */
tmp_layout = dht_fix_layout_of_directory (frame, &local->loc, layout);
+ if (!tmp_layout) {
+ return -1;
+ }
dht_fix_dir_xattr (frame, &local->loc, tmp_layout);
return 0;
@@ -797,9 +938,8 @@ dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
dht_layout_anomalies (this, loc, layout,
&local->selfheal.hole_cnt,
&local->selfheal.overlaps_cnt,
- &local->selfheal.missing,
- &local->selfheal.down,
- &local->selfheal.misc);
+ NULL, &local->selfheal.down,
+ &local->selfheal.misc, NULL);
down = local->selfheal.down;
misc = local->selfheal.misc;
@@ -858,3 +998,51 @@ dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
return ret;
}
+
+int
+dht_dir_attr_heal (void *data)
+{
+ call_frame_t *frame = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ int call_cnt = 0;
+ int ret = -1;
+ int i = 0;
+
+ GF_VALIDATE_OR_GOTO ("dht", data, out);
+
+ frame = data;
+ local = frame->local;
+ this = frame->this;
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", local, out);
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO ("dht", conf, out);
+
+ call_cnt = conf->subvolume_cnt;
+
+ for (i = 0; i < call_cnt; i++) {
+ subvol = conf->subvolumes[i];
+ if (!subvol || (subvol == dht_first_up_subvol (this)))
+ continue;
+ ret = syncop_setattr (subvol, &local->loc, &local->stbuf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
+ NULL, NULL);
+ if (ret) {
+ gf_log ("dht", GF_LOG_ERROR, "Failed to set uid/gid on"
+ " %s on %s subvol (%s)", local->loc.path,
+ subvol->name, strerror (-ret));
+ }
+ }
+out:
+ return 0;
+}
+
+int
+dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data)
+{
+ DHT_STACK_DESTROY (sync_frame);
+ return 0;
+}
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
new file mode 100644
index 000000000..f2e7467ab
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-shared.c
@@ -0,0 +1,780 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+/* TODO: add NS locking */
+
+#include "statedump.h"
+#include "dht-common.h"
+
+/* TODO:
+ - use volumename in xattr instead of "dht"
+ - use NS locks
+ - handle all cases in self heal layout reconstruction
+ - complete linkfile selfheal
+*/
+struct volume_options options[];
+
+void
+dht_layout_dump (dht_layout_t *layout, const char *prefix)
+{
+
+ char key[GF_DUMP_MAX_BUF_LEN];
+ int i = 0;
+
+ if (!layout)
+ goto out;
+ if (!prefix)
+ goto out;
+
+ gf_proc_dump_build_key(key, prefix, "cnt");
+ gf_proc_dump_write(key, "%d", layout->cnt);
+ gf_proc_dump_build_key(key, prefix, "preset");
+ gf_proc_dump_write(key, "%d", layout->preset);
+ gf_proc_dump_build_key(key, prefix, "gen");
+ gf_proc_dump_write(key, "%d", layout->gen);
+ if (layout->type != IA_INVAL) {
+ gf_proc_dump_build_key(key, prefix, "inode type");
+ gf_proc_dump_write(key, "%d", layout->type);
+ }
+
+ if (!IA_ISDIR (layout->type))
+ goto out;
+
+ for (i = 0; i < layout->cnt; i++) {
+ gf_proc_dump_build_key(key, prefix,"list[%d].err", i);
+ gf_proc_dump_write(key, "%d", layout->list[i].err);
+ gf_proc_dump_build_key(key, prefix,"list[%d].start", i);
+ gf_proc_dump_write(key, "%u", layout->list[i].start);
+ gf_proc_dump_build_key(key, prefix,"list[%d].stop", i);
+ gf_proc_dump_write(key, "%u", layout->list[i].stop);
+ if (layout->list[i].xlator) {
+ gf_proc_dump_build_key(key, prefix,
+ "list[%d].xlator.type", i);
+ gf_proc_dump_write(key, "%s",
+ layout->list[i].xlator->type);
+ gf_proc_dump_build_key(key, prefix,
+ "list[%d].xlator.name", i);
+ gf_proc_dump_write(key, "%s",
+ layout->list[i].xlator->name);
+ }
+ }
+
+out:
+ return;
+}
+
+
+int32_t
+dht_priv_dump (xlator_t *this)
+{
+ char key_prefix[GF_DUMP_MAX_BUF_LEN];
+ char key[GF_DUMP_MAX_BUF_LEN];
+ int i = 0;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+
+ if (!this)
+ goto out;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ ret = TRY_LOCK(&conf->subvolume_lock);
+ if (ret != 0) {
+ return ret;
+ }
+
+ gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name);
+ gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv",
+ this->name);
+ gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt);
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ snprintf (key, sizeof (key), "subvolumes[%d]", i);
+ gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type,
+ conf->subvolumes[i]->name);
+ if (conf->file_layouts && conf->file_layouts[i]){
+ snprintf (key, sizeof (key), "file_layouts[%d]", i);
+ dht_layout_dump(conf->file_layouts[i], key);
+ }
+ if (conf->dir_layouts && conf->dir_layouts[i]) {
+ snprintf (key, sizeof (key), "dir_layouts[%d]", i);
+ dht_layout_dump(conf->dir_layouts[i], key);
+ }
+ if (conf->subvolume_status) {
+
+ snprintf (key, sizeof (key), "subvolume_status[%d]", i);
+ gf_proc_dump_write(key, "%d",
+ (int)conf->subvolume_status[i]);
+ }
+
+ }
+
+ gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed);
+ gf_proc_dump_write("gen", "%d", conf->gen);
+ gf_proc_dump_write("min_free_disk", "%lf", conf->min_free_disk);
+ gf_proc_dump_write("min_free_inodes", "%lf", conf->min_free_inodes);
+ gf_proc_dump_write("disk_unit", "%c", conf->disk_unit);
+ gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval);
+ gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit);
+
+ if (conf->du_stats) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i])
+ continue;
+
+ snprintf (key, sizeof (key), "subvolumes[%d]", i);
+ gf_proc_dump_write (key, "%s",
+ conf->subvolumes[i]->name);
+
+ snprintf (key, sizeof (key),
+ "du_stats[%d].avail_percent", i);
+ gf_proc_dump_write (key, "%lf",
+ conf->du_stats[i].avail_percent);
+
+ snprintf (key, sizeof (key), "du_stats[%d].avail_space",
+ i);
+ gf_proc_dump_write (key, "%lu",
+ conf->du_stats[i].avail_space);
+
+ snprintf (key, sizeof (key),
+ "du_stats[%d].avail_inodes", i);
+ gf_proc_dump_write (key, "%lf",
+ conf->du_stats[i].avail_inodes);
+
+ snprintf (key, sizeof (key), "du_stats[%d].log", i);
+ gf_proc_dump_write (key, "%lu",
+ conf->du_stats[i].log);
+ }
+ }
+
+ if (conf->last_stat_fetch.tv_sec)
+ gf_proc_dump_write("last_stat_fetch", "%s",
+ ctime(&conf->last_stat_fetch.tv_sec));
+
+ UNLOCK(&conf->subvolume_lock);
+
+out:
+ return ret;
+}
+
+int32_t
+dht_inodectx_dump (xlator_t *this, inode_t *inode)
+{
+ int ret = -1;
+ dht_layout_t *layout = NULL;
+
+ if (!this)
+ goto out;
+ if (!inode)
+ goto out;
+
+ ret = dht_inode_ctx_layout_get (inode, this, &layout);
+
+ if ((ret != 0) || !layout)
+ return ret;
+
+ gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name);
+ dht_layout_dump(layout, "layout");
+
+out:
+ return ret;
+}
+
+void
+dht_fini (xlator_t *this)
+{
+ int i = 0;
+ dht_conf_t *conf = NULL;
+
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+
+ conf = this->private;
+ this->private = NULL;
+ if (conf) {
+ if (conf->file_layouts) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ GF_FREE (conf->file_layouts[i]);
+ }
+ GF_FREE (conf->file_layouts);
+ }
+
+ GF_FREE (conf->subvolumes);
+
+ GF_FREE (conf->subvolume_status);
+
+ GF_FREE (conf);
+ }
+out:
+ return;
+}
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+
+ ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1);
+
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
+ "failed");
+ return ret;
+ }
+out:
+ return ret;
+}
+
+
+int
+dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf,
+ const char *bricks)
+{
+ int i = 0;
+ int ret = -1;
+ char *tmpstr = NULL;
+ char *dup_brick = NULL;
+ char *node = NULL;
+
+ if (!conf || !bricks)
+ goto out;
+
+ dup_brick = gf_strdup (bricks);
+ node = strtok_r (dup_brick, ",", &tmpstr);
+ while (node) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!strcmp (conf->subvolumes[i]->name, node)) {
+ conf->decommissioned_bricks[i] =
+ conf->subvolumes[i];
+ conf->decommission_subvols_cnt++;
+ gf_log (this->name, GF_LOG_INFO,
+ "decommissioning subvolume %s",
+ conf->subvolumes[i]->name);
+ break;
+ }
+ }
+ if (i == conf->subvolume_cnt) {
+ /* Wrong node given. */
+ goto out;
+ }
+ node = strtok_r (NULL, ",", &tmpstr);
+ }
+
+ ret = 0;
+ conf->decommission_in_progress = 1;
+out:
+ GF_FREE (dup_brick);
+
+ return ret;
+}
+
+
+int
+dht_decommissioned_remove (xlator_t *this, dht_conf_t *conf)
+{
+ int i = 0;
+ int ret = -1;
+
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->decommissioned_bricks[i]) {
+ conf->decommissioned_bricks[i] = NULL;
+ conf->decommission_subvols_cnt--;
+ }
+ }
+
+ ret = 0;
+out:
+
+ return ret;
+}
+void
+dht_init_regex (xlator_t *this, dict_t *odict, char *name,
+ regex_t *re, gf_boolean_t *re_valid)
+{
+ char *temp_str;
+
+ if (dict_get_str (odict, name, &temp_str) != 0) {
+ if (strcmp(name,"rsync-hash-regex")) {
+ return;
+ }
+ temp_str = "^\\.(.+)\\.[^.]+$";
+ }
+
+ if (*re_valid) {
+ regfree(re);
+ *re_valid = _gf_false;
+ }
+
+ if (!strcmp(temp_str,"none")) {
+ return;
+ }
+
+ if (regcomp(re,temp_str,REG_EXTENDED) == 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "using regex %s = %s", name, temp_str);
+ *re_valid = _gf_true;
+ }
+ else {
+ gf_log (this->name, GF_LOG_WARNING,
+ "compiling regex %s failed", temp_str);
+ }
+}
+
+int
+dht_reconfigure (xlator_t *this, dict_t *options)
+{
+ dht_conf_t *conf = NULL;
+ char *temp_str = NULL;
+ gf_boolean_t search_unhashed;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("dht", this, out);
+ GF_VALIDATE_OR_GOTO ("dht", options, out);
+
+ conf = this->private;
+ if (!conf)
+ return 0;
+
+ if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) {
+ /* If option is not "auto", other options _should_ be boolean*/
+ if (strcasecmp (temp_str, "auto")) {
+ if (!gf_string2boolean (temp_str, &search_unhashed)) {
+ gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
+ " lookup-unhashed reconfigured (%s)",
+ temp_str);
+ conf->search_unhashed = search_unhashed;
+ } else {
+ gf_log(this->name, GF_LOG_ERROR, "Reconfigure:"
+ " lookup-unhashed should be boolean,"
+ " not (%s), defaulting to (%d)",
+ temp_str, conf->search_unhashed);
+ ret = -1;
+ goto out;
+ }
+ } else {
+ gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
+ " lookup-unhashed reconfigured auto ");
+ conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
+ }
+ }
+
+ GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options,
+ percent_or_size, out);
+ /* option can be any one of percent or bytes */
+ conf->disk_unit = 0;
+ if (conf->min_free_disk < 100.0)
+ conf->disk_unit = 'p';
+
+ GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options,
+ percent, out);
+
+ GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt,
+ options, uint32, out);
+
+ GF_OPTION_RECONF ("readdir-optimize", conf->readdir_optimize, options,
+ bool, out);
+ if (conf->defrag) {
+ GF_OPTION_RECONF ("rebalance-stats", conf->defrag->stats,
+ options, bool, out);
+ }
+
+ if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) {
+ ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
+ if (ret == -1)
+ goto out;
+ } else {
+ ret = dht_decommissioned_remove (this, conf);
+ if (ret == -1)
+ goto out;
+ }
+
+ dht_init_regex (this, options, "rsync-hash-regex",
+ &conf->rsync_regex, &conf->rsync_regex_valid);
+ dht_init_regex (this, options, "extra-hash-regex",
+ &conf->extra_regex, &conf->extra_regex_valid);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static int
+gf_defrag_pattern_list_fill (xlator_t *this, gf_defrag_info_t *defrag, char *data)
+{
+ int ret = -1;
+ char *tmp_str = NULL;
+ char *tmp_str1 = NULL;
+ char *dup_str = NULL;
+ char *num = NULL;
+ char *pattern_str = NULL;
+ char *pattern = NULL;
+ gf_defrag_pattern_list_t *temp_list = NULL;
+ gf_defrag_pattern_list_t *pattern_list = NULL;
+
+ if (!this || !defrag || !data)
+ goto out;
+
+ /* Get the pattern for pattern list. "pattern:<optional-size>"
+ * eg: *avi, *pdf:10MB, *:1TB
+ */
+ pattern_str = strtok_r (data, ",", &tmp_str);
+ while (pattern_str) {
+ dup_str = gf_strdup (pattern_str);
+ pattern_list = GF_CALLOC (1, sizeof (gf_defrag_pattern_list_t),
+ 1);
+ if (!pattern_list) {
+ goto out;
+ }
+ pattern = strtok_r (dup_str, ":", &tmp_str1);
+ num = strtok_r (NULL, ":", &tmp_str1);
+ if (!pattern)
+ goto out;
+ if (!num) {
+ if (gf_string2bytesize_uint64(pattern, &pattern_list->size)
+ == 0) {
+ pattern = "*";
+ }
+ } else if (gf_string2bytesize_uint64 (num, &pattern_list->size) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "invalid number format \"%s\"", num);
+ goto out;
+ }
+ memcpy (pattern_list->path_pattern, pattern, strlen (dup_str));
+
+ if (!defrag->defrag_pattern)
+ temp_list = NULL;
+ else
+ temp_list = defrag->defrag_pattern;
+
+ pattern_list->next = temp_list;
+
+ defrag->defrag_pattern = pattern_list;
+ pattern_list = NULL;
+
+ GF_FREE (dup_str);
+ dup_str = NULL;
+
+ pattern_str = strtok_r (NULL, ",", &tmp_str);
+ }
+
+ ret = 0;
+out:
+ if (ret)
+ GF_FREE (pattern_list);
+ GF_FREE (dup_str);
+
+ return ret;
+}
+
+int
+dht_init (xlator_t *this)
+{
+ dht_conf_t *conf = NULL;
+ char *temp_str = NULL;
+ int ret = -1;
+ int i = 0;
+ gf_defrag_info_t *defrag = NULL;
+ int cmd = 0;
+ char *node_uuid = NULL;
+
+
+ GF_VALIDATE_OR_GOTO ("dht", this, err);
+
+ if (!this->children) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Distribute needs more than one subvolume");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile");
+ }
+
+ conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t);
+ if (!conf) {
+ goto err;
+ }
+
+ ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd);
+
+ if (cmd) {
+ defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t),
+ gf_defrag_info_mt);
+
+ GF_VALIDATE_OR_GOTO (this->name, defrag, err);
+
+ LOCK_INIT (&defrag->lock);
+
+ defrag->is_exiting = 0;
+
+ conf->defrag = defrag;
+
+ ret = dict_get_str (this->options, "node-uuid", &node_uuid);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "node-uuid not "
+ "specified");
+ goto err;
+ }
+
+ if (uuid_parse (node_uuid, defrag->node_uuid)) {
+ gf_log (this->name, GF_LOG_ERROR, "Cannot parse "
+ "glusterd node uuid");
+ goto err;
+ }
+
+ defrag->cmd = cmd;
+
+ defrag->stats = _gf_false;
+ }
+
+ conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
+ if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
+ /* If option is not "auto", other options _should_ be boolean */
+ if (strcasecmp (temp_str, "auto"))
+ gf_string2boolean (temp_str, &conf->search_unhashed);
+ else
+ conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
+ }
+
+ GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool,
+ err);
+
+ GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err);
+
+ GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size,
+ err);
+
+ GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent,
+ err);
+
+ conf->dir_spread_cnt = conf->subvolume_cnt;
+ GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt,
+ uint32, err);
+
+ GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down,
+ bool, err);
+
+ GF_OPTION_INIT ("readdir-optimize", conf->readdir_optimize, bool, err);
+
+ if (defrag) {
+ GF_OPTION_INIT ("rebalance-stats", defrag->stats, bool, err);
+ if (dict_get_str (this->options, "rebalance-filter", &temp_str)
+ == 0) {
+ if (gf_defrag_pattern_list_fill (this, defrag, temp_str)
+ == -1) {
+ gf_log (this->name, GF_LOG_ERROR, "Cannot parse"
+ " rebalance-filter (%s)", temp_str);
+ goto err;
+ }
+ }
+ }
+
+ /* option can be any one of percent or bytes */
+ conf->disk_unit = 0;
+ if (conf->min_free_disk < 100)
+ conf->disk_unit = 'p';
+
+ ret = dht_init_subvolumes (this, conf);
+ if (ret == -1) {
+ goto err;
+ }
+
+ if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) {
+ ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
+ if (ret == -1)
+ goto err;
+ }
+
+ dht_init_regex (this, this->options, "rsync-hash-regex",
+ &conf->rsync_regex, &conf->rsync_regex_valid);
+ dht_init_regex (this, this->options, "extra-hash-regex",
+ &conf->extra_regex, &conf->extra_regex_valid);
+
+ ret = dht_layouts_init (this, conf);
+ if (ret == -1) {
+ goto err;
+ }
+
+ LOCK_INIT (&conf->subvolume_lock);
+ LOCK_INIT (&conf->layout_lock);
+
+ conf->gen = 1;
+
+ this->local_pool = mem_pool_new (dht_local_t, 512);
+ if (!this->local_pool) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to create local_t's memory pool");
+ goto err;
+ }
+
+ GF_OPTION_INIT ("xattr-name", conf->xattr_name, str, err);
+ gf_asprintf (&conf->link_xattr_name, "%s."DHT_LINKFILE_STR,
+ conf->xattr_name);
+ gf_asprintf (&conf->wild_xattr_name, "%s*", conf->xattr_name);
+ if (!conf->link_xattr_name || !conf->wild_xattr_name) {
+ goto err;
+ }
+
+ this->private = conf;
+
+ return 0;
+
+err:
+ if (conf) {
+ if (conf->file_layouts) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ GF_FREE (conf->file_layouts[i]);
+ }
+ GF_FREE (conf->file_layouts);
+ }
+
+ GF_FREE (conf->subvolumes);
+
+ GF_FREE (conf->subvolume_status);
+
+ GF_FREE (conf->du_stats);
+
+ GF_FREE (conf->defrag);
+
+ GF_FREE (conf->xattr_name);
+ GF_FREE (conf->link_xattr_name);
+ GF_FREE (conf->wild_xattr_name);
+
+ GF_FREE (conf);
+ }
+
+ return -1;
+}
+
+
+struct volume_options options[] = {
+ { .key = {"lookup-unhashed"},
+ .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
+ "on", "off"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "on",
+ .description = "This option if set to ON, does a lookup through "
+ "all the sub-volumes, in case a lookup didn't return any result "
+ "from the hash subvolume. If set to OFF, it does not do a lookup "
+ "on the remaining subvolumes."
+ },
+ { .key = {"min-free-disk"},
+ .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
+ .default_value = "10%",
+ .description = "Percentage/Size of disk space, after which the "
+ "process starts balancing out the cluster, and logs will appear "
+ "in log files",
+ },
+ { .key = {"min-free-inodes"},
+ .type = GF_OPTION_TYPE_PERCENT,
+ .default_value = "5%",
+ .description = "after system has only N% of inodes, warnings "
+ "starts to appear in log files",
+ },
+ { .key = {"unhashed-sticky-bit"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ },
+ { .key = {"use-readdirp"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "This option if set to ON, forces the use of "
+ "readdirp, and hence also displays the stats of the files."
+ },
+ { .key = {"assert-no-child-down"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option if set to ON, in the event of "
+ "CHILD_DOWN, will call exit."
+ },
+ { .key = {"directory-layout-spread"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .validate = GF_OPT_VALIDATE_MIN,
+ .description = "Specifies the directory layout spread. Takes number "
+ "of subvolumes as default value."
+ },
+ { .key = {"decommissioned-bricks"},
+ .type = GF_OPTION_TYPE_ANY,
+ .description = "This option if set to ON, decommissions "
+ "the brick, so that no new data is allowed to be created "
+ "on that brick."
+ },
+ { .key = {"rebalance-cmd"},
+ .type = GF_OPTION_TYPE_INT,
+ },
+ { .key = {"node-uuid"},
+ .type = GF_OPTION_TYPE_STR,
+ },
+ { .key = {"rebalance-stats"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option if set to ON displays and logs the "
+ " time taken for migration of each file, during the rebalance "
+ "process. If set to OFF, the rebalance logs will only display the "
+ "time spent in each directory."
+ },
+ { .key = {"readdir-optimize"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option if set to ON enables the optimization "
+ "that allows DHT to requests non-first subvolumes to filter out "
+ "directory entries."
+ },
+ { .key = {"rsync-hash-regex"},
+ .type = GF_OPTION_TYPE_STR,
+ /* Setting a default here doesn't work. See dht_init_regex. */
+ .description = "Regular expression for stripping temporary-file "
+ "suffix and prefix used by rsync, to prevent relocation when the "
+ "file is renamed."
+ },
+ { .key = {"extra-hash-regex"},
+ .type = GF_OPTION_TYPE_STR,
+ /* Setting a default here doesn't work. See dht_init_regex. */
+ .description = "Regular expression for stripping temporary-file "
+ "suffix and prefix used by an application, to prevent relocation when "
+ "the file is renamed."
+ },
+ { .key = {"rebalance-filter"},
+ .type = GF_OPTION_TYPE_STR,
+ },
+
+ { .key = {"xattr-name"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "trusted.glusterfs.dht",
+ .description = "Base for extended attributes used by this "
+ "translator instance, to avoid conflicts with others above or "
+ "below it."
+ },
+
+ /* NUFA option */
+ { .key = {"local-volume-name"},
+ .type = GF_OPTION_TYPE_XLATOR
+ },
+
+ /* switch option */
+ { .key = {"pattern.switch.case"},
+ .type = GF_OPTION_TYPE_ANY
+ },
+
+ { .key = {NULL} },
+};
diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c
index 6f16c6fcc..fc0ca2f77 100644
--- a/xlators/cluster/dht/src/dht.c
+++ b/xlators/cluster/dht/src/dht.c
@@ -14,492 +14,15 @@
#include "config.h"
#endif
-/* TODO: add NS locking */
-
#include "statedump.h"
#include "dht-common.h"
-/* TODO:
- - use volumename in xattr instead of "dht"
- - use NS locks
- - handle all cases in self heal layout reconstruction
- - complete linkfile selfheal
-*/
-struct volume_options options[];
-
-void
-dht_layout_dump (dht_layout_t *layout, const char *prefix)
-{
-
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 0;
-
- GF_VALIDATE_OR_GOTO ("dht", layout, out);
- GF_VALIDATE_OR_GOTO ("dht", prefix, out);
-
- gf_proc_dump_build_key(key, prefix, "cnt");
- gf_proc_dump_write(key, "%d", layout->cnt);
- gf_proc_dump_build_key(key, prefix, "preset");
- gf_proc_dump_write(key, "%d", layout->preset);
- gf_proc_dump_build_key(key, prefix, "gen");
- gf_proc_dump_write(key, "%d", layout->gen);
- gf_proc_dump_build_key(key, prefix, "type");
- gf_proc_dump_write(key, "%d", layout->type);
-
- for (i = 0; i < layout->cnt; i++) {
- gf_proc_dump_build_key(key, prefix,"list[%d].err", i);
- gf_proc_dump_write(key, "%d", layout->list[i].err);
- gf_proc_dump_build_key(key, prefix,"list[%d].start", i);
- gf_proc_dump_write(key, "%u", layout->list[i].start);
- gf_proc_dump_build_key(key, prefix,"list[%d].stop", i);
- gf_proc_dump_write(key, "%u", layout->list[i].stop);
- if (layout->list[i].xlator) {
- gf_proc_dump_build_key(key, prefix,
- "list[%d].xlator.type", i);
- gf_proc_dump_write(key, "%s",
- layout->list[i].xlator->type);
- gf_proc_dump_build_key(key, prefix,
- "list[%d].xlator.name", i);
- gf_proc_dump_write(key, "%s",
- layout->list[i].xlator->name);
- }
- }
-
-out:
- return;
-}
-
-
-int32_t
-dht_priv_dump (xlator_t *this)
-{
- char key_prefix[GF_DUMP_MAX_BUF_LEN];
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 0;
- dht_conf_t *conf = NULL;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
-
- conf = this->private;
-
- if (!conf)
- return -1;
-
- ret = TRY_LOCK(&conf->subvolume_lock);
- if (ret != 0) {
- return ret;
- }
-
- gf_proc_dump_add_section("xlator.cluster.dht.%s.priv", this->name);
- gf_proc_dump_build_key(key_prefix,"xlator.cluster.dht","%s.priv",
- this->name);
- gf_proc_dump_write("subvol_cnt","%d", conf->subvolume_cnt);
- for (i = 0; i < conf->subvolume_cnt; i++) {
- sprintf (key, "subvolumes[%d]", i);
- gf_proc_dump_write(key, "%s.%s", conf->subvolumes[i]->type,
- conf->subvolumes[i]->name);
- if (conf->file_layouts && conf->file_layouts[i]){
- sprintf (key, "file_layouts[%d]", i);
- dht_layout_dump(conf->file_layouts[i], key);
- }
- if (conf->dir_layouts && conf->dir_layouts[i]) {
- sprintf (key, "dir_layouts[%d]", i);
- dht_layout_dump(conf->dir_layouts[i], key);
- }
- if (conf->subvolume_status) {
-
- sprintf (key, "subvolume_status[%d]", i);
- gf_proc_dump_write(key, "%d",
- (int)conf->subvolume_status[i]);
- }
-
- }
-
- gf_proc_dump_write("search_unhashed", "%d", conf->search_unhashed);
- gf_proc_dump_write("gen", "%d", conf->gen);
- gf_proc_dump_write("min_free_disk", "%lu", conf->min_free_disk);
- gf_proc_dump_write("min_free_inodes", "%lu", conf->min_free_inodes);
- gf_proc_dump_write("disk_unit", "%c", conf->disk_unit);
- gf_proc_dump_write("refresh_interval", "%d", conf->refresh_interval);
- gf_proc_dump_write("unhashed_sticky_bit", "%d", conf->unhashed_sticky_bit);
- if (conf ->du_stats) {
- gf_proc_dump_write("du_stats.avail_percent", "%lf",
- conf->du_stats->avail_percent);
- gf_proc_dump_write("du_stats.avail_space", "%lu",
- conf->du_stats->avail_space);
- gf_proc_dump_write("du_stats.avail_inodes", "%lf",
- conf->du_stats->avail_inodes);
- gf_proc_dump_write("du_stats.log", "%lu", conf->du_stats->log);
- }
-
- if (conf->last_stat_fetch.tv_sec)
- gf_proc_dump_write("last_stat_fetch", "%s",
- ctime(&conf->last_stat_fetch.tv_sec));
-
- UNLOCK(&conf->subvolume_lock);
-
-out:
- return ret;
-}
-
-int32_t
-dht_inodectx_dump (xlator_t *this, inode_t *inode)
-{
- int ret = -1;
- dht_layout_t *layout = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", inode, out);
-
- ret = dht_inode_ctx_layout_get (inode, this, &layout);
-
- if ((ret != 0) || !layout)
- return ret;
-
- gf_proc_dump_add_section("xlator.cluster.dht.%s.inode", this->name);
- dht_layout_dump(layout, "layout");
-
-out:
- return ret;
-}
-
-int
-notify (xlator_t *this, int event, void *data, ...)
-{
- int ret = -1;
- va_list ap;
- dict_t *output = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
-
-
- if (!data)
- goto out;
-
- va_start (ap, data);
- output = va_arg (ap, dict_t*);
-
- ret = dht_notify (this, event, data, output);
-
-out:
- return ret;
-}
-
-void
-fini (xlator_t *this)
-{
- int i = 0;
- dht_conf_t *conf = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
-
- conf = this->private;
- this->private = NULL;
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf);
- }
-out:
- return;
-}
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
-
- ret = xlator_mem_acct_init (this, gf_dht_mt_end + 1);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
- "failed");
- return ret;
- }
-out:
- return ret;
-}
-
-
-int
-dht_parse_decommissioned_bricks (xlator_t *this, dht_conf_t *conf,
- const char *bricks)
-{
- int i = 0;
- int ret = -1;
- char *tmpstr = NULL;
- char *dup_brick = NULL;
- char *node = NULL;
-
- if (!conf || !bricks)
- goto out;
-
- dup_brick = gf_strdup (bricks);
- node = strtok_r (dup_brick, ",", &tmpstr);
- while (node) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (!strcmp (conf->subvolumes[i]->name, node)) {
- conf->decommissioned_bricks[i] =
- conf->subvolumes[i];
- gf_log (this->name, GF_LOG_INFO,
- "decommissioning subvolume %s",
- conf->subvolumes[i]->name);
- break;
- }
- }
- if (i == conf->subvolume_cnt) {
- /* Wrong node given. */
- goto out;
- }
- node = strtok_r (NULL, ",", &tmpstr);
- }
-
- ret = 0;
- conf->decommission_in_progress = 1;
-out:
- GF_FREE (dup_brick);
-
- return ret;
-}
-
-int
-reconfigure (xlator_t *this, dict_t *options)
-{
- dht_conf_t *conf = NULL;
- char *temp_str = NULL;
- gf_boolean_t search_unhashed;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", options, out);
-
- conf = this->private;
- if (!conf)
- return 0;
-
- if (dict_get_str (options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean*/
- if (strcasecmp (temp_str, "auto")) {
- if (!gf_string2boolean (temp_str, &search_unhashed)) {
- gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
- " lookup-unhashed reconfigured (%s)",
- temp_str);
- conf->search_unhashed = search_unhashed;
- } else {
- gf_log(this->name, GF_LOG_ERROR, "Reconfigure:"
- " lookup-unhashed should be boolean,"
- " not (%s), defaulting to (%d)",
- temp_str, conf->search_unhashed);
- //return -1;
- ret = -1;
- goto out;
- }
- } else {
- gf_log(this->name, GF_LOG_DEBUG, "Reconfigure:"
- " lookup-unhashed reconfigured auto ");
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
- }
-
- GF_OPTION_RECONF ("min-free-disk", conf->min_free_disk, options,
- percent_or_size, out);
- /* option can be any one of percent or bytes */
- conf->disk_unit = 0;
- if (conf->min_free_disk < 100)
- conf->disk_unit = 'p';
-
- GF_OPTION_RECONF ("min-free-inodes", conf->min_free_inodes, options,
- percent, out);
-
- GF_OPTION_RECONF ("directory-layout-spread", conf->dir_spread_cnt,
- options, uint32, out);
-
- GF_OPTION_RECONF ("readdir-optimize", conf->readdir_optimize, options,
- bool, out);
- if (conf->defrag) {
- GF_OPTION_RECONF ("rebalance-stats", conf->defrag->stats,
- options, bool, out);
- }
-
- if (dict_get_str (options, "decommissioned-bricks", &temp_str) == 0) {
- ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
- if (ret == -1)
- goto out;
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-
-int
-init (xlator_t *this)
-{
- dht_conf_t *conf = NULL;
- char *temp_str = NULL;
- int ret = -1;
- int i = 0;
- gf_defrag_info_t *defrag = NULL;
- int cmd = 0;
- char *node_uuid = NULL;
-
-
- GF_VALIDATE_OR_GOTO ("dht", this, err);
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "Distribute needs more than one subvolume");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile");
- }
-
- conf = GF_CALLOC (1, sizeof (*conf), gf_dht_mt_dht_conf_t);
- if (!conf) {
- goto err;
- }
-
- ret = dict_get_int32 (this->options, "rebalance-cmd", &cmd);
-
- if (cmd) {
- defrag = GF_CALLOC (1, sizeof (gf_defrag_info_t),
- gf_defrag_info_mt);
-
- GF_VALIDATE_OR_GOTO (this->name, defrag, err);
-
- LOCK_INIT (&defrag->lock);
-
- defrag->is_exiting = 0;
-
- conf->defrag = defrag;
-
- ret = dict_get_str (this->options, "node-uuid", &node_uuid);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "node-uuid not "
- "specified");
- goto err;
- }
-
- if (uuid_parse (node_uuid, defrag->node_uuid)) {
- gf_log (this->name, GF_LOG_ERROR, "Cannot parse "
- "glusterd node uuid");
- goto err;
- }
-
- defrag->cmd = cmd;
-
- defrag->stats = _gf_false;
- }
-
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
- if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean */
- if (strcasecmp (temp_str, "auto"))
- gf_string2boolean (temp_str, &conf->search_unhashed);
- else
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
-
- GF_OPTION_INIT ("unhashed-sticky-bit", conf->unhashed_sticky_bit, bool,
- err);
-
- GF_OPTION_INIT ("use-readdirp", conf->use_readdirp, bool, err);
-
- GF_OPTION_INIT ("min-free-disk", conf->min_free_disk, percent_or_size,
- err);
-
- GF_OPTION_INIT ("min-free-inodes", conf->min_free_inodes, percent,
- err);
-
- conf->dir_spread_cnt = conf->subvolume_cnt;
- GF_OPTION_INIT ("directory-layout-spread", conf->dir_spread_cnt,
- uint32, err);
-
- GF_OPTION_INIT ("assert-no-child-down", conf->assert_no_child_down,
- bool, err);
-
- GF_OPTION_INIT ("readdir-optimize", conf->readdir_optimize, bool, err);
-
- if (defrag) {
- GF_OPTION_INIT ("rebalance-stats", defrag->stats, bool, err);
- }
-
- /* option can be any one of percent or bytes */
- conf->disk_unit = 0;
- if (conf->min_free_disk < 100)
- conf->disk_unit = 'p';
-
- ret = dht_init_subvolumes (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- if (dict_get_str (this->options, "decommissioned-bricks", &temp_str) == 0) {
- ret = dht_parse_decommissioned_bricks (this, conf, temp_str);
- if (ret == -1)
- goto err;
- }
-
- ret = dht_layouts_init (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- LOCK_INIT (&conf->subvolume_lock);
- LOCK_INIT (&conf->layout_lock);
-
- conf->gen = 1;
-
- this->local_pool = mem_pool_new (dht_local_t, 512);
- if (!this->local_pool) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to create local_t's memory pool");
- goto err;
- }
-
- this->private = conf;
-
- return 0;
-
-err:
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf->du_stats);
-
- GF_FREE (conf->defrag);
-
- GF_FREE (conf);
- }
-
- return -1;
-}
-
+class_methods_t class_methods = {
+ .init = dht_init,
+ .fini = dht_fini,
+ .reconfigure = dht_reconfigure,
+ .notify = dht_notify
+};
struct xlator_fops fops = {
.lookup = dht_lookup,
@@ -547,6 +70,9 @@ struct xlator_fops fops = {
.fxattrop = dht_fxattrop,
.setattr = dht_setattr,
.fsetattr = dht_fsetattr,
+ .fallocate = dht_fallocate,
+ .discard = dht_discard,
+ .zerofill = dht_zerofill,
};
struct xlator_dumpops dumpops = {
@@ -560,59 +86,4 @@ struct xlator_cbks cbks = {
// .releasedir = dht_releasedir,
.forget = dht_forget
};
-
-
-struct volume_options options[] = {
- { .key = {"lookup-unhashed"},
- .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
- "on", "off"},
- .type = GF_OPTION_TYPE_STR,
- .default_value = "on",
- },
- { .key = {"min-free-disk"},
- .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
- .default_value = "10%",
- .description = "Percentage/Size of disk space that must be "
- "kept free."
- },
- { .key = {"min-free-inodes"},
- .type = GF_OPTION_TYPE_PERCENT,
- .default_value = "5%",
- .description = "Percentage inodes that must be "
- "kept free."
- },
- { .key = {"unhashed-sticky-bit"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- },
- { .key = {"use-readdirp"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
- },
- { .key = {"assert-no-child-down"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- },
- { .key = {"directory-layout-spread"},
- .type = GF_OPTION_TYPE_INT,
- },
- { .key = {"decommissioned-bricks"},
- .type = GF_OPTION_TYPE_ANY,
- },
- { .key = {"rebalance-cmd"},
- .type = GF_OPTION_TYPE_INT,
- },
- { .key = {"node-uuid"},
- .type = GF_OPTION_TYPE_STR,
- },
- { .key = {"rebalance-stats"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- },
- { .key = {"readdir-optimize"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- },
-
- { .key = {NULL} },
-};
+;
diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c
index 53c66aa45..e934acdf0 100644
--- a/xlators/cluster/dht/src/nufa.c
+++ b/xlators/cluster/dht/src/nufa.c
@@ -18,6 +18,8 @@
/* TODO: all 'TODO's in dht.c holds good */
+extern struct volume_options options[];
+
int
nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno,
@@ -52,7 +54,8 @@ nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == -1)
goto out;
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
is_dir = check_is_dir (inode, stbuf, xattr);
if (!is_dir && !is_linkfile) {
@@ -201,7 +204,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this,
* revalidates directly go to the cached-subvolume.
*/
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"Failed to set dict value.");
@@ -222,7 +225,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this,
} else {
do_fresh_lookup:
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"Failed to set dict value.");
@@ -231,7 +234,7 @@ nufa_lookup (call_frame_t *frame, xlator_t *this,
}
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht.linkto", 256);
+ conf->link_xattr_name, 256);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
"Failed to set dict value.");
@@ -320,7 +323,8 @@ nufa_create (call_frame_t *frame, xlator_t *this,
if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) {
avail_subvol =
dht_free_disk_available_subvol (this,
- (xlator_t *)conf->private);
+ (xlator_t *)conf->private,
+ local);
}
if (subvol != avail_subvol) {
@@ -330,9 +334,8 @@ nufa_create (call_frame_t *frame, xlator_t *this,
local->flags = flags;
local->umask = umask;
local->cached_subvol = avail_subvol;
- dht_linkfile_create (frame,
- nufa_create_linkfile_create_cbk,
- avail_subvol, subvol, loc);
+ dht_linkfile_create (frame, nufa_create_linkfile_create_cbk,
+ this, avail_subvol, subvol, loc);
return 0;
}
@@ -362,17 +365,22 @@ nufa_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_local_t *local = NULL;
local = frame->local;
+ if (!local || !local->cached_subvol) {
+ op_errno = EINVAL;
+ op_ret = -1;
+ goto err;
+ }
if (op_ret >= 0) {
- STACK_WIND (frame, dht_newfile_cbk,
- local->cached_subvol,
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk,
+ (void *)local->cached_subvol, local->cached_subvol,
local->cached_subvol->fops->mknod,
&local->loc, local->mode, local->rdev,
local->umask, local->params);
return 0;
}
-
+err:
WIPE (postparent);
WIPE (preparent);
@@ -420,7 +428,8 @@ nufa_mknod (call_frame_t *frame, xlator_t *this,
if (dht_is_subvol_filled (this, (xlator_t *)conf->private)) {
avail_subvol =
dht_free_disk_available_subvol (this,
- (xlator_t *)conf->private);
+ (xlator_t *)conf->private,
+ local);
}
if (avail_subvol != subvol) {
@@ -432,7 +441,7 @@ nufa_mknod (call_frame_t *frame, xlator_t *this,
local->rdev = rdev;
local->cached_subvol = avail_subvol;
- dht_linkfile_create (frame, nufa_mknod_linkfile_cbk,
+ dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, this,
avail_subvol, subvol, loc);
return 0;
}
@@ -440,9 +449,9 @@ nufa_mknod (call_frame_t *frame, xlator_t *this,
gf_log (this->name, GF_LOG_TRACE,
"creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->mknod,
- loc, mode, rdev, umask, params);
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol,
+ subvol->fops->mknod, loc, mode, rdev, umask,
+ params);
return 0;
@@ -455,189 +464,170 @@ err:
}
-int
-notify (xlator_t *this, int event, void *data, ...)
+gf_boolean_t
+same_first_part (char *str1, char term1, char *str2, char term2)
{
- int ret = -1;
-
- ret = dht_notify (this, event, data);
-
- return ret;
+ gf_boolean_t ended1;
+ gf_boolean_t ended2;
+
+ for (;;) {
+ ended1 = ((*str1 == '\0') || (*str1 == term1));
+ ended2 = ((*str2 == '\0') || (*str2 == term2));
+ if (ended1 && ended2) {
+ return _gf_true;
+ }
+ if (ended1 || ended2 || (*str1 != *str2)) {
+ return _gf_false;
+ }
+ ++str1;
+ ++str2;
+ }
}
-void
-fini (xlator_t *this)
-{
- int i = 0;
- dht_conf_t *conf = NULL;
+typedef struct nufa_args {
+ xlator_t *this;
+ char *volname;
+ gf_boolean_t addr_match;
+} nufa_args_t;
- conf = this->private;
+static void
+nufa_find_local_brick (xlator_t *xl, void *data)
+{
+ nufa_args_t *args = data;
+ xlator_t *this = args->this;
+ char *local_volname = args->volname;
+ gf_boolean_t addr_match = args->addr_match;
+ char *brick_host = NULL;
+ dht_conf_t *conf = this->private;
+ int ret = -1;
+
+ /*This means a local subvol was already found. We pick the first brick
+ * that is local*/
+ if (conf->private)
+ return;
+
+ if (strcmp (xl->name, local_volname) == 0) {
+ conf->private = xl;
+ gf_log (this->name, GF_LOG_INFO, "Using specified subvol %s",
+ local_volname);
+ return;
+ }
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
+ if (!addr_match)
+ return;
- GF_FREE (conf->subvolumes);
+ ret = dict_get_str (xl->options, "remote-host", &brick_host);
+ if ((ret == 0) &&
+ (gf_is_same_address (local_volname, brick_host) ||
+ gf_is_local_addr (brick_host))) {
+ conf->private = xl;
+ gf_log (this->name, GF_LOG_INFO, "Using the first local "
+ "subvol %s", xl->name);
+ return;
+ }
- GF_FREE (conf->subvolume_status);
+}
- GF_FREE (conf);
- }
+static void
+nufa_to_dht (xlator_t *this)
+{
+ GF_ASSERT (this);
+ GF_ASSERT (this->fops);
- return;
+ this->fops->lookup = dht_lookup;
+ this->fops->create = dht_create;
+ this->fops->mknod = dht_mknod;
}
int
-init (xlator_t *this)
+nufa_find_local_subvol (xlator_t *this,
+ void (*fn) (xlator_t *each, void* data), void *data)
{
- dht_conf_t *conf = NULL;
- xlator_list_t *trav = NULL;
- data_t *data = NULL;
- char *local_volname = NULL;
- char *temp_str = NULL;
- int ret = -1;
- int i = 0;
- char my_hostname[256];
- uint32_t temp_free_disk = 0;
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "NUFA needs more than one subvolume");
+ int ret = -1;
+ dht_conf_t *conf = this->private;
+ xlator_list_t *trav = NULL;
+ xlator_t *parent = NULL;
+ xlator_t *candidate = NULL;
+
+ xlator_foreach_depth_first (this, fn, data);
+ if (!conf->private) {
+ gf_log (this->name, GF_LOG_ERROR, "Couldn't find a local "
+ "brick");
return -1;
}
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile");
- }
-
- conf = GF_CALLOC (1, sizeof (*conf),
- gf_dht_mt_dht_conf_t);
- if (!conf) {
- goto err;
- }
-
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
- if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean */
- if (strcasecmp (temp_str, "auto"))
- gf_string2boolean (temp_str, &conf->search_unhashed);
- else
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
+ candidate = conf->private;
+ trav = candidate->parents;
+ while (trav) {
- ret = dht_init_subvolumes (this, conf);
- if (ret == -1) {
- goto err;
- }
+ parent = trav->xlator;
+ if (strcmp (parent->type, "cluster/nufa") == 0) {
+ gf_log (this->name, GF_LOG_INFO, "Found local subvol, "
+ "%s", candidate->name);
+ ret = 0;
+ conf->private = candidate;
+ break;
+ }
- ret = dht_layouts_init (this, conf);
- if (ret == -1) {
- goto err;
+ candidate = parent;
+ trav = parent->parents;
}
- LOCK_INIT (&conf->subvolume_lock);
- LOCK_INIT (&conf->layout_lock);
+ return ret;
+}
- conf->gen = 1;
+int
+nufa_init (xlator_t *this)
+{
+ data_t *data = NULL;
+ char *local_volname = NULL;
+ int ret = -1;
+ char my_hostname[256];
+ gf_boolean_t addr_match = _gf_false;
+ nufa_args_t args = {0, };
- local_volname = "localhost";
- ret = gethostname (my_hostname, 256);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "could not find hostname (%s)",
- strerror (errno));
+ ret = dht_init(this);
+ if (ret) {
+ return ret;
}
- if (ret == 0)
- local_volname = my_hostname;
-
- data = dict_get (this->options, "local-volume-name");
- if (data) {
+ if ((data = dict_get (this->options, "local-volume-name"))) {
local_volname = data->data;
- }
- trav = this->children;
- while (trav) {
- if (strcmp (trav->xlator->name, local_volname) == 0)
- break;
- trav = trav->next;
- }
+ } else {
+ addr_match = _gf_true;
+ local_volname = "localhost";
+ ret = gethostname (my_hostname, 256);
+ if (ret == 0)
+ local_volname = my_hostname;
- if (!trav) {
- gf_log (this->name, GF_LOG_ERROR,
- "Could not find subvolume named '%s'. "
- "Please define volume with the name as the hostname "
- "or override it with 'option local-volume-name'",
- local_volname);
- goto err;
- }
- /* The volume specified exists */
- conf->private = trav->xlator;
-
- conf->min_free_disk = 10;
- conf->disk_unit = 'p';
-
- if (dict_get_str (this->options, "min-free-disk",
- &temp_str) == 0) {
- if (gf_string2percent (temp_str,
- &temp_free_disk) == 0) {
- if (temp_free_disk > 100) {
- gf_string2bytesize (temp_str,
- &conf->min_free_disk);
- conf->disk_unit = 'b';
- } else {
- conf->min_free_disk = (uint64_t)temp_free_disk;
- conf->disk_unit = 'p';
- }
- } else {
- gf_string2bytesize (temp_str,
- &conf->min_free_disk);
- conf->disk_unit = 'b';
- }
- }
+ else
+ gf_log (this->name, GF_LOG_WARNING,
+ "could not find hostname (%s)",
+ strerror (errno));
- conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t),
- gf_dht_mt_dht_du_t);
- if (!conf->du_stats) {
- goto err;
}
- this->local_pool = mem_pool_new (dht_local_t, 128);
- if (!this->local_pool) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to create local_t's memory pool");
- goto err;
+ args.this = this;
+ args.volname = local_volname;
+ args.addr_match = addr_match;
+ ret = nufa_find_local_subvol (this, nufa_find_local_brick, &args);
+ if (ret) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Unable to find local subvolume, switching "
+ "to dht mode");
+ nufa_to_dht (this);
}
-
- this->private = conf;
-
return 0;
+}
-err:
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf->du_stats);
- GF_FREE (conf);
- }
-
- return -1;
-}
+class_methods_t class_methods = {
+ .init = nufa_init,
+ .fini = dht_fini,
+ .reconfigure = dht_reconfigure,
+ .notify = dht_notify
+};
struct xlator_fops fops = {
@@ -684,19 +674,3 @@ struct xlator_fops fops = {
struct xlator_cbks cbks = {
.forget = dht_forget
};
-
-
-struct volume_options options[] = {
- { .key = {"lookup-unhashed"},
- .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
- "on", "off"},
- .type = GF_OPTION_TYPE_STR
- },
- { .key = {"local-volume-name"},
- .type = GF_OPTION_TYPE_XLATOR
- },
- { .key = {"min-free-disk"},
- .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
- },
- { .key = {NULL} },
-};
diff --git a/xlators/cluster/dht/src/switch.c b/xlators/cluster/dht/src/switch.c
index 0542d7f9a..2717ce975 100644
--- a/xlators/cluster/dht/src/switch.c
+++ b/xlators/cluster/dht/src/switch.c
@@ -22,6 +22,8 @@
#include <fnmatch.h>
#include <string.h>
+extern struct volume_options options[];
+
struct switch_sched_array {
xlator_t *xl;
int32_t eligible;
@@ -135,7 +137,8 @@ switch_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == -1)
goto out;
- is_linkfile = check_is_linkfile (inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile (inode, stbuf, xattr,
+ conf->link_xattr_name);
is_dir = check_is_dir (inode, stbuf, xattr);
if (!is_dir && !is_linkfile) {
@@ -289,11 +292,11 @@ switch_lookup (call_frame_t *frame, xlator_t *this,
* attribute, revalidates directly go to the cached-subvolume.
*/
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (ret < 0)
gf_log (this->name, GF_LOG_WARNING,
- "failed to set dict value for "
- "trusted.glusterfs.dht");
+ "failed to set dict value for %s",
+ conf->xattr_name);
for (i = 0; i < layout->cnt; i++) {
subvol = layout->list[i].xlator;
@@ -308,18 +311,18 @@ switch_lookup (call_frame_t *frame, xlator_t *this,
} else {
do_fresh_lookup:
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht", 4 * 4);
+ conf->xattr_name, 4 * 4);
if (ret < 0)
gf_log (this->name, GF_LOG_WARNING,
- "failed to set dict value for "
- "trusted.glusterfs.dht");
+ "failed to set dict value for %s",
+ conf->xattr_name);
ret = dict_set_uint32 (local->xattr_req,
- "trusted.glusterfs.dht.linkto", 256);
+ conf->link_xattr_name, 256);
if (ret < 0)
gf_log (this->name, GF_LOG_WARNING,
- "failed to set dict value for "
- "trusted.glusterfs.dht.linkto");
+ "failed to set dict value for %s",
+ conf->link_xattr_name);
if (!hashed_subvol) {
gf_log (this->name, GF_LOG_DEBUG,
@@ -434,7 +437,8 @@ switch_create (call_frame_t *frame, xlator_t *this,
avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol);
if (dht_is_subvol_filled (this, avail_subvol)) {
avail_subvol =
- dht_free_disk_available_subvol (this, avail_subvol);
+ dht_free_disk_available_subvol (this, avail_subvol,
+ local);
}
if (subvol != avail_subvol) {
@@ -443,9 +447,8 @@ switch_create (call_frame_t *frame, xlator_t *this,
local->flags = flags;
local->umask = umask;
local->cached_subvol = avail_subvol;
- dht_linkfile_create (frame,
- switch_create_linkfile_create_cbk,
- avail_subvol, subvol, loc);
+ dht_linkfile_create (frame, switch_create_linkfile_create_cbk,
+ this, avail_subvol, subvol, loc);
return 0;
}
@@ -475,17 +478,22 @@ switch_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dht_local_t *local = NULL;
local = frame->local;
+ if (!local || !local->cached_subvol) {
+ op_errno = EINVAL;
+ op_ret = -1;
+ goto err;
+ }
if (op_ret >= 0) {
- STACK_WIND (frame, dht_newfile_cbk,
- local->cached_subvol,
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk,
+ (void *)local->cached_subvol, local->cached_subvol,
local->cached_subvol->fops->mknod,
&local->loc, local->mode, local->rdev,
local->umask, local->params);
return 0;
}
-
+err:
DHT_STACK_UNWIND (link, frame, op_ret, op_errno,
inode, stbuf, preparent, postparent, xdata);
return 0;
@@ -529,7 +537,8 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
avail_subvol = get_switch_matching_subvol (loc->path, conf, subvol);
if (dht_is_subvol_filled (this, avail_subvol)) {
avail_subvol =
- dht_free_disk_available_subvol (this, avail_subvol);
+ dht_free_disk_available_subvol (this, avail_subvol,
+ local);
}
if (avail_subvol != subvol) {
@@ -542,16 +551,16 @@ switch_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
local->cached_subvol = avail_subvol;
dht_linkfile_create (frame, switch_mknod_linkfile_cbk,
- avail_subvol, subvol, loc);
+ this, avail_subvol, subvol, loc);
return 0;
}
gf_log (this->name, GF_LOG_TRACE,
"creating %s on %s", loc->path, subvol->name);
- STACK_WIND (frame, dht_newfile_cbk,
- subvol, subvol->fops->mknod,
- loc, mode, rdev, umask, params);
+ STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol,
+ subvol->fops->mknod, loc, mode, rdev, umask,
+ params);
return 0;
@@ -564,20 +573,9 @@ err:
}
-int
-notify (xlator_t *this, int event, void *data, ...)
-{
- int ret = -1;
-
- ret = dht_notify (this, event, data);
-
- return ret;
-}
-
void
-fini (xlator_t *this)
+switch_fini (xlator_t *this)
{
- int i = 0;
dht_conf_t *conf = NULL;
struct switch_struct *trav = NULL;
struct switch_struct *prev = NULL;
@@ -593,22 +591,9 @@ fini (xlator_t *this)
trav = trav->next;
GF_FREE (prev);
}
-
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf);
}
- return;
+ dht_fini(this);
}
int
@@ -685,6 +670,7 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf,
GF_FREE (dup_str);
continue;
}
+ GF_FREE (dup_str);
memcpy (switch_opt->path_pattern, pattern, strlen (pattern));
if (childs) {
dup_childs = gf_strdup (childs);
@@ -741,7 +727,6 @@ set_switch_pattern (xlator_t *this, dht_conf_t *conf,
"option in unify volume. Exiting");
goto err;
}
- GF_FREE (dup_str);
/* Link it to the main structure */
if (switch_buf) {
@@ -834,68 +819,18 @@ err:
}
-int
-init (xlator_t *this)
+int32_t
+switch_init (xlator_t *this)
{
dht_conf_t *conf = NULL;
data_t *data = NULL;
- char *temp_str = NULL;
int ret = -1;
- int i = 0;
- uint32_t temp_free_disk = 0;
-
- if (!this->children) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "SWITCH needs more than one subvolume");
- return -1;
- }
-
- if (!this->parents) {
- gf_log (this->name, GF_LOG_WARNING,
- "dangling volume. check volfile");
- }
- conf = GF_CALLOC (1, sizeof (*conf), gf_switch_mt_dht_conf_t);
- if (!conf) {
- goto err;
- }
-
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_ON;
- if (dict_get_str (this->options, "lookup-unhashed", &temp_str) == 0) {
- /* If option is not "auto", other options _should_ be boolean */
- if (strcasecmp (temp_str, "auto"))
- gf_string2boolean (temp_str, &conf->search_unhashed);
- else
- conf->search_unhashed = GF_DHT_LOOKUP_UNHASHED_AUTO;
- }
-
- conf->unhashed_sticky_bit = 0;
- if (dict_get_str (this->options, "unhashed-sticky-bit",
- &temp_str) == 0) {
- gf_string2boolean (temp_str, &conf->unhashed_sticky_bit);
- }
-
- conf->min_free_disk = 10;
- conf->disk_unit = 'p';
-
- if (dict_get_str (this->options, "min-free-disk",
- &temp_str) == 0) {
- if (gf_string2percent (temp_str,
- &temp_free_disk) == 0) {
- if (temp_free_disk > 100) {
- gf_string2bytesize (temp_str,
- &conf->min_free_disk);
- conf->disk_unit = 'b';
- } else {
- conf->min_free_disk = (uint64_t)temp_free_disk;
- conf->disk_unit = 'p';
- }
- } else {
- gf_string2bytesize (temp_str,
- &conf->min_free_disk);
- conf->disk_unit = 'b';
- }
+ ret = dht_init(this);
+ if (ret) {
+ return ret;
}
+ conf = this->private;
data = dict_get (this->options, "pattern.switch.case");
if (data) {
@@ -906,60 +841,23 @@ init (xlator_t *this)
}
}
- ret = dht_init_subvolumes (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- ret = dht_layouts_init (this, conf);
- if (ret == -1) {
- goto err;
- }
-
- LOCK_INIT (&conf->subvolume_lock);
- LOCK_INIT (&conf->layout_lock);
-
- conf->gen = 1;
-
- conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t),
- gf_switch_mt_dht_du_t);
- if (!conf->du_stats) {
- goto err;
- }
-
- this->local_pool = mem_pool_new (dht_local_t, 128);
- if (!this->local_pool) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to create local_t's memory pool");
- goto err;
- }
-
this->private = conf;
-
return 0;
err:
- if (conf) {
- if (conf->file_layouts) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- GF_FREE (conf->file_layouts[i]);
- }
- GF_FREE (conf->file_layouts);
- }
-
- GF_FREE (conf->subvolumes);
-
- GF_FREE (conf->subvolume_status);
-
- GF_FREE (conf->du_stats);
-
- GF_FREE (conf);
- }
-
+ dht_fini(this);
return -1;
}
+class_methods_t class_methods = {
+ .init = switch_init,
+ .fini = switch_fini,
+ .reconfigure = dht_reconfigure,
+ .notify = dht_notify
+};
+
+
struct xlator_fops fops = {
.lookup = switch_lookup,
.create = switch_create,
@@ -1004,19 +902,3 @@ struct xlator_fops fops = {
struct xlator_cbks cbks = {
.forget = dht_forget
};
-
-
-struct volume_options options[] = {
- { .key = {"lookup-unhashed"},
- .value = {"auto", "yes", "no", "enable", "disable", "1", "0",
- "on", "off"},
- .type = GF_OPTION_TYPE_STR
- },
- { .key = {"pattern.switch.case"},
- .type = GF_OPTION_TYPE_ANY
- },
- { .key = {"min-free-disk"},
- .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
- },
- { .key = {NULL} },
-};
diff --git a/xlators/cluster/dht/src/unittest/dht_layout_mock.c b/xlators/cluster/dht/src/unittest/dht_layout_mock.c
new file mode 100644
index 000000000..aa19ddc57
--- /dev/null
+++ b/xlators/cluster/dht/src/unittest/dht_layout_mock.c
@@ -0,0 +1,63 @@
+/*
+ Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+#include "byte-order.h"
+
+int
+dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p)
+{
+ return 0;
+}
+
+int
+dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, dht_layout_t **layout)
+{
+ return 0;
+}
+
+int
+dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this,
+ dht_layout_t *layout_int)
+{
+ return 0;
+}
+
+int
+dict_get_ptr (dict_t *this, char *key, void **ptr)
+{
+ return 0;
+}
+
+int
+dict_get_ptr_and_len (dict_t *this, char *key, void **ptr, int *len)
+{
+ return 0;
+}
+
+int _gf_log (const char *domain, const char *file,
+ const char *function, int32_t line, gf_loglevel_t level,
+ const char *fmt, ...)
+{
+ return 0;
+}
+
+int _gf_log_callingfn (const char *domain, const char *file,
+ const char *function, int32_t line, gf_loglevel_t level,
+ const char *fmt, ...)
+{
+ return 0;
+}
diff --git a/xlators/cluster/dht/src/unittest/dht_layout_unittest.c b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c
new file mode 100644
index 000000000..b5233d235
--- /dev/null
+++ b/xlators/cluster/dht/src/unittest/dht_layout_unittest.c
@@ -0,0 +1,124 @@
+/*
+ Copyright (c) 2008-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "dht-common.h"
+#include "logging.h"
+#include "xlator.h"
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include <inttypes.h>
+#include <cmockery/pbc.h>
+#include <cmockery/cmockery.h>
+
+/*
+ * Helper functions
+ */
+
+static xlator_t *
+helper_xlator_init(uint32_t num_types)
+{
+ xlator_t *xl;
+ int i, ret;
+
+ REQUIRE(num_types > 0);
+
+ xl = test_calloc(1, sizeof(xlator_t));
+ assert_non_null(xl);
+ xl->mem_acct.num_types = num_types;
+ xl->mem_acct.rec = test_calloc(num_types, sizeof(struct mem_acct_rec));
+ assert_non_null(xl->mem_acct.rec);
+
+ xl->ctx = test_calloc(1, sizeof(glusterfs_ctx_t));
+ assert_non_null(xl->ctx);
+
+ for (i = 0; i < num_types; i++) {
+ ret = LOCK_INIT(&(xl->mem_acct.rec[i].lock));
+ assert_false(ret);
+ }
+
+ ENSURE(num_types == xl->mem_acct.num_types);
+ ENSURE(NULL != xl);
+
+ return xl;
+}
+
+static int
+helper_xlator_destroy(xlator_t *xl)
+{
+ int i, ret;
+
+ for (i = 0; i < xl->mem_acct.num_types; i++) {
+ ret = LOCK_DESTROY(&(xl->mem_acct.rec[i].lock));
+ assert_int_equal(ret, 0);
+ }
+
+ free(xl->mem_acct.rec);
+ free(xl->ctx);
+ free(xl);
+ return 0;
+}
+
+/*
+ * Unit tests
+ */
+static void
+test_dht_layout_new(void **state)
+{
+ xlator_t *xl;
+ dht_layout_t *layout;
+ dht_conf_t *conf;
+ int cnt;
+
+ expect_assert_failure(dht_layout_new(NULL, 0));
+ expect_assert_failure(dht_layout_new((xlator_t *)0x12345, -1));
+ xl = helper_xlator_init(10);
+
+ // xl->private is NULL
+ assert_null(xl->private);
+ cnt = 100;
+ layout = dht_layout_new(xl, cnt);
+ assert_non_null(layout);
+ assert_int_equal(layout->type, DHT_HASH_TYPE_DM);
+ assert_int_equal(layout->cnt, cnt);
+ assert_int_equal(layout->ref, 1);
+ assert_int_equal(layout->gen, 0);
+ assert_int_equal(layout->spread_cnt, 0);
+ free(layout);
+
+ // xl->private is not NULL
+ cnt = 110;
+ conf = (dht_conf_t *)test_calloc(1, sizeof(dht_conf_t));
+ assert_non_null(conf);
+ conf->dir_spread_cnt = 12345;
+ conf->gen = -123;
+ xl->private = conf;
+
+ layout = dht_layout_new(xl, cnt);
+ assert_non_null(layout);
+ assert_int_equal(layout->type, DHT_HASH_TYPE_DM);
+ assert_int_equal(layout->cnt, cnt);
+ assert_int_equal(layout->ref, 1);
+ assert_int_equal(layout->gen, conf->gen);
+ assert_int_equal(layout->spread_cnt, conf->dir_spread_cnt);
+ free(layout);
+
+ free(conf);
+ helper_xlator_destroy(xl);
+}
+
+int main(void) {
+ const UnitTest tests[] = {
+ unit_test(test_dht_layout_new),
+ };
+
+ return run_tests(tests, "xlator_dht_layout");
+}
diff --git a/xlators/cluster/ha/src/Makefile.am b/xlators/cluster/ha/src/Makefile.am
index c5da56ff1..5c1364b7f 100644
--- a/xlators/cluster/ha/src/Makefile.am
+++ b/xlators/cluster/ha/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = ha.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster
-ha_la_LDFLAGS = -module -avoidversion
+ha_la_LDFLAGS = -module -avoid-version
ha_la_SOURCES = ha-helpers.c ha.c
ha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/cluster/map/src/Makefile.am b/xlators/cluster/map/src/Makefile.am
index a51c13573..a278b05e2 100644
--- a/xlators/cluster/map/src/Makefile.am
+++ b/xlators/cluster/map/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = map.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/cluster
-map_la_LDFLAGS = -module -avoidversion
+map_la_LDFLAGS = -module -avoid-version
map_la_SOURCES = map.c map-helper.c
map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/cluster/nsr-client/Makefile.am b/xlators/cluster/nsr-client/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/nsr-client/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/nsr-client/src/Makefile.am b/xlators/cluster/nsr-client/src/Makefile.am
new file mode 100644
index 000000000..4541ea01a
--- /dev/null
+++ b/xlators/cluster/nsr-client/src/Makefile.am
@@ -0,0 +1,33 @@
+noinst_PYTHON = gen-fops.py
+
+xlator_LTLIBRARIES = nsrc.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+nsrc_la_LDFLAGS = -module -avoid-version
+nsrc_la_SOURCES = nsrc.c
+
+nsrc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = fop-template.c \
+ $(top_srcdir)/xlators/lib/src/libxlator.h \
+ $(top_srcdir)/glusterfsd/src/glusterfsd.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
+ -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h
+
+CLEANFILES = nsrc-cg.c
+
+CODEGEN_DIR = ../../nsr-server/src/codegen.py
+
+nsrc-cg.c: gen-fops.py $(CODEGEN) $(XLATOR_HEADER) fop-template.c
+ $(PYTHON) ./gen-fops.py $(XLATOR_HEADER) fop-template.c > $@
+
+nsrc.lo: nsrc-cg.c
+
+uninstall-local:
+ rm -f $(DESTDIR)$(xlatordir)/nsr.so
diff --git a/xlators/cluster/nsr-client/src/fop-template.c b/xlators/cluster/nsr-client/src/fop-template.c
new file mode 100644
index 000000000..699b07d40
--- /dev/null
+++ b/xlators/cluster/nsr-client/src/fop-template.c
@@ -0,0 +1,113 @@
+// template-name fop
+$TYPE$
+nsrc_$NAME$ (call_frame_t *frame, xlator_t *this,
+ $ARGS_LONG$)
+{
+ nsrc_local_t *local = NULL;
+ xlator_t *target_xl = ACTIVE_CHILD(this);
+
+ local = mem_get(this->local_pool);
+ if (!local) {
+ goto err;
+ }
+
+ local->stub = fop_$NAME$_stub (frame, nsrc_$NAME$_continue,
+ $ARGS_SHORT$);
+ if (!local->stub) {
+ goto err;
+ }
+ local->curr_xl = target_xl;
+ local->scars = 0;
+
+ frame->local = local;
+ STACK_WIND_COOKIE (frame, nsrc_$NAME$_cbk, target_xl,
+ target_xl, target_xl->fops->$NAME$,
+ $ARGS_SHORT$);
+ return 0;
+
+err:
+ if (local) {
+ mem_put(local);
+ }
+ STACK_UNWIND_STRICT ($NAME$, frame, -1, ENOMEM,
+ $DEFAULTS$);
+ return 0;
+}
+
+// template-name cbk
+$TYPE$
+nsrc_$NAME$_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ $ARGS_LONG$)
+{
+ nsrc_local_t *local = frame->local;
+ xlator_t *last_xl = cookie;
+ xlator_t *next_xl;
+ nsrc_private_t *priv = this->private;
+ struct timespec spec;
+
+ if (op_ret != (-1)) {
+ if (local->scars) {
+ gf_log (this->name, GF_LOG_INFO,
+ HILITE("retried %p OK"), frame->local);
+ }
+ priv->active = last_xl;
+ goto unwind;
+ }
+ if ((op_errno != EREMOTE) && (op_errno != ENOTCONN)) {
+ goto unwind;
+ }
+
+ /* TBD: get leader ID from xdata? */
+ next_xl = next_xlator(this,last_xl);
+ /*
+ * We can't just give up after we've tried all bricks, because it's
+ * quite likely that a new leader election just hasn't finished yet.
+ * We also shouldn't retry endlessly, and especially not at a high
+ * rate, but that's good enough while we work on other things.
+ *
+ * TBD: implement slow/finite retry via a worker thread
+ */
+ if (!next_xl || (local->scars >= SCAR_LIMIT)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ HILITE("ran out of retries for %p"), frame->local);
+ goto unwind;
+ }
+
+ local->curr_xl = next_xl;
+ local->scars += 1;
+ spec.tv_sec = 1;
+ spec.tv_nsec = 0;
+ /*
+ * WARNING
+ *
+ * Just calling gf_timer_call_after like this leaves open the
+ * possibility that writes will get reordered, if a first write is
+ * rescheduled and then a second comes along to find an updated
+ * priv->active before the first actually executes. We might need to
+ * implement a stricter (and more complicated) queuing mechanism to
+ * ensure absolute consistency in this case.
+ */
+ if (gf_timer_call_after(this->ctx,spec,nsrc_retry_cb,local)) {
+ return 0;
+ }
+
+unwind:
+ call_stub_destroy(local->stub);
+ STACK_UNWIND_STRICT ($NAME$, frame, op_ret, op_errno,
+ $ARGS_SHORT$);
+ return 0;
+}
+
+// template-name cont-func
+$TYPE$
+nsrc_$NAME$_continue (call_frame_t *frame, xlator_t *this,
+ $ARGS_LONG$)
+{
+ nsrc_local_t *local = frame->local;
+
+ STACK_WIND_COOKIE (frame, nsrc_$NAME$_cbk, local->curr_xl,
+ local->curr_xl, local->curr_xl->fops->$NAME$,
+ $ARGS_SHORT$);
+ return 0;
+}
diff --git a/xlators/cluster/nsr-client/src/gen-fops.py b/xlators/cluster/nsr-client/src/gen-fops.py
new file mode 100755
index 000000000..b07b3c5b1
--- /dev/null
+++ b/xlators/cluster/nsr-client/src/gen-fops.py
@@ -0,0 +1,57 @@
+#!/usr/bin/python
+
+# This script generates the boilerplate versions of most fops in the client,
+# mostly so that we can use STACK_WIND instead of STACK_WIND_TAIL (see
+# fop-template.c for the details). The problem we're solving is that we sit
+# under DHT, which makes assumptions about getting callbacks only from its
+# direct children. If we didn't define our own versions of these fops, the
+# default versions would use STACK_WIND_TAIL and the callbacks would come from
+# DHT's grandchildren. The code-generation approach allows us to handle this
+# with a minimum of code, and also keep up with any changes to the fop table.
+
+import sys
+sys.path.append("../../nsr-server/src") # Blech.
+import codegen
+
+type_re = "([a-z_0-9]+)"
+name_re = "\(\*fop_([a-z0-9]+)_t\)"
+full_re = type_re + " *" + name_re
+fop_cg = codegen.CodeGenerator()
+fop_cg.skip = 2
+fop_cg.parse_decls(sys.argv[1],full_re)
+fop_cg.load_templates(sys.argv[2])
+
+# Use the multi-template feature to generate multiple callbacks from the same
+# parsed declarations.
+type_re = "([a-z_0-9]+)"
+name_re = "\(\*fop_([a-z0-9]+)_cbk_t\)"
+full_re = type_re + " *" + name_re
+cbk_cg = codegen.CodeGenerator()
+cbk_cg.skip = 5
+cbk_cg.parse_decls(sys.argv[1],full_re)
+cbk_cg.load_templates(sys.argv[2])
+
+# This is a nasty little trick to handle the case where a generated fop needs
+# a set of default arguments for the corresponding callback.
+#
+# Yes, it's ironic that I'm copying and pasting the generator code.
+fop_cg.make_defaults = cbk_cg.make_defaults
+
+# Sorry, getspec, you're not a real fop until someone writes a stub function
+# for you.
+del fop_cg.decls["getspec"]
+del cbk_cg.decls["getspec"]
+
+# cbk is used by both fop and continue, so emit first
+for f_name in cbk_cg.decls.keys():
+ cbk_cg.emit(f_name,"cbk")
+ print("")
+
+# continue is used by fop, so emit next
+for f_name in fop_cg.decls.keys():
+ fop_cg.emit(f_name,"cont-func")
+ print("")
+
+for f_name in fop_cg.decls.keys():
+ fop_cg.emit(f_name,"fop")
+ print("")
diff --git a/xlators/cluster/nsr-client/src/nsrc.c b/xlators/cluster/nsr-client/src/nsrc.c
new file mode 100644
index 000000000..4551a1432
--- /dev/null
+++ b/xlators/cluster/nsr-client/src/nsrc.c
@@ -0,0 +1,243 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "timer.h"
+#include "xlator.h"
+
+#define SCAR_LIMIT 20
+#define HILITE(x) (""x"")
+
+/*
+ * The fops are actually generated by gen-fops.py; the rest was mostly copied
+ * from defaults.c (commit cd253754 on 27 August 2013).
+ */
+
+enum gf_dht_mem_types_ {
+ gf_mt_nsrc_private_t = gf_common_mt_end + 1,
+ gf_mt_nsrc_end
+};
+
+typedef struct {
+ xlator_t *active;
+} nsrc_private_t;
+
+typedef struct {
+ call_stub_t *stub;
+ xlator_t *curr_xl;
+ uint16_t scars;
+} nsrc_local_t;
+
+char *NSRC_XATTR = "user.nsr.active";
+
+static inline
+xlator_t *
+ACTIVE_CHILD (xlator_t *parent)
+{
+ nsrc_private_t *priv = parent->private;
+
+ return priv ? priv->active : FIRST_CHILD(parent);
+}
+
+xlator_t *
+next_xlator (xlator_t *this, xlator_t *prev)
+{
+ xlator_list_t *trav;
+
+ for (trav = this->children; trav; trav = trav->next) {
+ if (trav->xlator == prev) {
+ return trav->next ? trav->next->xlator
+ : this->children->xlator;
+ }
+ }
+
+ return NULL;
+}
+
+void
+nsrc_retry_cb (void *cb_arg)
+{
+ nsrc_local_t *local = cb_arg;
+
+ gf_log (__func__, GF_LOG_INFO, HILITE("retrying %p"), local);
+ call_resume_wind(local->stub);
+}
+
+#include "nsrc-cg.c"
+
+int32_t
+nsrc_forget (xlator_t *this, inode_t *inode)
+{
+ gf_log_callingfn (this->name, GF_LOG_WARNING, "xlator does not "
+ "implement forget_cbk");
+ return 0;
+}
+
+
+int32_t
+nsrc_releasedir (xlator_t *this, fd_t *fd)
+{
+ gf_log_callingfn (this->name, GF_LOG_WARNING, "xlator does not "
+ "implement releasedir_cbk");
+ return 0;
+}
+
+int32_t
+nsrc_release (xlator_t *this, fd_t *fd)
+{
+ gf_log_callingfn (this->name, GF_LOG_WARNING, "xlator does not "
+ "implement release_cbk");
+ return 0;
+}
+
+struct xlator_fops fops = {
+ .lookup = nsrc_lookup,
+ .stat = nsrc_stat,
+ .fstat = nsrc_fstat,
+ .truncate = nsrc_truncate,
+ .ftruncate = nsrc_ftruncate,
+ .access = nsrc_access,
+ .readlink = nsrc_readlink,
+ .mknod = nsrc_mknod,
+ .mkdir = nsrc_mkdir,
+ .unlink = nsrc_unlink,
+ .rmdir = nsrc_rmdir,
+ .symlink = nsrc_symlink,
+ .rename = nsrc_rename,
+ .link = nsrc_link,
+ .create = nsrc_create,
+ .open = nsrc_open,
+ .readv = nsrc_readv,
+ .writev = nsrc_writev,
+ .flush = nsrc_flush,
+ .fsync = nsrc_fsync,
+ .opendir = nsrc_opendir,
+ .readdir = nsrc_readdir,
+ .readdirp = nsrc_readdirp,
+ .fsyncdir = nsrc_fsyncdir,
+ .statfs = nsrc_statfs,
+ .setxattr = nsrc_setxattr,
+ .getxattr = nsrc_getxattr,
+ .fsetxattr = nsrc_fsetxattr,
+ .fgetxattr = nsrc_fgetxattr,
+ .removexattr = nsrc_removexattr,
+ .fremovexattr = nsrc_fremovexattr,
+ .lk = nsrc_lk,
+ .inodelk = nsrc_inodelk,
+ .finodelk = nsrc_finodelk,
+ .entrylk = nsrc_entrylk,
+ .fentrylk = nsrc_fentrylk,
+ .rchecksum = nsrc_rchecksum,
+ .xattrop = nsrc_xattrop,
+ .fxattrop = nsrc_fxattrop,
+ .setattr = nsrc_setattr,
+ .fsetattr = nsrc_fsetattr,
+ .fallocate = nsrc_fallocate,
+ .discard = nsrc_discard,
+};
+
+struct xlator_cbks cbks = {
+};
+
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("nsrc", this, out);
+
+ ret = xlator_mem_acct_init (this, gf_mt_nsrc_end + 1);
+
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Memory accounting init" "failed");
+ return ret;
+ }
+out:
+ return ret;
+}
+
+
+int32_t
+nsrc_init (xlator_t *this)
+{
+ nsrc_private_t *priv = NULL;
+
+ this->local_pool = mem_pool_new (nsrc_local_t, 128);
+ if (!this->local_pool) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to create nsrc_local_t pool");
+ goto err;
+ }
+
+ priv = GF_CALLOC (1, sizeof (*priv), gf_mt_nsrc_private_t);
+ if (!priv) {
+ goto err;
+ }
+
+ priv->active = FIRST_CHILD(this);
+ this->private = priv;
+ return 0;
+
+err:
+ if (priv) {
+ GF_FREE(priv);
+ }
+ return -1;
+}
+
+void
+nsrc_fini (xlator_t *this)
+{
+ GF_FREE(this->private);
+}
+
+int32_t
+nsrc_notify (xlator_t *this, int32_t event, void *data, ...)
+{
+ int32_t ret = 0;
+
+ switch (event) {
+ case GF_EVENT_CHILD_DOWN:
+ /*
+ * TBD: handle this properly
+ *
+ * What we really should do is propagate this only if it caused
+ * us to lose quorum, and likewise for GF_EVENT_CHILD_UP only
+ * if it caused us to gain quorum. However, that requires
+ * tracking child states and for now it's easier to swallow
+ * these unconditionally. The consequence of failing to do
+ * this is that DHT sees the first GF_EVENT_CHILD_DOWN and gets
+ * confused, so it doesn't call us and doesn't get up-to-date
+ * directory listings etc.
+ */
+ break;
+ default:
+ ret = default_notify (this, event, data);
+ }
+
+ return ret;
+}
+
+class_methods_t class_methods = {
+ .init = nsrc_init,
+ .fini = nsrc_fini,
+ .notify = nsrc_notify,
+};
+
+struct volume_options options[] = {
+ { .key = {NULL} },
+};
diff --git a/xlators/cluster/nsr-recon/Makefile.am b/xlators/cluster/nsr-recon/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/nsr-recon/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/nsr-recon/src/Makefile.am b/xlators/cluster/nsr-recon/src/Makefile.am
new file mode 100644
index 000000000..e639e4437
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/Makefile.am
@@ -0,0 +1,23 @@
+xlator_LTLIBRARIES = nsr_recon.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+nsr_recon_la_LDFLAGS = -module -avoid-version
+nsr_recon_la_SOURCES = recon_driver.c recon_xlator.c
+
+nsr_recon_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+ $(top_builddir)/api/src/libgfapi.la
+
+noinst_HEADERS = recon_driver.h recon_xlator.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
+ -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h
+
+CLEANFILES =
+
+uninstall-local:
+ rm -f $(DESTDIR)$(xlatordir)/nsr.so
diff --git a/xlators/cluster/nsr-recon/src/recon_driver.c b/xlators/cluster/nsr-recon/src/recon_driver.c
new file mode 100644
index 000000000..8c7622a02
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/recon_driver.c
@@ -0,0 +1,3130 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/types.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <fnmatch.h>
+
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+
+
+#include "recon_driver.h"
+#include "recon_xlator.h"
+#include "api/src/glfs-internal.h"
+#include "api/src/glfs-handles.h"
+
+/* TBD: move declarations here and nsr.c into a common place */
+#define NSR_TERM_XATTR "trusted.nsr.term"
+#define RECON_TERM_XATTR "trusted.nsr.recon-term"
+#define RECON_INDEX_XATTR "trusted.nsr.recon-index"
+
+/*
+ * Execution architecture for the NSR reconciliation driver. The driver runs
+ * as a seperate process in each node where the brick is. The main function of
+ * the driver is nsr_reconciliation_driver() (last function below) The driver
+ * just sits in a tight loop waiting for state changes. When a brick becomes a
+ * replica leader, it fences IO, contacts this process and waits for
+ * reconciliation to finish.
+ *
+ * The replica leader talks to other bricks in replica group which are alive
+ * and gets the last term info using which it decides which has the latest
+ * data. That brick is referred to as the "reconciliator"; leader sends a
+ * message to reconciliator to freeze its data (by reading any incomplete data
+ * from other nodes from that term if required)
+ *
+ * Once that is done leader sends a message to all nodes except the
+ * reconciliator to sync themselves with the reconciliator. This process is
+ * referred to as "resolution".
+ *
+ * Hence the reconciliation processes need to talk to each other to get a given
+ * term info. This is implemented using the recon translator IOs which
+ * implements a bare bone RPC by exposing a file interface to which
+ * reads/writes are done to pass control messages. This is referred to as the
+ * "control plane". This implementation allows the control plane to be
+ * implemented as a bunch of threads for each of the nodes.
+ *
+ * The reconciliation process also needs to talk to the brick process on that
+ * node to actually write the data as part of reconciliation/resolution. This
+ * is referred to as the "data plane". Again there are a bunch of threads that
+ * do this work.
+ *
+ * The way the worker threads are organised is that main driver context has a
+ * pointer to contexts for each of these thread contexts. The thread context at
+ * index 0 always refers to talking with local recon process/brick. So the
+ * control worker at index 0 will get the local changelog info and data worker
+ * at index 0 will talk to local brick.
+ *
+ * All the ops from the control/data planes are implemented using the glfs
+ * APIs.
+ */
+
+#if defined(NSR_DEBUG)
+
+/* This lets us change on the fly even if NSR_DEBUG is defined. */
+int nsr_debug_level = GF_LOG_TRACE;
+
+FILE *
+recon_create_log (char *member, char *module)
+{
+ char *dpath = NULL;
+ char *p;
+ char *fpath = NULL;
+ FILE *fp = NULL;
+ int fd = -1;
+
+ (void)mkdir(NSR_LOG_DIR,0777);
+ (void)asprintf(&dpath,NSR_LOG_DIR"/%s",member);
+ if (dpath) {
+ for (p = dpath + strlen(NSR_LOG_DIR) + 1; *p; ++p) {
+ if (*p == '/') {
+ *p = '-';
+ }
+ }
+ (void)mkdir(dpath,0777);
+ (void)asprintf(&fpath,"%s/%s",dpath,module);
+ if (fpath) {
+ fd = open(fpath,O_WRONLY|O_CREAT|O_APPEND|O_SYNC,0666);
+ if (fd >= 0) {
+ fp = fdopen(fd,"a");
+ if (!fp) {
+ close(fd);
+ }
+ }
+ if (fp) {
+ if (setvbuf (fp, NULL, _IONBF, 0)) {
+ /*
+ * Might as well take advantage of it
+ * to log the error.
+ */
+ fprintf (fp,
+ "setvbuf failed for log\n");
+ fprintf (fp,
+ "log output may be async\n");
+ fflush(fp);
+ }
+ }
+ free(fpath);
+ }
+ free(dpath);
+ }
+
+ return fp;
+}
+
+void
+_nsr_driver_log (const char *func, int line, char *member, FILE *fp,
+ char *fmt, ...)
+{
+ va_list ap;
+ char *buf = NULL;
+ int retval;
+
+ if (!fp) {
+ fp = recon_create_log(member,"nsr-driver-log");
+ if (!fp) {
+ return;
+ }
+ }
+
+ va_start(ap,fmt);
+ retval = vasprintf(&buf,fmt,ap);
+ if (buf) {
+ fprintf(fp,"[%s:%d] %.*s\n",func,line,retval,buf);
+ free(buf);
+ }
+ va_end(ap);
+}
+
+void
+_nsr_worker_log (const char *func, int line, char *member,
+ char *type, uint32_t index, FILE *fp,
+ char *fmt, ...)
+{
+ va_list ap;
+ char *buf = NULL;
+ int retval;
+
+ if (!fp) {
+ char *name;
+ if (asprintf(&name,"%s-%u",type,index) < 1) {
+ return;
+ }
+ fp = recon_create_log (member, name);
+ if (!fp) {
+ return;
+ }
+ }
+
+ va_start(ap,fmt);
+ retval = vasprintf(&buf,fmt,ap);
+ if (buf) {
+ fprintf(fp,"[%s:%d] %.*s\n",func,line,retval,buf);
+ free(buf);
+ }
+ va_end(ap);
+}
+
+#endif
+
+/*
+ * Recon Driver Calloc
+ *
+ * We need this because all of this messing about with gfapi from within a
+ * translator keeps scrambling THIS (only one reason it's a terrible idea) and
+ * we need THIS to have a value that represents our initialization with our
+ * memory types.
+ *
+ * Note that the macro requires "this" to be defined in the current scope.
+ */
+
+#define RD_CALLOC(x,y,z) ({THIS = this; GF_CALLOC(x,y,z); })
+
+/*
+ * This function gets the size of all the extended attributes for a file.
+ * This is used so that caller knows how much to allocate for key-value storage.
+ *
+ * Input Arguments:
+ * fd - the file opened using glfs API.
+ * dict - passed so that NSR translator can get this from the required brick
+ *
+ * Output Arguments:
+ * b - pointer to the buffer where the attributes are filled up.
+ * key_size - the size of all keys
+ * val_size - the size of all values
+ * num - number of key/values
+ */
+static int32_t
+get_xattr_total_size( struct glfs_fd *fd,
+ char **b,
+ uint32_t *key_size,
+ uint32_t *val_size,
+ uint32_t* num,
+ dict_t *dict)
+{
+ int32_t s = -1, ret = -1;
+ char *c = NULL;
+
+ *key_size = 0;
+ *val_size = 0;
+ *num = 0;
+
+ // First get the size of the keys
+ s = glfs_flistxattr_with_xdata(fd, NULL,0, dict);
+ if (s == -1) {
+ goto out;
+ }
+ *key_size = s;
+
+ // TBD - use the regular calloc
+ (*b) = c = calloc(s+1,1);
+
+ // get the keys themselves
+ if (glfs_flistxattr_with_xdata(fd, c, s+1, dict) == -1) {
+ goto out;
+ }
+ do {
+ int32_t r;
+ uint32_t len = 0;
+ // for each key get the size of the value
+ r = glfs_fgetxattr_with_xdata(fd, c, NULL, 0, dict);
+ if (r == -1)
+ goto out;
+ (*val_size) += r;
+ len = strlen(c) + 1;
+ c += len;
+ s -= len;
+ (*num)++;
+ } while(s);
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
+ * This function gets bunch of xattr values given set of keys.
+ *
+ * Input Arguments:
+ * fd - the file opened using glfs API.
+ * keys - the bunch of keys
+ * size - size of values
+ * num - number of keys
+ * dict - passed so that NSR translator can get this from the required brick
+ *
+ * Output Arguments:
+ * buf - where the values are written one after the other (NULL seperated)
+ */
+static int32_t
+get_xattr(struct glfs_fd *fd,
+ char *keys,
+ char *buf,
+ uint32_t size,
+ uint32_t num,
+ dict_t *dict)
+{
+ while(num--) {
+ int32_t r;
+ uint32_t len = 0;
+
+ // copy the key
+ strcpy(buf, keys);
+ len = strlen(keys);
+ len++;
+ buf += len;
+
+ // get the value and copy the value after incrementing buf after the key
+ r = glfs_fgetxattr_with_xdata(fd, keys, buf, size, dict);
+
+ // TBD - handle error
+ if (r == -1)
+ return -1;
+
+ // increment the key to next value
+ keys += len;
+
+ // increment buf to hold the next key
+ buf += strlen(buf) + 1;
+ }
+ return 0;
+}
+
+/*
+ * Function deletes a bunch of key values in extended attributes of a file.
+ * Input Arguments:
+ * fd - the file opened using glfs API.
+ * dict - passed so that NSR translator can do this from the required brick
+ * keys - bunch of NULL seperated key names
+ * num - number of keys
+ */
+static int32_t delete_xattr(struct glfs_fd *fd,
+ dict_t *dict_t,
+ char *keys,
+ uint32_t num)
+{
+ while(num--) {
+ // get the value and copy the value
+ // TBD - handle failure cases when calling glfs_fremovexattr_with_xdata()
+ if (glfs_fremovexattr_with_xdata(fd, keys, dict_t) == -1)
+ return -1;
+ keys += strlen(keys) +1;
+ }
+ return 0;
+}
+
+/*
+ * Given a bunch of key value pairs, fill them as xattrs for a file
+ *
+ * Input Arguments:
+ * fd - the file opened using glfs API.
+ * dict - passed so that NSR translator can do this from the required brick
+ * buf - buffer containing the keys-values pairs. The key value are NULL seperated.
+ * Each of the key-value is seperated by NULL in turn.
+ * num - Number of such key value pairs.
+ */
+static int32_t
+fill_xattr(struct glfs_fd *fd,
+ dict_t *dict,
+ char *buf,
+ uint32_t num)
+{
+ char *k = buf, *val = NULL;
+
+ while(num--) {
+ int32_t r;
+
+ val = k + strlen(k) + 1;
+
+ // TBD - handle failure cases when calling glfs_fsetxattr_with_xdata()
+ r = glfs_fsetxattr_with_xdata(fd, k, val, strlen(val), 0, dict);
+ if (r == -1)
+ return -1;
+ k = val + strlen(val) + 1;
+ }
+ return 0;
+}
+
+/*
+ * This function gets a file that can be used for doing glfs_init later.
+ * The control file is used by control thread(function) to talk to peer reconciliation process.
+ * The data file is used by the data thread(function) to talk to the bricks.
+ * The control file is of name such as con:gfs1:-mnt-a1 where "gfs1" is name of host
+ * and the brick path is "/mnt/a1".
+ * The data file is of name such as data:gfs1:-mnt-a1.
+ *
+ * Input Arguments:
+ * vol - name of the volume. This is used to build the full path of the control and data file
+ * such as /var/lib/glusterd/vols/test/bricks/gfs2:-mnt-test1-nsr-recon.vol.
+ * In above example the volume name is test and brick on gfs2 is on path /mnt/test1
+ *
+ * worker - The worker for a given node. This worker has 2 threads - one on the data plane
+ * and one on the control plane. The worker->name is already filled with hostname:brickname
+ * in the function nsr_reconciliation_driver(). Use that to build the volume file.
+ * So if worker->name has gfs1:/mnt/a1, control file is con:gfs1:-mnt-a1
+ * and data file is data:gfs1:-mnt-a1.
+ * All these files are under the bricks directory. TBD - move this to a NSR recon directory later.
+ */
+static void
+nsr_recon_get_file(char *vol, nsr_replica_worker_t *worker)
+{
+ char *ptr;
+ char tr[256];
+
+ // Replace the "/" to -
+ strcpy(tr, worker->name);
+ ptr = strchr (tr, '/');
+ while (ptr) {
+ *ptr = '-';
+ ptr = strchr (tr, '/');
+ }
+
+ // Build the base directory such as "/var/lib/glusterd/vols/test/bricks/"
+ sprintf(worker->control_worker->vol_file,
+ "/%s/%s/%s/%s/",
+ GLUSTERD_DEFAULT_WORKDIR,
+ GLUSTERD_VOLUME_DIR_PREFIX,
+ vol,
+ GLUSTERD_BRICK_INFO_DIR);
+
+ strcat(worker->control_worker->vol_file, "con:");
+ strcat(worker->control_worker->vol_file, tr);
+
+ sprintf(worker->data_worker->vol_file,
+ "/%s/%s/%s/%s/",
+ GLUSTERD_DEFAULT_WORKDIR,
+ GLUSTERD_VOLUME_DIR_PREFIX,
+ vol,
+ GLUSTERD_BRICK_INFO_DIR);
+ strcat(worker->data_worker->vol_file, "data:");
+ strcat(worker->data_worker->vol_file, tr);
+}
+
+/*
+ * This function does all the glfs initialisation
+ * so that reconciliation process can talk to other recon processes/bricks
+ * for the control/data messages.
+ * This will be done everytime a worker needs to be kicked off to talk
+ * across any plane.
+ *
+ * Input arguments:
+ * ctx - The per worker based context
+ * control - set to true if this worker is for the control plane
+ */
+static int32_t
+nsr_recon_start_work(nsr_per_node_worker_t *ctx,
+ gf_boolean_t control)
+{
+ glfs_t *fs = NULL;
+ xlator_t *this = ctx->driver_ctx->this;
+ int32_t ret = 0;
+ glfs_fd_t *aux_fd = NULL; // fd of auxilary log
+ char lf[256];
+ nsr_recon_private_t *priv = NULL;
+ char *my_name = NULL;
+ char *morph_name = NULL, *ptr = NULL;
+
+ priv = this->private;
+ my_name = RD_CALLOC (1,
+ strlen (priv->replica_group_members[0]) + 1,
+ gf_mt_recon_member_name_t);
+ strcpy (my_name, priv->replica_group_members[0]);
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "starting work with volfile %s\n",
+ ctx->vol_file);
+
+ fs = glfs_new(ctx->id);
+ if (!fs) {
+ glusterfs_this_set(this);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "cannot create gfls context for thread %s\n",ctx->id);
+ return -1;
+ }
+
+ // For some vague reason, glfs init APIs seem to be clobbering "this". hence resetting it.
+ glusterfs_this_set(this);
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "init done. setting volfile %s\n",
+ ctx->vol_file);
+
+ ret = glfs_set_volfile(fs, ctx->vol_file);
+ if (ret != 0) {
+ glusterfs_this_set(this);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "cannot set volfile %s for thread %s\n",ctx->vol_file, ctx->id);
+ return -1;
+ }
+
+ morph_name = RD_CALLOC (1, strlen (my_name) + 1,
+ gf_mt_recon_member_name_t);
+ strcpy (morph_name, my_name);
+
+ ptr = strchr (morph_name, '/');
+ while (ptr)
+ {
+ *ptr = '-';
+ ptr = strchr (morph_name, '/');
+ }
+ // TBD - convert this to right /usr/local/var/log based log files.
+
+ sprintf(lf, NSR_LOG_DIR"/%s/%s-%"PRIu32, morph_name,
+ (control == _gf_true)?"glfs-con":"glfs-data", ctx->index);
+ ret = glfs_set_logging (fs, lf, 7);
+ if (ret) {
+ glusterfs_this_set(this);
+ gf_log (this->name, GF_LOG_ERROR, "glfs logging set failed (%s)",
+ strerror (errno));
+ return -1;
+ }
+
+ ret = glfs_init (fs);
+ if (ret != 0) {
+ glusterfs_this_set(this);
+ nsr_worker_log(this->name, GF_LOG_ERROR, "cannot do init for thread %s with volfile %s\n",ctx->id, ctx->vol_file);
+ return -1;
+ }
+ glusterfs_this_set(this);
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "setting volfile %s done\n",
+ ctx->vol_file);
+
+ // If it is control thread, open the "/" as the aux_fd.
+ // All IOs happening via the fd will do the RPCs across the reconciliation
+ // processes. For some vague reason, the root seems to be open'able like a file.
+ // TBD - try to clean this up. (implement a virtual file???)
+ if (control == _gf_true) {
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "doing open for / \n");
+ aux_fd = glfs_open (fs, "/", O_RDWR);
+ // TBD - proper error handling. Stall reconciliation if such a thing happens?
+ if (aux_fd == NULL) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "cannot open aux log file for thread %s\n",ctx->id);
+ return -1;
+ } else {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "---opened aux log file for thread %s\n",ctx->id);
+ }
+ ctx->aux_fd = aux_fd;
+ }
+ glusterfs_this_set(this);
+ ctx->fs = fs;
+ return 0;
+}
+
+/*
+ *
+ * This function does the cleanup after reconciliation is done
+ * or before we start a new reconciliation.
+ *
+ * Input arguments:
+ * ctx - The per worker based context
+ * control - set to true if this worker is for the control plane
+ */
+static int32_t
+nsr_recon_end_work(nsr_per_node_worker_t *ctx,
+ gf_boolean_t control)
+{
+ int32_t ret = 0;
+ xlator_t *this = ctx->driver_ctx->this;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "doing fini for recon worker\n");
+
+ ret = glfs_fini(ctx->fs);
+ if (ret != 0) {
+ glusterfs_this_set(this);
+ nsr_worker_log(this->name, GF_LOG_ERROR, "cannot do fini for thread %s with volfile %s\n",ctx->id, ctx->vol_file);
+ return -1;
+ }
+ glusterfs_this_set(this);
+ ctx->fs = NULL;
+ if (control == _gf_true) {
+ glfs_close (ctx->aux_fd);
+ ctx->aux_fd = NULL;
+ }
+ return 0;
+}
+
+//called in case all worker functions run as sepeerate threads
+static void
+init_worker(nsr_per_node_worker_t *ctx, gf_boolean_t control)
+{
+ pthread_mutex_init(&(ctx->mutex), NULL);
+ pthread_cond_init(&(ctx->cv), NULL);
+ INIT_LIST_HEAD(&(ctx->head.list));
+}
+
+
+/*
+ * Control worker funct for getting changelog info on this node.
+ * calls directly functions to parse the changelog.
+ *
+ * Input arguments:
+ * ctx - The per worker based context
+ * control - set to true if this worker is for the control plane
+ */
+static void
+control_worker_func_0(nsr_per_node_worker_t *ctx,
+ nsr_recon_work_t *work)
+{
+ unsigned int index = ctx->index;
+ nsr_replica_worker_t *rw = &(ctx->driver_ctx->workers[index]);
+ nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+ xlator_t *this = dr->this;
+ nsr_recon_private_t *priv = this->private;
+
+ ctx->is_control = _gf_true;
+
+ switch (work->req_id) {
+ case NSR_WORK_ID_INI:
+ {
+ break;
+ }
+ case NSR_WORK_ID_FINI:
+ {
+ break;
+ }
+ case NSR_WORK_ID_GET_LAST_TERM_INFO:
+ {
+ nsr_recon_last_term_info_t lt;
+ nsr_reconciliator_info_t *recon_info = rw->recon_info;
+ // term is stuffed inside work->index. overloading.
+ int32_t term = work->index;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "trying to get last term info for node %d with current term %d\n",index, term);
+
+ // TBD - handle errors
+ // This is called by the leader after it gets the current term.
+ // Makes searching easier.
+ nsr_recon_libchangelog_get_last_term_info(this, priv->changelog_base_path, term, &lt);
+ recon_info->last_term = lt.last_term;
+ recon_info->commited_ops = lt.commited_ops;
+ recon_info->last_index = lt.last_index;
+ recon_info->first_index = lt.first_index;
+
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "out of get last term info with current term %d. got ops %d with first %d and last %d \n",
+ recon_info->last_term, recon_info->commited_ops,
+ recon_info->first_index, recon_info->last_index);
+ break;
+ }
+ case NSR_WORK_ID_GET_GIVEN_TERM_INFO:
+ {
+ nsr_recon_last_term_info_t lt;
+ nsr_reconciliator_info_t *recon_info = rw->recon_info;
+ // term is stuffed inside work->index. overloading.
+ int32_t term = work->index;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "trying to get term info for node %d for term %d\n",index, term);
+
+ // TBD - handle errors
+ nsr_recon_libchangelog_get_this_term_info(this,priv->changelog_base_path, term, &lt);
+
+ recon_info->last_term = lt.last_term;
+ recon_info->commited_ops = lt.commited_ops;
+ recon_info->last_index = lt.last_index;
+ recon_info->first_index = lt.first_index;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "out of get term info for term %d. got ops %d with first %d and last %d \n",
+ recon_info->last_term, recon_info->commited_ops,
+ recon_info->first_index, recon_info->last_index);
+
+ break;
+ }
+ case NSR_WORK_ID_RECONCILIATOR_DO_WORK:
+ {
+ // For local resolution, the main driver thread does it.
+ // SO there is no way we can have this message for this node.
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "this message should not be sent \n");
+ break;
+ }
+ case NSR_WORK_ID_RESOLUTION_DO_WORK:
+ {
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "this message should not be sent \n");
+ ctx->result = -1;
+ break;
+ }
+ case NSR_WORK_ID_GET_RECONCILATION_WINDOW:
+ {
+ nsr_reconciliator_info_t *recon_info = rw->recon_info;
+ // first_index and last_index at 0 indicates empty log.
+ // For non empty log, the first_index always starts at 1.
+ uint32_t num = (dr->workers[index].recon_info->last_index -
+ dr->workers[index].recon_info->first_index + 1);
+ nsr_recon_record_details_t *rd;
+ uint32_t i=0;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "trying to get reconciliation window records for node %d for term %d with first %d last %d\n",
+ index, recon_info->last_term, recon_info->first_index, recon_info->last_index);
+
+
+ // TBD - handle buffer allocation errors
+ rd = RD_CALLOC(num,
+ sizeof(nsr_recon_record_details_t),
+ gf_mt_recon_record_details_t);
+ if (rd == NULL) {
+ ctx->result = -1;
+ return;
+ }
+
+ recon_info->records = RD_CALLOC(num,
+ sizeof(nsr_reconciliation_record_t),
+ gf_mt_recon_record_t);
+ if (recon_info->records == NULL) {
+ ctx->result = -1;
+ return;
+ }
+
+ // TBD - handle errors
+ if (nsr_recon_libchangelog_get_records(this, priv->changelog_base_path,
+ recon_info->last_term,
+ recon_info->first_index,
+ recon_info->last_index,
+ rd) == _gf_false) {
+ ctx->result = -1;
+ return;
+ }
+
+ // The above function writes into rd from 0 to (num -1)
+ // We need to take care of this whenever we deal with records
+ for (i=0; i < num; i++) {
+ ENDIAN_CONVERSION_RD(rd[i], _gf_true); //ntohl
+ memcpy(&(recon_info->records[i].rec),
+ &(rd[i]),
+ sizeof(nsr_recon_record_details_t));
+ }
+
+ GF_FREE(rd);
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "got reconciliation window records for node %d for term %d \n",
+ index, recon_info->last_term);
+ break;
+ }
+
+ default:
+ nsr_worker_log (this->name, GF_LOG_ERROR,
+ "bad req id %u", work->req_id);
+ }
+
+ return;
+}
+
+// Control worker thread
+static void*
+control_worker_main_0(nsr_per_node_worker_t *ctx)
+{
+
+ ctx->is_control = _gf_true;
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "starting control worker func 0\n");
+
+ init_worker(ctx, 1);
+
+ while(1)
+ {
+ nsr_recon_work_t *work = NULL;
+ nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "waiting for work\n");
+
+ pthread_mutex_lock(&ctx->mutex);
+ while (list_empty(&(ctx->head.list))) {
+ pthread_cond_wait(&ctx->cv, &ctx->mutex);
+ }
+ pthread_mutex_unlock(&ctx->mutex);
+
+
+ list_for_each_entry(work, &(ctx->head.list), list) {
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "got work with id %d\n", work->req_id);
+ work->in_use = _gf_false;
+
+ // Call the main function.
+ control_worker_func_0(ctx, work);
+
+ atomic_dec(&(dr->outstanding));
+ break;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,"deleting work item\n");
+ list_del_init (&work->list);
+ GF_FREE(work);
+ nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n");
+ }
+
+ return NULL;
+}
+
+static void
+control_worker_do_reconciliation (nsr_per_node_worker_t *ctx,
+ nsr_recon_work_t *work)
+{
+ unsigned int index = ctx->index;
+ nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+ nsr_recon_role_t rr;
+ uint32_t i=0;
+ uint32_t num=0;
+ uint32_t idx = dr->reconciliator_index;
+ uint32_t term = dr->workers[idx].recon_info->last_term;
+
+ GF_ASSERT(idx == index);
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "trying to make this index %d as reconciliator for term %d\n", index, term);
+
+ // TBD - error handling for all the glfs APIs
+ if (glfs_lseek(ctx->aux_fd,
+ nsr_recon_xlator_sector_1,
+ SEEK_SET) == -1) {
+ ctx->result = -1;
+ return;
+ }
+
+ // We have all the info for all other nodes.
+ // Fill all that info when sending data to that process.
+ for (i=0; i < dr->replica_group_size; i++) {
+ if ( dr->workers[i].in_use &&
+ (dr->workers[i].recon_info->last_term == term)) {
+ rr.info[num].last_term =
+ dr->workers[i].recon_info->last_term;
+ rr.info[num].commited_ops =
+ dr->workers[i].recon_info->commited_ops;
+ rr.info[num].last_index =
+ dr->workers[i].recon_info->last_index;
+ rr.info[num].first_index =
+ dr->workers[i].recon_info->first_index;
+ strcpy(rr.info[num].name,
+ dr->workers[i].name);
+ }
+ num++;
+ }
+ rr.num = num;
+ rr.role = reconciliator;
+ ENDIAN_CONVERSION_RR(rr, _gf_false); //htonl
+ if (glfs_write(ctx->aux_fd, &rr, sizeof(rr), 0) == -1) {
+ ctx->result = -1;
+ // Put the errno only for this case since we are bothered about
+ // retrying only for this case. For rest of the cases we will
+ // just return EIO in errno.
+ ctx->op_errno = errno;
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "sent reconciliator info for term %d with node count as %d\n", term, num);
+}
+
+static void
+control_worker_do_resolution (nsr_per_node_worker_t *ctx,
+ nsr_recon_work_t *work)
+{
+ unsigned int index = ctx->index;
+ nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+ nsr_recon_role_t rr;
+ unsigned int i=0, j=0;
+ unsigned int rec = dr->reconciliator_index;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "trying to make this index %d as resolutor with reconciliator as %d\n",index, rec);
+
+ // TBD - error handling for all the glfs APIs
+ if (glfs_lseek(ctx->aux_fd,
+ nsr_recon_xlator_sector_1,
+ SEEK_SET) == -1) {
+ ctx->result = -1;
+ return;
+ }
+
+ rr.num = 2;
+
+ // Fill in info[0] as info for the node for which we are seeking
+ // resolution. Fill in info[1] as info of the reconciliator node. The
+ // function nsr_recon_driver_get_role() that will be called when this
+ // message reaches the node will look at index 1 for term information
+ // related to the reconciliator.
+ for (i=0; i < 2; i++) {
+ (i == 0) ? (j = index) : (j = rec);
+ rr.info[i].last_term =
+ dr->workers[j].recon_info->last_term;
+ rr.info[i].commited_ops =
+ dr->workers[j].recon_info->commited_ops;
+ rr.info[i].last_index =
+ dr->workers[j].recon_info->last_index;
+ rr.info[i].first_index =
+ dr->workers[j].recon_info->first_index;
+ // The name is used as the key to convert indices since the
+ // reconciliator index could be different across the nodes.
+ strcpy(rr.info[i].name,
+ dr->workers[j].name);
+ if (i == 0) {
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "this node info term=%d, ops=%d, first=%d, last=%d\n",
+ rr.info[i].last_term, rr.info[i].commited_ops,
+ rr.info[i].first_index,rr.info[i].last_index);
+ } else {
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "reconciliator node info term=%d, ops=%d, first=%d, last=%d\n",
+ rr.info[i].last_term, rr.info[i].commited_ops,
+ rr.info[i].first_index,rr.info[i].last_index);
+ }
+ }
+ rr.role = resolutor;
+ ENDIAN_CONVERSION_RR(rr, _gf_false); //htonl
+ if (glfs_write(ctx->aux_fd, &rr, sizeof(rr), 0) == -1) {
+ ctx->result = -1;
+ // Put the errno only for this case since we are bothered about
+ // retrying only for this case. For rest of the cases we will
+ // just return EIO in errno.
+ ctx->op_errno = errno;
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "sent message to this node %d resolutor with reconciliator as %d\n", index, rec);
+}
+
+static void
+control_worker_get_window (nsr_per_node_worker_t *ctx, nsr_recon_work_t *work)
+{
+ unsigned int index = ctx->index;
+ nsr_replica_worker_t *rw = &(ctx->driver_ctx->workers[index]);
+ nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+ xlator_t *this = dr->this;
+ nsr_recon_log_info_t li;
+ nsr_reconciliator_info_t *recon_info = rw->recon_info;
+ uint32_t i = 0;
+ uint32_t num = (dr->workers[index].recon_info->last_index -
+ dr->workers[index].recon_info->first_index +1);
+ nsr_recon_record_details_t *rd;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "trying to get reconciliation window records for node %d for term %d with first %d last %d\n",
+ index, recon_info->last_term, recon_info->first_index, recon_info->last_index);
+
+ // TBD - error handling for all the glfs APIs
+ if (glfs_lseek(ctx->aux_fd, nsr_recon_xlator_sector_2, SEEK_SET) == -1) {
+ ctx->result = -1;
+ return;
+ }
+
+ // write to node what term & indices we are interested
+ li.term = recon_info->last_term;
+ li.first_index = recon_info->first_index;
+ li.last_index = recon_info->last_index;
+ ENDIAN_CONVERSION_LI(li, _gf_false); //htonl
+ if (glfs_write(ctx->aux_fd, &li, sizeof(li), 0) == -1) {
+ ctx->result = -1;
+ return;
+ }
+
+ // then read
+ rd = RD_CALLOC(num,
+ sizeof(nsr_recon_record_details_t),
+ gf_mt_recon_private_t);
+ if (rd == NULL) {
+ ctx->result = -1;
+ return;
+ }
+ recon_info->records = RD_CALLOC(num,
+ sizeof(nsr_reconciliation_record_t),
+ gf_mt_recon_private_t);
+ if (recon_info->records == NULL) {
+ ctx->result = -1;
+ goto err;
+ }
+
+ if (glfs_read(ctx->aux_fd, rd, num * sizeof(nsr_recon_record_details_t), 0) == -1) {
+ ctx->result = -1;
+ goto err;
+ }
+
+ for (i=0; i < num; i++) {
+ ENDIAN_CONVERSION_RD(rd[i], _gf_true); //ntohl
+ memcpy (&(recon_info->records[i].rec), &(rd[i]),
+ sizeof(nsr_recon_record_details_t));
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "get_reconcilaition_window:Got %d at index %d\n",
+ recon_info->records[i].rec.type,
+ i + recon_info->first_index);
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "got reconciliation window records for node %d for term %d \n",
+ index, recon_info->last_term);
+
+err:
+ GF_FREE(rd);
+}
+
+/*
+ * Control worker funct for getting changelog info on some other node.
+ * calls glfs functions to seek/read/write on aux_fd.
+ *
+ * Input arguments:
+ * ctx - The per worker based context
+ * control - set to true if this worker is for the control plane
+ */
+static void
+control_worker_func(nsr_per_node_worker_t *ctx,
+ nsr_recon_work_t *work)
+{
+ unsigned int index = ctx->index;
+ nsr_replica_worker_t *rw = &(ctx->driver_ctx->workers[index]);
+ nsr_recon_last_term_info_t lt;
+ nsr_reconciliator_info_t *recon_info = rw->recon_info;
+ int32_t term = htonl(work->index); // overloading it
+
+ ctx->is_control = _gf_true;
+
+ switch (work->req_id){
+
+ case NSR_WORK_ID_INI:
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "calling nsr_recon_start_work\n");
+
+ // TBD - handle error in case nsr_recon_start_work gives error
+ if (nsr_recon_start_work(ctx, _gf_true) != 0) {
+ ctx->result = -1;
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "finished nsr_recon_start_work\n");
+ break;
+
+ case NSR_WORK_ID_FINI:
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "calling nsr_recon_end_work\n");
+
+ // TBD - handle error in case nsr_recon_end_work gives error
+ if (nsr_recon_end_work(ctx, _gf_true) != 0) {
+ ctx->result = -1;
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "finished nsr_recon_end_work\n");
+ break;
+
+ case NSR_WORK_ID_GET_LAST_TERM_INFO:
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "trying to get last term info for node %d with current term %d\n",index, work->index);
+
+ // first write the current term term number
+ // TBD - error handling for all the glfs APIs
+ if (glfs_lseek(ctx->aux_fd, nsr_recon_xlator_sector_4, SEEK_SET) == -1) {
+ ctx->result = -1;
+ return;
+ }
+ if (glfs_write(ctx->aux_fd, &term, sizeof(term), 0) == -1) {
+ ctx->result = -1;
+ return;
+ }
+ if (glfs_read(ctx->aux_fd, &lt, sizeof(lt), 0) == -1) {
+ ctx->result = -1;
+ return;
+ }
+ ENDIAN_CONVERSION_LT(lt, _gf_true); //ntohl
+ recon_info->last_term = lt.last_term;
+ recon_info->commited_ops = lt.commited_ops;
+ recon_info->last_index = lt.last_index;
+ recon_info->first_index = lt.first_index;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "out of get last term info with current term %d. got ops %d with first %d and last %d \n",
+ recon_info->last_term, recon_info->commited_ops,
+ recon_info->first_index, recon_info->last_index);
+
+ break;
+
+ case NSR_WORK_ID_GET_GIVEN_TERM_INFO:
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "trying to get term info for node %d for term %d\n",index, work->index);
+
+ // first write the term number
+ // TBD - error handling for all the glfs APIs
+ if (glfs_lseek(ctx->aux_fd, nsr_recon_xlator_sector_3, SEEK_SET) == -1) {
+ ctx->result = -1;
+ return;
+ }
+ if (glfs_write(ctx->aux_fd, &term, sizeof(term), 0) == -1) {
+ ctx->result = -1;
+ return;
+ }
+ if (glfs_read(ctx->aux_fd, &lt, sizeof(lt), 0) == -1) {
+ ctx->result = -1;
+ return;
+ }
+ ENDIAN_CONVERSION_LT(lt, _gf_true); //ntohl
+ recon_info->last_term = lt.last_term;
+ recon_info->commited_ops = lt.commited_ops;
+ recon_info->last_index = lt.last_index;
+ recon_info->first_index = lt.first_index;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "out of get term info for term %d. got ops %d with first %d and last %d \n",
+ recon_info->last_term, recon_info->commited_ops,
+ recon_info->first_index, recon_info->last_index);
+
+ break;
+
+ case NSR_WORK_ID_RECONCILIATOR_DO_WORK:
+ control_worker_do_reconciliation(ctx,work);
+ break;
+
+ case NSR_WORK_ID_RESOLUTION_DO_WORK:
+ control_worker_do_resolution(ctx,work);
+ break;
+
+ case NSR_WORK_ID_GET_RECONCILATION_WINDOW:
+ control_worker_get_window(ctx,work);
+ break;
+
+ default:
+ nsr_worker_log (this->name, GF_LOG_ERROR,
+ "bad work type %d", work->req_id);
+ }
+
+ return;
+}
+
+// Control worker thread
+static void*
+control_worker_main(nsr_per_node_worker_t *ctx)
+{
+ unsigned int index = ctx->index;
+
+ ctx->is_control = _gf_true;
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "starting control worker func\n");
+
+ // if this is for local processing, call the changelog parsing calls directly
+ if (index == 0) {
+ control_worker_main_0(ctx);
+ return NULL;
+ }
+
+ init_worker(ctx, 1);
+
+
+ while(1)
+ {
+ nsr_recon_work_t *work = NULL;
+ nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "waiting for work\n");
+
+ pthread_mutex_lock(&ctx->mutex);
+ while (list_empty(&(ctx->head.list))) {
+ pthread_cond_wait(&ctx->cv, &ctx->mutex);
+ }
+ pthread_mutex_unlock(&ctx->mutex);
+
+
+ list_for_each_entry(work, &(ctx->head.list), list) {
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "got work with id %d\n", work->req_id);
+ work->in_use = _gf_false;
+ control_worker_func(ctx,work);
+ atomic_dec(&(dr->outstanding));
+ break;
+ }
+ nsr_worker_log(this->name, GF_LOG_INFO,"deleting work item\n");
+ list_del_init (&work->list);
+ GF_FREE(work);
+ nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n");
+ }
+
+ return NULL;
+}
+
+/*
+ * This function gets called if this process is chosen as the reconciliator
+ * for this replica group. It would have already got the records for the last term
+ * for the indices that are required (from the first HOLE to last index) from
+ * all other nodes that also witnessed that term. COmpare all the records and
+ * compute the work required.
+ *
+ * Input arguments
+ * ctx - driver context. All recon work is stored in workers[0].recon_info
+ */
+static void
+compute_reconciliation_work(nsr_recon_driver_ctx_t *ctx)
+{
+ uint32_t i=0, j=0;
+ nsr_reconciliator_info_t *my_recon = ctx->workers[0].recon_info;
+ uint32_t num = (my_recon->last_index - my_recon->first_index + 1);
+
+ for (i=0; i < num; i++) {
+ nsr_log_type_t orig, new;
+ unsigned int src = 0;
+ orig = new = my_recon->records[i].rec.type;
+ nsr_recon_work_type_t tw = NSR_RECON_WORK_NONE;
+ // index 0 means this node. Look at all other nodes.
+ for (j=1; j < ctx->replica_group_size; j++) {
+ if (ctx->workers[j].in_use) {
+ nsr_log_type_t pr = ctx->workers[j].recon_info->records[i].work.type;
+ if ((new != pr) && (pr > new)) {
+ src = j;
+ new = (new | pr);
+ }
+ }
+ }
+ // TBD - compare data if new and orig are all FILLs. (can detect changelog corruption)
+ // Right now we compare if both orig and new are psuedo holes since
+ // only that is of interest to us.
+ if (orig != new) {
+ if ((orig == NSR_LOG_HOLE) && (new == NSR_LOG_PSEUDO_HOLE))
+ tw = NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE;
+ else if ((orig == NSR_LOG_HOLE) && (new == NSR_LOG_FILL))
+ tw = NSR_RECON_WORK_HOLE_TO_FILL;
+ else if ((orig == NSR_LOG_PSEUDO_HOLE) && (new == NSR_LOG_PSEUDO_HOLE))
+ tw = NSR_RECON_WORK_COMPARE_PSEUDO_HOLE;
+ else if ((orig == NSR_LOG_PSEUDO_HOLE) && (new == NSR_LOG_FILL))
+ tw = NSR_RECON_WORK_HOLE_TO_FILL;
+ }
+ if (tw != NSR_RECON_WORK_NONE) {
+ my_recon->records[i].work.type = tw;
+ my_recon->records[i].work.source = src;
+ // Overwrite the record
+ memcpy(&(my_recon->records[i].rec),
+ &(ctx->workers[src].recon_info->records[i].rec),
+ sizeof(nsr_recon_record_details_t));
+ }
+ }
+ return;
+}
+
+static int32_t
+nsr_recon_in_use(nsr_recon_driver_ctx_t *ctx,
+ uint32_t i,
+ gf_boolean_t in_use);
+
+/*
+ * Write the role and associated information to the node.
+ * This gets called from recon xlator indicating node is either
+ * leader, reconciliator or should do resolution.
+ */
+gf_boolean_t
+nsr_recon_driver_set_role(nsr_recon_driver_ctx_t *ctx,
+ nsr_recon_role_t *rr,
+ uint32_t term)
+{
+ nsr_role_work_t *rw;
+ xlator_t *this = ctx->this;
+
+ nsr_driver_log(this->name, GF_LOG_INFO, "set role called \n");
+ rw = RD_CALLOC(1, sizeof (nsr_role_work_t), gf_mt_recon_role_work_t);
+ memcpy(&rw->role, rr, sizeof(nsr_recon_role_t));
+ rw->term = term;
+ INIT_LIST_HEAD(&(rw->list));
+ pthread_mutex_lock(&(ctx->mutex));
+ list_add_tail(&rw->list, &ctx->role_head.list);
+ pthread_cond_signal(&(ctx->cv));
+ pthread_mutex_unlock(&(ctx->mutex));
+ nsr_driver_log(this->name, GF_LOG_INFO, "set role returns \n");
+ return _gf_true;
+}
+
+/*
+ * First we undo the last role to make sure we clean up.
+ *
+ * Input arguments
+ * ctx - driver context.
+ * rr - Role information.
+ * If leader, the thread now sends the list of all nodes that are part of
+ * the current replica group. Use that to find out the activate the
+ * required worker threads.
+ * If reconciliator, the leader node would have sent information about
+ * all nodes which saw last term as the reconciliator.
+ * If resolution to be done, then rr.info[0] will have this node's info
+ * which the leader would have got earlier. rr[1].info will have the
+ * info regarding the reconciliator.
+ * term - leader's term that is causing this role
+ */
+nsr_recon_driver_state_t
+nsr_recon_driver_get_role(int32_t *status,
+ nsr_recon_driver_ctx_t *ctx,
+ nsr_role_work_t *rw)
+{
+ uint8_t i=0, j=0;
+ nsr_recon_role_t *rr = &(rw->role);
+ nsr_reconciliator_info_t *tmp;
+ xlator_t *this = ctx->this;
+
+ // First make all the threads uninitialise
+ for (i = 0; i < ctx->replica_group_size; i++) {
+ if (nsr_recon_in_use(ctx, i, _gf_false) == -1) {
+ *status = -1;
+ return 0;
+ }
+ }
+
+ switch (rr->role) {
+ case leader:
+ case joiner:
+
+ // First set info this node
+ tmp = RD_CALLOC (1, sizeof (nsr_reconciliator_info_t),
+ gf_mt_recon_reconciliator_info_t);
+ if (!tmp) {
+ *status = -1;
+ return 0;
+ }
+ ctx->workers[0].recon_info = tmp;
+ if (nsr_recon_in_use(ctx, 0, _gf_true) == -1) {
+ *status = -1;
+ return 0;
+ }
+ ctx->current_term = rr->current_term;
+
+ // Find rest of the nodes
+ for (i=1; i < ctx->replica_group_size; i++) {
+ for (j=0 ; /* nothing */; j++) {
+ if (j >= rr->num) {
+ nsr_driver_log (this->name, GF_LOG_ERROR,
+ "failed to find %s",
+ ctx->workers[i].name);
+ break;
+ }
+ if (strcmp(ctx->workers[i].name,
+ rr->info[j].name)) {
+ continue;
+ }
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "nsr_recon_driver_get_role: this as %s. found other server %s\n",
+ (rr->role == leader) ? "leader"
+ : "joiner",
+ ctx->workers[i].name);
+
+ // Allocate this here. This will get later
+ // filled when the leader tries to get last term
+ // information from all the nodes
+ tmp = RD_CALLOC (1,
+ sizeof (nsr_reconciliator_info_t),
+ gf_mt_recon_reconciliator_info_t);
+ if (!tmp) {
+ *status = -1;
+ return 0;
+ }
+ ctx->workers[i].recon_info = tmp;
+ if (nsr_recon_in_use(ctx, i, _gf_true) == -1) {
+ *status = -1;
+ return 0;
+ }
+ break;
+ }
+ }
+ // If leader, reconciliator has to be chosen.
+ // If joiner, we are the reconciliator.
+ if (rr->role == leader)
+ ctx->reconciliator_index = -1;
+ else
+ ctx->reconciliator_index = 0;
+ break;
+
+ case reconciliator:
+ ctx->reconciliator_index = 0;
+ // Copy information about all the other members which had the
+ // same term
+ for (i=0; i < rr->num; i++) {
+ for (j=0; /* nothing */; j++) {
+ if (j >= ctx->replica_group_size) {
+ nsr_driver_log (this->name, GF_LOG_ERROR,
+ "failed to find %s",
+ rr->info[i].name);
+ break;
+ }
+ if (strcmp(rr->info[i].name,
+ ctx->workers[j].name)) {
+ continue;
+ }
+ nsr_driver_log(this->name, GF_LOG_INFO,
+ "nsr_recon_driver_get_role: this as reconciliator. found other server %s\n",
+ ctx->workers[j].name);
+ tmp = RD_CALLOC (1,
+ sizeof (nsr_reconciliator_info_t),
+ gf_mt_recon_reconciliator_info_t);
+ if (!tmp) {
+ *status = -1;
+ return 0;
+ }
+ tmp->last_term = rr->info[i].last_term;
+ tmp->commited_ops = rr->info[i].commited_ops;
+ tmp->last_index = rr->info[i].last_index;
+ tmp->first_index = rr->info[i].first_index;
+ ctx->workers[j].recon_info = tmp;
+ if (nsr_recon_in_use(ctx, j, _gf_true) == -1) {
+ *status = -1;
+ return 0;
+ }
+ break;
+ }
+ }
+ break;
+
+ case resolutor:
+ for (j=0; /* nothing */; j++) {
+ // info[1] has the information regarding the
+ // reconciliator
+ if (j >= ctx->replica_group_size) {
+ nsr_driver_log (this->name, GF_LOG_ERROR,
+ "failed to find %s",
+ rr->info[1].name);
+ break;
+ }
+ if (strcmp(rr->info[1].name,
+ ctx->workers[j].name)) {
+ continue;
+ }
+ nsr_driver_log(this->name, GF_LOG_INFO,
+ "nsr_recon_driver_get_role: this as resolutor. found other server %s as reconciliator\n",
+ ctx->workers[j].name);
+ tmp = RD_CALLOC (1,
+ sizeof (nsr_reconciliator_info_t),
+ gf_mt_recon_reconciliator_info_t);
+ if (!tmp) {
+ *status = -1;
+ return 0;
+ }
+ tmp->last_term = rr->info[1].last_term;
+ tmp->commited_ops = rr->info[1].commited_ops;
+ tmp->last_index = rr->info[1].last_index;
+ tmp->first_index = rr->info[1].first_index;
+ ctx->reconciliator_index = j;
+ ctx->workers[j].recon_info = tmp;
+ if (nsr_recon_in_use(ctx, j, _gf_true) == -1) {
+ *status = -1;
+ return 0;
+ }
+ GF_ASSERT(ctx->reconciliator_index != 0);
+ break;
+ }
+ tmp = RD_CALLOC (1,
+ sizeof (nsr_reconciliator_info_t),
+ gf_mt_recon_reconciliator_info_t);
+ if (!tmp) {
+ *status = -1;
+ return 0;
+ }
+ // info[0] has all info for this node
+ tmp->last_term = rr->info[0].last_term;
+ tmp->commited_ops = rr->info[0].commited_ops;
+ tmp->last_index = rr->info[0].last_index;
+ tmp->first_index = rr->info[0].first_index;
+ ctx->workers[0].recon_info = tmp;
+ if (nsr_recon_in_use(ctx, 0, _gf_true) == -1) {
+ *status = -1;
+ return 0;
+ }
+ }
+
+ ctx->term = rw->term;
+
+ *status = 0;
+ return rr->role;
+}
+
+
+/*
+ * This function gets called if this process is chosen to sync itself with
+ * the reconciliator.
+ *
+ * Input arguments
+ * ctx - driver context.
+ * my_info - local changelog info that has all the local records for indices that require work
+ * his_info - reconciliator's info that has all the golden copies
+ * invalidate - if set to true, then do not consult local records
+ */
+
+static void
+compute_resolution_work(nsr_recon_driver_ctx_t *ctx,
+ nsr_reconciliator_info_t *my_info,
+ nsr_reconciliator_info_t *his_info,
+ gf_boolean_t invalidate)
+{
+ uint32_t i=0;
+ uint32_t num = (my_info->last_index - my_info->first_index + 1);
+ xlator_t *this = ctx->this;
+
+ if (invalidate) {
+ if (my_info->records) {
+ GF_FREE(my_info->records);
+ }
+ my_info->records = RD_CALLOC(num,
+ sizeof(nsr_reconciliation_record_t),
+ gf_mt_recon_record_t);
+ }
+
+ for (i=0; i < num; i++) {
+ nsr_log_type_t orig, new;
+ nsr_recon_work_type_t tw = NSR_RECON_WORK_NONE;
+ orig = my_info->records[i].rec.type;
+ if (invalidate)
+ orig = NSR_LOG_HOLE;
+ new = his_info->records[i].rec.type;
+ // TBD - we can never have PSUEDO_HOLE in reconciliator's info
+ // We should have taken care of that during reconciliation.
+ // Put an assert to validate that.
+ if (new != orig) {
+ if ((orig != NSR_LOG_FILL) && (new == NSR_LOG_FILL))
+ tw = NSR_RECON_WORK_HOLE_TO_FILL;
+ else if ((orig != NSR_LOG_HOLE) && (new == NSR_LOG_HOLE))
+ tw = NSR_RECON_WORK_UNDO_FILL;
+ }
+ // copy the records anyway
+ my_info->records[i].work.type = tw;
+ my_info->records[i].work.source = ctx->reconciliator_index;
+ memcpy(&(my_info->records[i].rec),
+ &(his_info->records[i].rec),
+ sizeof(nsr_recon_record_details_t));
+ }
+ return;
+}
+
+
+// Create an glfs object
+static struct glfs_object *
+create_obj(nsr_per_node_worker_t *ctx, char *gfid_str)
+{
+ struct glfs_object *obj = NULL;
+ uuid_t gfid;
+
+ uuid_parse(gfid_str, gfid);
+
+ obj = glfs_h_create_from_handle(ctx->fs, gfid, GFAPI_HANDLE_LENGTH, NULL);
+ if (obj == NULL) {
+ GF_ASSERT(obj != NULL);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "creating of handle failed\n");
+ return NULL;
+ }
+ return obj;
+}
+
+/*
+ * Function to apply the actual record onto the local brick.
+ * prior to this we should have read all the data from the
+ * brick that has the data.
+ *
+ * Input parameters:
+ * ctx - per node worker context that has the fs for communicating to brick
+ * ri - Reconciliation record that needs fixup
+ * dict - So that NSR server translator on brick applis fixup only on this brick
+ * and the changelog translator consumes term and index.
+ */
+
+static gf_boolean_t
+apply_record(nsr_per_node_worker_t *ctx,
+ nsr_reconciliation_record_t *ri,
+ dict_t * dict)
+{
+ struct glfs_fd *fd = NULL;
+ struct glfs_object *obj = NULL;
+ struct glfs_object *to_obj = NULL;
+ gf_boolean_t retval = _gf_false;
+
+ if (ri->rec.op == GF_FOP_WRITE) {
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "DOing write for file %s @offset %d for len %d\n",
+ ri->rec.gfid, ri->rec.offset, ri->rec.len);
+
+ // The file has got deleted on the source. Hence just ignore
+ // this.
+ // TBD - get a way to just stuff the log entry without writing
+ // the data so that changelogs remain identical.
+ if (ri->work.data == NULL) {
+ return _gf_true;
+ }
+
+ if ((obj = create_obj(ctx,ri->rec.gfid)) == NULL)
+ goto err;
+
+ fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict);
+ if (fd == NULL) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "open for file %s failed\n",
+ ri->rec.gfid);
+ goto err;
+ }
+ if (glfs_lseek_with_xdata(fd, ri->rec.offset, SEEK_SET, dict) != ri->rec.offset) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "lseek for file %s failed at offset %d\n",
+ ri->rec.gfid, ri->rec.offset);
+ goto err;
+ }
+ if (glfs_write_with_xdata(fd, ri->work.data, ri->rec.len, 0, dict) != ri->rec.len) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "write for file %s failed for bytes %d\n",
+ ri->rec.gfid, ri->rec.len);
+ goto err;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finished DOing write for gfid %s @offset %d for len %d\n",
+ ri->rec.gfid, ri->rec.offset, ri->rec.len);
+
+ } else if (ri->rec.op == GF_FOP_FTRUNCATE) {
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "DOing truncate for file %s @offset %d \n",
+ ri->rec.gfid, ri->rec.offset);
+
+ if ((obj = create_obj(ctx, ri->rec.gfid)) == NULL) {
+ goto err;
+ }
+
+ fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict);
+ if (fd == NULL) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "open for file %s failed\n",
+ ri->rec.gfid);
+ goto err;
+ }
+ if (glfs_ftruncate_with_xdata(fd, ri->rec.offset, dict) == -1) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "trunctae for file %s failed @offset %d\n",
+ ri->rec.gfid,ri->rec.offset );
+ goto err;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finished DOing truncate for gfid %s @offset %d \n",
+ ri->rec.gfid, ri->rec.offset);
+
+ } else if ((ri->rec.op == GF_FOP_FREMOVEXATTR) ||
+ (ri->rec.op == GF_FOP_REMOVEXATTR) ||
+ (ri->rec.op == GF_FOP_SETXATTR) ||
+ (ri->rec.op == GF_FOP_FSETXATTR)) {
+
+ uint32_t k_s = 0, v_s = 0;
+ char *t_b= NULL;
+ uint32_t num = 0;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing set extended attr for file %s \n",
+ ri->rec.gfid);
+
+ // The file has got deleted on the source. Hence just ignore
+ // this. TBD - get a way to just stuff the log entry without
+ // writing the data so that changelogs remain identical.
+ if (ri->work.data == NULL) {
+ return _gf_true;
+ }
+
+ if ((obj = create_obj(ctx, ri->rec.gfid)) == NULL) {
+ goto err;
+ }
+
+ if (obj->inode->ia_type == IA_IFDIR)
+ fd = glfs_h_opendir_with_xdata(ctx->fs, obj, dict);
+ else
+ fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict);
+ if (fd == NULL) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "open for file %s failed\n",
+ ri->rec.gfid);
+ goto err;
+ }
+
+ if(get_xattr_total_size(fd, &t_b, &k_s, &v_s, &num, dict) == -1) {
+ if (t_b) free(t_b);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "list of xattr of %s failed\n", ri->rec.gfid);
+ goto err;
+ }
+
+ if (delete_xattr(fd, dict, t_b, num) == -1) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "deleting xattrs failed\n");
+ goto err;
+ }
+
+ // Set one special dict flag to indicate the opcode so that
+ // the opcode gets set to this
+ if (dict_set_int32(dict,"recon-xattr-opcode",ri->rec.op)) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "setting opcode to %d failed\n",ri->rec.op);
+ goto err;
+ }
+
+ if (fill_xattr(fd, dict, ri->work.data, ri->work.num) == -1) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "filling xattrs failed\n");
+ goto err;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finsihed Doing set extended attr for %s \n",
+ ri->rec.gfid);
+
+ } else if (ri->rec.op == GF_FOP_CREATE) {
+
+ uuid_t gfid;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing create for file %s \n",
+ ri->rec.gfid);
+
+ // TBD - add mode and flags later
+ uuid_parse(ri->rec.gfid, gfid);
+ if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) {
+ goto err;
+ }
+
+ nsr_worker_log (this->name, GF_LOG_INFO,
+ "creating with mode 0%o", ri->rec.mode);
+ if (glfs_h_creat_with_xdata(ctx->fs, obj, ri->rec.entry, O_RDWR, ri->rec.mode, NULL, gfid, dict) == NULL) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "Failure for Doing create for file %s\n",
+ ri->rec.entry);
+ goto err;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finished Doing create for file %s \n",
+ ri->rec.entry);
+
+ } else if (ri->rec.op == GF_FOP_MKNOD) {
+
+ uuid_t gfid;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing mknod for file %s \n",
+ ri->rec.entry);
+
+ // TBD - add mode and flags later
+ uuid_parse(ri->rec.gfid, gfid);
+ if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) {
+ goto err;
+ }
+
+ if (glfs_h_mknod_with_xdata(ctx->fs, obj, ri->rec.entry, O_RDWR, 0777, NULL, gfid, dict) == NULL) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "Failure for Doing mknod for file %s\n",
+ ri->rec.entry);
+ goto err;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finished Doing mknod for file %s \n",
+ ri->rec.entry);
+
+ } else if (ri->rec.op == GF_FOP_MKDIR) {
+
+ uuid_t gfid;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing mkdir for dir %s \n",
+ ri->rec.gfid);
+
+ // TBD - add mode and flags later
+ uuid_parse(ri->rec.gfid, gfid);
+ if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) {
+ goto err;
+ }
+
+ if (glfs_h_mkdir_with_xdata(ctx->fs, obj, ri->rec.entry, 0777, NULL, gfid, dict) != 0) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "Failure for Doing mkdir for file %s\n",
+ ri->rec.entry);
+ goto err;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finished Doing mkdir for file %s \n",
+ ri->rec.entry);
+
+ } else if ((ri->rec.op == GF_FOP_RMDIR) || (ri->rec.op == GF_FOP_UNLINK)) {
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing rmdir/ublink for dir %s \n",
+ ri->rec.entry);
+
+ if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) {
+ goto err;
+ }
+ if (glfs_h_unlink_with_xdata(ctx->fs, obj, ri->rec.entry, dict) != 0) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "Failure for Doing rmdir/unlink for file %s\n",
+ ri->rec.entry);
+ goto err;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finished Doing rmdir/unlink for file %s \n",
+ ri->rec.entry);
+
+ } else if (ri->rec.op == GF_FOP_SYMLINK) {
+
+ uuid_t gfid;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing symlink for file %s to file %s \n",
+ ri->rec.entry, ri->rec.link_path);
+
+ if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) {
+ goto err;
+ }
+ uuid_parse(ri->rec.gfid, gfid);
+
+ if (glfs_h_symlink_with_xdata(ctx->fs, obj, ri->rec.entry, ri->rec.link_path, NULL, gfid, dict) == NULL) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "Failed to Doing symlink for file %s to file %s \n",
+ ri->rec.entry, ri->rec.link_path);
+ goto err;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finished Doing symlink for file %s to file %s \n",
+ ri->rec.entry, ri->rec.link_path);
+
+ } else if (ri->rec.op == GF_FOP_LINK) {
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing hard link for file %s to file %s \n",
+ ri->rec.entry, ri->rec.gfid);
+
+ if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) {
+ goto err;
+ }
+ if ((to_obj = create_obj(ctx, ri->rec.gfid)) == NULL) {
+ goto err;
+ }
+
+ if (glfs_h_link_with_xdata(ctx->fs, to_obj, obj, ri->rec.entry, dict) == -1) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "Failed to Doing hard link for file %s to file %s \n",
+ ri->rec.entry, ri->rec.gfid);
+ goto err;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finsihed doing hard link for file %s to file %s \n",
+ ri->rec.entry, ri->rec.gfid);
+
+ } else if (ri->rec.op == GF_FOP_RENAME) {
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing rename for file %s to file %s \n",
+ ri->rec.entry, ri->rec.newloc);
+
+ if ((obj = create_obj(ctx, ri->rec.pargfid)) == NULL) {
+ goto err;
+ }
+ if ((to_obj = create_obj(ctx, ri->rec.gfid)) == NULL) {
+ goto err;
+ }
+
+ if (glfs_h_rename_with_xdata(ctx->fs, obj, ri->rec.entry, to_obj, ri->rec.newloc, dict) == -1) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "Failed to Doing rename for file %s to file %s \n",
+ ri->rec.entry, ri->rec.newloc);
+ goto err;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Finsihed doing renam for file %s to file %s \n",
+ ri->rec.entry, ri->rec.newloc);
+
+
+ } else if ((ri->rec.op == GF_FOP_SETATTR) || (ri->rec.op == GF_FOP_FSETATTR)) {
+
+ struct iatt iatt = {0, };
+ int valid = 0;
+ int ret = -1;
+
+ // TBD - do the actual settings once we do that
+ // right now we just set the mode so that changelog gets filled
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing attr for file %s \n",
+ ri->rec.gfid);
+
+ if ((obj = create_obj(ctx, ri->rec.gfid)) == NULL) {
+ goto err;
+ }
+
+ if (obj->inode->ia_type == IA_IFDIR)
+ fd = glfs_h_opendir_with_xdata(ctx->fs, obj, dict);
+ else
+ fd = glfs_h_open_with_xdata(ctx->fs, obj, O_RDWR, dict);
+ if (fd == NULL) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "open for file %s failed\n",
+ ri->rec.gfid);
+ goto err;
+ }
+
+ iatt.ia_prot = ia_prot_from_st_mode(777);
+ valid = GF_SET_ATTR_MODE;
+
+
+ // Set one special dict flag to indicate the opcode so that
+ // the opcode gets set to this
+ if (dict_set_int32(dict,"recon-attr-opcode",ri->rec.op)) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "setting opcode to %d failed\n",ri->rec.op);
+ goto err;
+ }
+
+ ret = glfs_fsetattr_with_xdata(fd, &iatt, valid, dict);
+ if (ret == -1) {
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "failed Doing attr for file %s \n",
+ ri->rec.gfid);
+ goto err;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "Doing attr for file %s \n",
+ ri->rec.gfid);
+
+ }
+
+ retval = _gf_true;
+
+err:
+ if (fd) {
+ /*
+ * It's not clear that we should be passing the same dict to
+ * glfs_close that was passed to us for glfs_open, but that's
+ * the prior behavior so let's preserve it for now.
+ */
+ if (glfs_close_with_xdata(fd, dict) == -1) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "close failed\n");
+ }
+ }
+ if (obj) {
+ /*
+ * AFAICT fd operations do not borrow this reference, so we
+ * still need to drop it ourselves.
+ */
+ glfs_h_close(obj);
+ }
+ if (to_obj) {
+ /*
+ * AFAICT fd operations do not borrow this reference, so we
+ * still need to drop it ourselves.
+ */
+ glfs_h_close(to_obj);
+ }
+ return retval;
+}
+
+//return back opcodes that requires reading from source
+static gf_boolean_t
+recon_check_changelog(nsr_recon_record_details_t *rd)
+{
+ return((rd->op == GF_FOP_WRITE) ||
+ (rd->op == GF_FOP_FSETATTR) ||
+ (rd-> op == GF_FOP_SETATTR) ||
+ (rd->op == GF_FOP_FREMOVEXATTR) ||
+ (rd->op == GF_FOP_SETXATTR) ||
+ (rd->op == GF_FOP_FSETXATTR) ||
+ (rd->op == GF_FOP_SYMLINK));
+
+}
+
+// TBD
+static gf_boolean_t
+recon_compute_undo(nsr_recon_record_details_t *rd)
+{
+ return(_gf_false);
+}
+
+
+/*
+ * Function that talks to the brick for data tranfer.
+ *
+ * Input arguments:
+ * ctx - worker context
+ * work - pointer to work object
+ */
+static void
+data_worker_func(nsr_per_node_worker_t *ctx,
+ nsr_recon_work_t *work)
+{
+ nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+ xlator_t *this = dr->this;
+ nsr_reconciliation_record_t *ri = NULL;
+ nsr_recon_record_details_t *rd = NULL;
+ int wip = 0;
+ dict_t * dict = NULL;
+ struct glfs_fd *fd = NULL;
+ struct glfs_object *obj = NULL;
+ uuid_t gfid;
+ uint32_t k_s = 0, v_s = 0;
+ char *t_b= NULL;
+ uint32_t num=0;
+
+ switch (work->req_id){
+ case NSR_WORK_ID_INI:
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "started data ini \n");
+
+ if (nsr_recon_start_work(ctx, _gf_false) != 0) {
+ ctx->result = -1;
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "finished data ini \n");
+ break;
+ case NSR_WORK_ID_FINI:
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "started data fini \n");
+
+ if (nsr_recon_end_work(ctx, _gf_false) != 0) {
+ ctx->result = -1;
+ return;
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "finished data fini \n");
+ break;
+ case NSR_WORK_ID_SINGLE_RECONCILIATION_READ:
+ // first_index always starts with 1 but records starts at 0.
+ wip = work->index - (dr->workers[0].recon_info->first_index);
+ ri = &(dr->workers[0].recon_info->records[wip]);
+ rd = &(ri->rec);
+
+ dict = dict_new ();
+ if (!dict) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "failed allocating for dictionary\n");
+ break;
+ }
+ if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "error setting term in dict\n");
+ break;
+ }
+ if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "error setting term in dict\n");
+ break;
+ }
+
+ switch (rd->op) {
+ case GF_FOP_WRITE:
+
+ // record already copied.
+ // copy data to this node's info.
+
+ uuid_parse(ri->rec.gfid, gfid);
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "started recon read for file %s at offset %d at len %d\n",
+ ri->rec.gfid, rd->offset, rd->len);
+
+ obj = glfs_h_create_from_handle (ctx->fs, gfid,
+ GFAPI_HANDLE_LENGTH,
+ NULL);
+ if (obj == NULL) {
+ GF_ASSERT(obj != NULL);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "creating of handle failed\n");
+ break;
+ }
+
+ // The file has probably got deleted.
+ fd = glfs_h_open_with_xdata (ctx->fs, obj, O_RDONLY,
+ dict);
+ if (fd == NULL) {
+ GF_ASSERT(fd != NULL);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "opening of file failed\n");
+ break;
+ }
+
+ if (glfs_lseek_with_xdata (fd, rd->offset, SEEK_SET,
+ dict) != rd->offset) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "lseek of file failed to offset %d\n",
+ rd->offset);
+ break;
+ }
+
+ ri->work.data = RD_CALLOC (rd->len , sizeof(char),
+ gf_mt_recon_work_data_t);
+ if (glfs_read_with_xdata (fd, ri->work.data, rd->len,
+ 0, dict) != rd->len) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "read of file failed to offset %d for bytes %d\n",
+ rd->offset, rd->len);
+ break;
+ }
+ break;
+
+ case GF_FOP_FTRUNCATE:
+ case GF_FOP_SYMLINK:
+ case GF_FOP_RMDIR:
+ case GF_FOP_UNLINK:
+ case GF_FOP_MKNOD:
+ case GF_FOP_CREATE:
+ case GF_FOP_LINK:
+ case GF_FOP_MKDIR:
+ case GF_FOP_RENAME:
+ nsr_worker_log (this->name, GF_LOG_ERROR,
+ "unimplemented fop %u\n", rd->op);
+ break;
+
+ case GF_FOP_FREMOVEXATTR:
+ case GF_FOP_REMOVEXATTR:
+ case GF_FOP_SETXATTR:
+ case GF_FOP_FSETXATTR:
+
+ uuid_parse(ri->rec.gfid, gfid);
+
+
+ // This is for all the set attribute/extended
+ // attributes commands. Get all the attributes from
+ // the source and fill it in the buffer as a NULL
+ // seperated key and value which are in turn seperated
+ // by NULL.
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "doing getattr for gfid %s \n",
+ ri->rec.gfid);
+
+ obj = glfs_h_create_from_handle (ctx->fs, gfid,
+ GFAPI_HANDLE_LENGTH,
+ NULL);
+ if (obj == NULL) {
+ GF_ASSERT(fd != NULL);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "creating of handle failed\n");
+ break;
+ }
+
+ if (obj->inode->ia_type == IA_IFDIR)
+ fd = glfs_h_opendir_with_xdata (ctx->fs, obj,
+ dict);
+ else
+ fd = glfs_h_open_with_xdata (ctx->fs, obj,
+ O_RDONLY, dict);
+
+ if (fd == NULL) {
+ GF_ASSERT(fd != NULL);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "opening of file failed\n");
+ break;
+ }
+
+ if (get_xattr_total_size (fd, &t_b, &k_s, &v_s, &num,
+ dict) == -1) {
+ if (t_b) free(t_b);
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "list of xattr of gfid %s failed\n",
+ rd->gfid);
+ break;
+ }
+ ri->work.data = RD_CALLOC ((k_s + v_s) , sizeof(char),
+ gf_mt_recon_work_data_t);
+ if (get_xattr(fd, t_b, ri->work.data, v_s, num, dict) == -1) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "get xattr of gfid %s failed\n", rd->gfid);
+ break;
+ }
+ ri->work.num = num;
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "finished getattr for gfid %s \n",
+ ri->rec.gfid);
+ free(t_b);
+ break;
+
+ case GF_FOP_FSETATTR:
+ case GF_FOP_SETATTR:
+ //TBD - to get the actual attrbutes and fill
+ // mode, uid, gid, size, atime, mtime
+ nsr_worker_log (this->name, GF_LOG_ERROR,
+ "unimplemented fop %u\n", rd->op);
+ break;
+ default:
+ nsr_worker_log (this->name, GF_LOG_ERROR,
+ "unrecognized fop %u\n", rd->op);
+
+ }
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "finished recon read for gfid %s at offset %d for %d bytes \n",
+ rd->gfid, rd->offset, rd->len);
+ break;
+
+ case NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT:
+ // first_index always starts with 1 but records starts at 0.
+ wip = work->index - (dr->workers[0].recon_info->first_index);
+ ri = &(dr->workers[0].recon_info->records[wip]);
+ rd = &(ri->rec);
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "got recon commit for index %d that has gfid %s \n",
+ wip, rd->gfid);
+ dict = dict_new ();
+ if (!dict) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "failed allocating for dictionary\n");
+ break;
+ }
+ if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "error setting term in dict\n");
+ break;
+ }
+ if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "error setting term in dict\n");
+ break;
+ }
+ if (apply_record(ctx, ri, dict) == _gf_false) {
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "apply_record fails\n");
+ }
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "finished recon commit for gfid %s \n",
+ rd->gfid);
+ break;
+
+ case NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH:
+ dict = dict_new ();
+ if (!dict) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "failed allocating for dictionary\n");
+ break;
+ }
+ if (dict_set_int32(dict,RECON_TERM_XATTR,ri->work.term)) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "error setting term in dict\n");
+ break;
+ }
+ if (dict_set_int32(dict,RECON_INDEX_XATTR,ri->work.index)) {
+ ctx->result = -1;
+ nsr_worker_log(this->name, GF_LOG_ERROR,
+ "error setting term in dict\n");
+ break;
+ }
+
+ // Increment work index with the start index
+ wip = work->index - (dr->workers[0].recon_info->first_index);
+ ri = &(dr->workers[0].recon_info->records[wip]);
+ rd = &(ri->rec);
+
+ glfs_fsync_with_xdata(fd, dict);
+ break;
+
+ default:
+ nsr_worker_log (this->name, GF_LOG_ERROR,
+ "unrecognized request id %u\n", work->req_id);
+ }
+
+ if (fd) {
+ glfs_close_with_xdata(fd, dict);
+ }
+ if (obj) {
+ glfs_h_close(obj);
+ }
+ if (dict) {
+ dict_unref(dict);
+ }
+}
+
+// thread for doing data work
+static void *
+data_worker_main(nsr_per_node_worker_t *ctx)
+{
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "starting data worker func\n");
+ init_worker(ctx, 0);
+
+ while(1) {
+ nsr_recon_work_t *work = NULL;
+ nsr_recon_driver_ctx_t *dr = ctx->driver_ctx;
+
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "waiting for work\n");
+
+ pthread_mutex_lock(&(ctx->mutex));
+ while (list_empty(&(ctx->head.list))) {
+ pthread_cond_wait(&(ctx->cv), &(ctx->mutex));
+ }
+ pthread_mutex_unlock(&(ctx->mutex));
+ list_for_each_entry(work, &(ctx->head.list), list) {
+ nsr_worker_log(this->name, GF_LOG_INFO,
+ "got work with id %d\n",work->req_id);
+ work->in_use = _gf_false;
+ data_worker_func(ctx, work);
+ atomic_dec(&(dr->outstanding));
+ break;
+ }
+ nsr_worker_log(this->name, GF_LOG_INFO,"deleting work item\n");
+ list_del_init (&work->list);
+ GF_FREE(work);
+ nsr_worker_log(this->name, GF_LOG_INFO,"finished deleting work item\n");
+ }
+
+ return NULL;
+}
+
+
+//make recon work
+static void
+recon_make_work(nsr_recon_driver_ctx_t *ctx,
+ nsr_recon_work_t **work,
+ nsr_recon_work_req_id_t req_id,
+ int32_t i)
+{
+ xlator_t *this = ctx->this;
+
+ // TBD - change this to get from a static pool
+ // This cannot fail
+ (*work) = RD_CALLOC (1, sizeof (nsr_recon_work_t), gf_mt_recon_work_t);
+ (*work)->req_id = req_id;
+ (*work)->index = i;
+ (*work)->in_use = _gf_true;
+ INIT_LIST_HEAD(&((*work)->list));
+ return;
+}
+
+// Schedule a work object to a worker thread.
+static void
+recon_queue_to_worker(nsr_recon_driver_ctx_t *ctx,
+ nsr_recon_work_t *work,
+ unsigned int id,
+ nsr_recon_queue_type_t type)
+{
+ nsr_per_node_worker_t *worker;
+ if (type == NSR_RECON_QUEUE_TO_CONTROL) {
+ worker = ctx->workers[id].control_worker;
+ nsr_driver_log(this->name, GF_LOG_INFO,
+ "queueing work to control index %d\n",id);
+ } else {
+ worker= ctx->workers[id].data_worker;
+ nsr_driver_log(this->name, GF_LOG_INFO,
+ "queueing work to data index %d\n",id);
+ }
+ pthread_mutex_lock(&worker->mutex);
+ list_add_tail(&work->list, &worker->head.list);
+ pthread_cond_signal(&worker->cv);
+ pthread_mutex_unlock(&worker->mutex);
+ return;
+}
+
+typedef void * (*F_t) (void *);
+
+// In case mode is set to NSR_USE_THREADS, create worker threads.
+static gf_boolean_t
+create_worker_threads(nsr_recon_private_t *priv,
+ nsr_recon_driver_ctx_t *ctx,
+ nsr_per_node_worker_t *w,
+ gf_boolean_t control_or_data,
+ F_t f,
+ uint32_t num)
+{
+ uint32_t i;
+ nsr_per_node_worker_t *worker = w;
+ xlator_t *this = ctx->this;
+
+ for (i=0; i < num; i++) {
+ worker->id = RD_CALLOC(1, 10, gf_mt_recon_id_t);
+ if (!worker->id) {
+ nsr_driver_log (priv->this->name, GF_LOG_ERROR, "memory allocation error \n");
+ return _gf_false;
+ }
+ sprintf(worker->id,"recon_%d", i);
+ worker->driver_ctx = ctx ;
+
+ if (ctx->mode == NSR_USE_THREADS) {
+ if (pthread_create(&worker->thread_id, NULL, f, worker)) {
+ nsr_driver_log (ctx->this->name, GF_LOG_ERROR, "control work thread creation error \n");
+ return _gf_false;
+ }
+ }
+ worker->index = i;
+ worker++;
+ }
+ return _gf_true;
+}
+
+/*
+ * In case of thread, send the work item; else call the function directly.
+ *
+ * Input arguments:
+ * bm - bitmap containing indices of nodes we want to send work
+ * num - number of such indices
+ * ctx - driver context from where we derive per worker context
+ * id - request ID
+ * q - control or data
+ * misc - used to overload such as index.
+ */
+static void
+send_and_wait(int32_t *result,
+ int32_t *op_errno,
+ int32_t bm,
+ uint32_t num,
+ nsr_recon_driver_ctx_t *ctx,
+ nsr_recon_work_req_id_t id,
+ nsr_recon_queue_type_t q,
+ int32_t misc)
+{
+ uint32_t i = 0;
+ nsr_recon_work_t *work;
+
+#define CONTROL_WORKER(i) ctx->workers[i].control_worker
+#define DATA_WORKER(i) ctx->workers[i].data_worker
+#define WORKER(i) ((q == NSR_RECON_QUEUE_TO_CONTROL) ? (CONTROL_WORKER(i)) : (DATA_WORKER(i)))
+
+ *result = *op_errno = 0;
+
+ for (i=0; i < num; i++) {
+ if ((bm & (1 << i)) && ctx->workers[i].in_use) {
+ WORKER(i)->result = 0;
+ WORKER(i)->op_errno = 0;
+ }
+ }
+ if (ctx->mode == NSR_SEQ) {
+ for (i=0; i < num; i++) {
+ if ((bm & (1 << i)) && ctx->workers[i].in_use) {
+ recon_make_work(ctx,&work, id, misc);
+ if (q == NSR_RECON_QUEUE_TO_CONTROL) {
+ if (i == 0)
+ control_worker_func_0(ctx->workers[0].control_worker, work);
+ else
+ control_worker_func(ctx->workers[i].control_worker, work);
+ } else {
+ data_worker_func(ctx->workers[i].data_worker, work);
+ }
+ GF_FREE(work);
+ }
+ }
+ goto out;
+ }
+
+ for (i=0; i < num; i++) {
+ if ((bm & (1 << i)) && ctx->workers[i].in_use) {
+ recon_make_work(ctx,&work, id, misc);
+ atomic_inc(&(ctx->outstanding));
+ recon_queue_to_worker(ctx, work, i, q);
+ }
+ }
+
+ nsr_driver_log(this->name, GF_LOG_INFO, "send_and_wait: waiting\n");
+ while (ctx->outstanding) {
+ pthread_yield();
+ }
+out:
+ for (i=0; i < num; i++) {
+ if ((bm & (1 << i)) && ctx->workers[i].in_use) {
+ if (WORKER(i)->result == -1) {
+ *result = -1;
+ }
+ }
+ }
+ if (*result == -1) {
+ for (i=0; i < num; i++) {
+ if ((bm & (1 << i)) && ctx->workers[i].in_use) {
+ if (WORKER(i)->op_errno == EAGAIN) {
+ *op_errno = EAGAIN;
+ break;
+ } else {
+ *op_errno = EIO;
+ }
+ }
+ }
+ }
+
+ nsr_driver_log(this->name, GF_LOG_INFO, "send_and_wait: all workers have returned with result: %d errno:%d\n", *result, *op_errno);
+ return;
+}
+
+// send INI or FINI
+static int32_t
+nsr_recon_in_use(nsr_recon_driver_ctx_t *ctx,
+ uint32_t i,
+ gf_boolean_t in_use)
+{
+ uint32_t bm = 1 << i;
+ gf_boolean_t send = _gf_false;
+ int32_t status =0, op_errno = 0;
+
+ send = (ctx->workers[i].in_use != in_use);
+ ctx->workers[i].in_use = in_use;
+
+ if (!send) {
+ return 0;
+ }
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "sending %s to index %d\n",in_use?"INI":"FINI",i);
+
+ send_and_wait(&status, &op_errno, bm, ctx->replica_group_size, ctx,
+ (in_use == _gf_true) ? NSR_WORK_ID_INI : NSR_WORK_ID_FINI,
+ NSR_RECON_QUEUE_TO_CONTROL, -1);
+ if (status == -1)
+ goto err;
+
+ send_and_wait(&status, &op_errno, bm, ctx->replica_group_size, ctx,
+ (in_use == _gf_true) ? NSR_WORK_ID_INI : NSR_WORK_ID_FINI,
+ NSR_RECON_QUEUE_TO_DATA, -1);
+ if (status == -1)
+ goto err;
+
+ /*
+ * We really need better error recovery. To activate a worker, we
+ * allocate memory and send two messages. If any of those actions
+ * fail, we should undo the others. It would probably be good to
+ * collapse the two messages into one, because it's pretty trivial to
+ * allocate a temporary structure and either link it in or free it
+ * depending on the result here.
+ */
+
+ if (in_use == _gf_false) {
+ GF_FREE(ctx->workers[i].recon_info);
+ }
+
+ return 0;
+
+err:
+ GF_FREE(ctx->workers[i].recon_info);
+ ctx->workers[i].recon_info = NULL;
+ return -1;
+}
+
+gf_boolean_t
+nsr_recon_driver_reconciliator (nsr_recon_private_t *priv)
+{
+ uint32_t replica_group_size = priv->replica_group_size;
+ uint32_t i;
+ nsr_recon_driver_ctx_t *ctx = priv->driver_thread_context;
+ int32_t bm;
+ int32_t status = 0;
+ int32_t op_errno = 0;
+ gf_boolean_t do_recon = _gf_false;
+ uint32_t start_index = ctx->workers[0].recon_info->first_index;
+ uint32_t end_index = ctx->workers[0].recon_info->last_index;
+ uint32_t num = ((start_index == 0) && (end_index == 0)) ? 0 : (end_index - start_index + 1);
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "starting reconciliation work as reconciliator \n");
+
+ // nothing to be done? signal back to the recon translator that this
+ // phase done.
+ bm = 1;
+ for (i=1; i < replica_group_size; i++) {
+ if (ctx->workers[i].in_use &&
+ (ctx->workers[0].recon_info->last_term == ctx->workers[i].recon_info->last_term)) {
+ ctx->workers[i].recon_info->last_index = end_index;
+ ctx->workers[i].recon_info->first_index = start_index;
+ bm |= (1 << i);
+ do_recon = _gf_true;
+ }
+ }
+
+ if (!do_recon || !num) {
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "nothing needs to be done as resolutor \n");
+ return _gf_true;
+ }
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "getting reconciliation window for term %d from %dto %d \n",
+ ctx->workers[0].recon_info->last_term,
+ start_index, end_index);
+ // We have set the bm in the above for loop where we go thru all nodes
+ // including this node that have seen the last term.
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_GET_RECONCILATION_WINDOW,
+ NSR_RECON_QUEUE_TO_CONTROL, -1);
+ if (status == -1)
+ return _gf_false;
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished getting reconciliation window for term %d from %dto %d \n",
+ ctx->workers[0].recon_info->last_term,
+ start_index, end_index);
+
+
+ // from the changelogs, calculate the entries that need action and the
+ // source for each of these entries
+ compute_reconciliation_work(ctx);
+
+ // for each of the entries that need fixup, issue IO
+ for (i=start_index; i < (start_index + num); i++) {
+ nsr_reconciliator_info_t *my_recon_info =
+ ctx->workers[0].recon_info;
+ nsr_reconciliation_record_t *record =
+ &(my_recon_info->records[i - start_index]);
+
+ record->work.term = ctx->workers[0].recon_info->last_term;
+ record->work.index = i;
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "fixing index %d\n",i);
+ if ((record->work.type == NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE) ||
+ (record->work.type == NSR_RECON_WORK_HOLE_TO_FILL)) {
+ // 1st case (RECON_WORK_HOLE_TO_PSEUDO_HOLE): If there
+ // are only pseudo_holes in others, it is best effort.
+ // Just pick from the first node that has it and
+ // proceed.
+ // 2nd case (RECON_WORK_HOLE_TO_FILL): this node has
+ // either a HOLE or PSUEDO_HOLE and some one else has a
+ // FILL(source). analyse the changelog to check if data
+ // needs to be read or if the log has all the data
+ // required
+
+ if (recon_check_changelog(&record->rec)) {
+ bm = (1 << record->work.source);
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "reading data from source %d\n",record->work.source);
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_SINGLE_RECONCILIATION_READ,
+ NSR_RECON_QUEUE_TO_DATA,
+ i);
+ if (status == -1)
+ return _gf_false;
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "got data from source %d\n",record->work.source);
+ }
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "fixing local data as part of reconciliation\n");
+
+ bm = 1;
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT,
+ NSR_RECON_QUEUE_TO_DATA,
+ i);
+ if (status == -1)
+ return _gf_false;
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished fixing local data as part of reconciliation\n");
+
+ } else if (record->work.type == NSR_RECON_WORK_COMPARE_PSEUDO_HOLE) {
+ // this node has a pseudo_hole and some others have just
+ // that too. Just convert this to FILL. let others
+ // blindly pick it from here.
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "fixing this record as a fill\n");
+ bm = 1;
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH,
+ NSR_RECON_QUEUE_TO_DATA,
+ i);
+ if (status == -1)
+ return _gf_false;
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished fixing this record as a fill\n");
+ }
+ }
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished reconciliation work as reconciliator \n");
+
+ // tbd - mark this term golden in the reconciliator
+ return _gf_true;
+}
+
+gf_boolean_t
+nsr_recon_driver_resolutor (nsr_recon_private_t *priv)
+{
+ uint32_t replica_group_size = priv->replica_group_size;
+ uint32_t i;
+ nsr_recon_driver_ctx_t *ctx = priv->driver_thread_context;
+ int32_t bm;
+ int32_t status = 0;
+ int32_t op_errno = 0;
+ // This node's last term is filled when it gets a message from the
+ // leader to act as a reconciliator.
+ uint32_t recon_index = ctx->reconciliator_index;
+ nsr_reconciliator_info_t *my_info =
+ ctx->workers[0].recon_info;
+ nsr_reconciliator_info_t *his_info =
+ ctx->workers[recon_index].recon_info;
+ uint32_t my_last_term = my_info->last_term;
+ uint32_t to_do_term = his_info->last_term;
+ uint32_t my_start_index = 1, my_end_index = 1;
+ uint32_t his_start_index = 1, his_end_index = 1;
+ uint32_t num = 0;
+ gf_boolean_t fl = _gf_true;
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "starting resolutor work with reconciliator as %d from term %d to term %d \n",
+ recon_index, my_last_term, to_do_term);
+
+ do {
+
+ if (!fl) {
+ (his_info->last_term)++;
+ (my_info->last_term)++;
+ } else {
+ his_info->last_term = my_last_term;
+ }
+
+ nsr_driver_log (this->name, GF_LOG_INFO, "resolving term %d \n", my_info->last_term);
+
+ // Get reconciliator's term information for that term
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "getting info from reconciliator for term %d \n", my_info->last_term);
+ bm = (1 << recon_index);
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_GET_GIVEN_TERM_INFO,
+ NSR_RECON_QUEUE_TO_CONTROL, his_info->last_term);
+ if (status == -1)
+ return _gf_false;
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished getting info from reconciliator for term %d \n", my_info->last_term);
+
+
+ // empty term
+ if (!his_info->commited_ops) {
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "reconciliator for term %d is empty. moving to next term. \n", my_info->last_term);
+ // TBD - mark the term golden
+ fl = _gf_false;
+ continue;
+ }
+
+ // calculate the resolution window boundary. for the last term
+ // this node saw, we compare the resolution window of this and
+ // reconciliator. for the rest of the nodes, we just accept the
+ // reconciliator info.
+ if (fl) {
+ my_start_index = my_info->first_index;
+ my_end_index = my_info->last_index;
+ his_start_index = his_info->first_index;
+ his_end_index = his_info->last_index;
+ my_info->first_index = (my_start_index < his_start_index) ? my_start_index : his_start_index;
+ my_info->last_index = (my_end_index > his_end_index) ? my_end_index : his_end_index;
+ } else {
+ my_info->first_index = his_info->first_index;
+ my_info->last_index = his_info->last_index;
+ my_info->commited_ops = his_info->commited_ops;
+ }
+ if (my_info->first_index == 0)
+ my_info->first_index = 1;
+ num = (my_info->last_index - my_info->first_index) + 1;
+
+
+ // Get the logs from the reconciliator (and this node for this
+ // term)
+ if (fl)
+ bm = ((1 << recon_index) | 1);
+ else
+ bm = (1 << recon_index);
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "getting reconciliation window for term %d from %d to %d \n",
+ my_info->last_term,
+ my_info->first_index, my_info->last_index);
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_GET_RECONCILATION_WINDOW,
+ NSR_RECON_QUEUE_TO_CONTROL, -1);
+ if (status == -1)
+ return _gf_false;
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished getting reconciliation window for term %d from %d to %d \n",
+ my_info->last_term,
+ my_info->first_index, my_info->last_index);
+
+ // from the changelogs, calculate the entries that need action
+ compute_resolution_work(ctx, my_info, his_info, !fl);
+
+
+ // for each of the entries that need fixup, issue IO
+ for (i=my_info->first_index; i < (my_info->first_index + num); i++) {
+ nsr_reconciliation_record_t *record =
+ &(my_info->records[i - my_info->first_index]);
+
+ record->work.term = my_info->last_term;
+ record->work.index = i;
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "fixing index %d\n",i);
+ if ((record->work.type == NSR_RECON_WORK_HOLE_TO_FILL) ||
+ (record->work.type == NSR_RECON_WORK_UNDO_FILL)) {
+ if (((record->work.type == NSR_RECON_WORK_HOLE_TO_FILL) &&
+ recon_check_changelog(&record->rec)) ||
+ ((record->work.type == NSR_RECON_WORK_UNDO_FILL) &&
+ recon_compute_undo(&record->rec))) {
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "reading data from source %d\n",recon_index);
+ bm = (1 << recon_index);
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_SINGLE_RECONCILIATION_READ,
+ NSR_RECON_QUEUE_TO_DATA,
+ i);
+ if (status == -1)
+ return _gf_false;
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished reading data from source %d\n",recon_index);
+ }
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "fixing local data as part of resolutor\n");
+
+ bm = 1;
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT,
+ NSR_RECON_QUEUE_TO_DATA,
+ i);
+ if (status == -1)
+ return _gf_false;
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished fixing local data as part of resolutor\n");
+ }
+ }
+ fl = _gf_false;
+
+ // tbd - mark this term golden in the reconciliator
+ } while (my_last_term++ != to_do_term);
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished resolutor work \n");
+ return _gf_true;
+}
+
+gf_boolean_t
+nsr_recon_driver_leader (nsr_recon_private_t *priv)
+{
+ uint32_t replica_group_size = priv->replica_group_size;
+ uint32_t i;
+ nsr_recon_driver_ctx_t *ctx = priv->driver_thread_context;
+ int32_t bm;
+ int32_t status = 0;
+ int32_t op_errno = 0;
+ int32_t chosen = -1;
+ int32_t last_term = -1, last_ops = -1;
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "getting last term info from all members of this group\n");
+ // Get last term info from all members for this group
+ send_and_wait(&status, &op_errno, -1,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_GET_LAST_TERM_INFO,
+ NSR_RECON_QUEUE_TO_CONTROL, ctx->current_term);
+ if (status == -1)
+ return _gf_false;
+
+
+ // compare all the info received and choose the reconciliator First
+ // choose all with latest term
+ for (i=0; i < replica_group_size; i++) {
+ if (ctx->workers[i].in_use) {
+ if (ctx->workers[i].recon_info->last_term > last_term) {
+ last_term = ctx->workers[i].recon_info->last_term;
+ }
+ }
+ }
+ // First choose all with latest term and highest ops
+ for (i=0; i < replica_group_size; i++) {
+ if ((ctx->workers[i].in_use) && (last_term == ctx->workers[i].recon_info->last_term)) {
+ if (ctx->workers[i].recon_info->commited_ops > last_ops) {
+ last_ops = ctx->workers[i].recon_info->commited_ops;
+ }
+ }
+ }
+ // choose the first among the lot
+ for (i=0; i < replica_group_size; i++) {
+ if ((ctx->workers[i].in_use) &&
+ (last_term == ctx->workers[i].recon_info->last_term) &&
+ (last_ops == ctx->workers[i].recon_info->commited_ops)) {
+ chosen = i;
+ break;
+ }
+ }
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "reconciliator chosen is %d\n", chosen);
+ ctx->reconciliator_index = chosen;
+ GF_ASSERT(chosen != -1);
+ if (chosen == -1) {
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "no reconciliatior chosen\n");
+ return _gf_false;
+ }
+
+ // send the message to reconciliator to do reconciliation with list of
+ // nodes that are part of this quorum
+ if (chosen != 0) {
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "sending reconciliation work to %d\n", chosen);
+ bm = 1 << ctx->reconciliator_index;
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_RECONCILIATOR_DO_WORK,
+ NSR_RECON_QUEUE_TO_CONTROL, -1);
+ if (status == -1)
+ return _gf_false;
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished reconciliation work to %d\n", chosen);
+ } else {
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "local node is reconciliator. before set jmp\n");
+ nsr_recon_driver_reconciliator(priv);
+ }
+
+ // send message to all other nodes to sync up with the reconciliator
+ // including itself if required
+ // requires optimisation - TBD
+ if (chosen != 0) {
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "local node resolution needs to be done. before set jmp\n");
+ nsr_recon_driver_resolutor(priv);
+ }
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "sending resolution work to all nodes except this node and reconciliator\n");
+ bm = ~((1 << ctx->reconciliator_index) || 1);
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_RESOLUTION_DO_WORK,
+ NSR_RECON_QUEUE_TO_CONTROL, -1);
+ if (status == -1)
+ return _gf_false;
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished reconciliation work as leader \n");
+ return _gf_true;
+}
+
+// main recon driver thread
+void *
+nsr_reconciliation_driver(void *arg)
+{
+ nsr_recon_private_t *priv = (nsr_recon_private_t *) arg;
+ uint32_t replica_group_size = priv->replica_group_size;
+ uint32_t i;
+ nsr_per_node_worker_t *control_s, *data_s;
+ nsr_recon_driver_ctx_t **driver_ctx, *ctx;
+ int32_t bm;
+ xlator_t *this = priv->this;
+ char *con_name, *data_name;
+ int32_t status = 0;
+ int32_t op_errno = 0;
+
+ driver_ctx = &priv->driver_thread_context;
+ (*driver_ctx) = GF_CALLOC (1,
+ sizeof (nsr_recon_driver_ctx_t),
+ gf_mt_recon_driver_ctx_t);
+ if (!driver_ctx) {
+ gf_log (this->name, GF_LOG_ERROR, "memory allocation error \n");
+ return NULL;
+ }
+ ctx = *driver_ctx;
+ ctx->this = priv->this;
+ ctx->replica_group_size = replica_group_size;
+
+ ctx->fp = recon_create_log (priv->replica_group_members[0], "nsr-driver-log");
+ if (!ctx->fp)
+ return NULL;
+
+ if ((pthread_mutex_init(&(ctx->mutex), NULL)) ||
+ (pthread_cond_init(&(ctx->cv), NULL))){
+ nsr_driver_log (this->name, GF_LOG_ERROR, "mutex init error \n");
+ return NULL;
+ }
+ INIT_LIST_HEAD(&(ctx->role_head.list));
+
+ ctx->workers = RD_CALLOC (replica_group_size,
+ sizeof(nsr_replica_worker_t),
+ gf_mt_recon_worker_t);
+ if (!ctx->workers) {
+ nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n");
+ return NULL;
+ }
+ for (i=0; i < replica_group_size; i++) {
+ strcpy(ctx->workers[i].name, priv->replica_group_members[i]);
+ }
+
+ control_s = RD_CALLOC (replica_group_size,
+ sizeof(nsr_per_node_worker_t),
+ gf_mt_recon_per_node_worker_t);
+ if (!control_s) {
+ nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n");
+ return NULL;
+ }
+
+ data_s = RD_CALLOC (replica_group_size,
+ sizeof(nsr_per_node_worker_t),
+ gf_mt_recon_per_node_worker_t);
+ if (!data_s) {
+ nsr_driver_log (this->name, GF_LOG_ERROR, "memory allocation error \n");
+ return NULL;
+ }
+ for (i=0; i < replica_group_size; i++) {
+ ctx->workers[i].control_worker = &control_s[i];
+ if (asprintf(&con_name,"recon-con-%u",i) < 1) {
+ return NULL;
+ }
+ ctx->workers[i].control_worker->fp = recon_create_log
+ (priv->replica_group_members[0], con_name);
+ if (!ctx->workers[i].control_worker->fp)
+ return NULL;
+ ctx->workers[i].data_worker = &data_s[i];
+ if (asprintf (&data_name,"recon-data-%u",i) <1) {
+ return NULL;
+ }
+ ctx->workers[i].data_worker->fp = recon_create_log
+ (priv->replica_group_members[0], data_name);
+ if (!ctx->workers[i].data_worker->fp)
+ return NULL;
+ }
+
+ nsr_driver_log (this->name, GF_LOG_INFO, "creating threads \n");
+ // Create the worker threads
+ // For every brick including itself there will be 2 worker threads:
+ // one for data and one for control
+ if (!create_worker_threads(priv, ctx, control_s, _gf_true,
+ (F_t) control_worker_main, replica_group_size) ||
+ !create_worker_threads(priv, ctx, data_s, _gf_false,
+ (F_t) data_worker_main, replica_group_size)) {
+ return NULL;
+ }
+
+ for (i=0; i < replica_group_size; i++) {
+ nsr_recon_get_file(priv->volname, &(ctx->workers[i]));
+ }
+
+ while (1) {
+
+ nsr_role_work_t *rr;
+
+ nsr_driver_log (this->name, GF_LOG_INFO, "waiting for role to be queued \n");
+ pthread_mutex_lock(&(ctx->mutex));
+ while (list_empty(&(ctx->role_head.list))) {
+ pthread_cond_wait(&(ctx->cv), &(ctx->mutex));
+ }
+ pthread_mutex_unlock(&(ctx->mutex));
+
+ list_for_each_entry(rr, &(ctx->role_head.list), list) {
+ nsr_recon_driver_state_t state;
+ state = nsr_recon_driver_get_role(&status, ctx, rr);
+
+ if (status == -1) {
+ op_errno = EIO;
+ goto out;
+ }
+
+ switch (state) {
+
+ case leader:
+ if (!nsr_recon_driver_leader(priv)) {
+ goto out;
+ }
+ break;
+ case reconciliator:
+ if (!nsr_recon_driver_reconciliator(priv)) {
+ goto out;
+ }
+ break;
+ case resolutor:
+ if (!nsr_recon_driver_resolutor(priv)) {
+ goto out;
+ }
+ break;
+
+ case joiner:
+
+ nsr_driver_log (this->name, GF_LOG_INFO, "getting last term info from all members of this group\n");
+ // Get last term info from all members for this group
+ // which will be the leader(this node) and the node that wants to join.
+ send_and_wait(&status, &op_errno, -1,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_GET_LAST_TERM_INFO,
+ NSR_RECON_QUEUE_TO_CONTROL, ctx->current_term);
+ if (status == -1)
+ goto out;
+
+
+ // send message to other node that just joined to sync up with this node which is also the leader
+ nsr_driver_log (this->name, GF_LOG_INFO, "sending resolution work to all nodes except this\n");
+ bm = ~(1);
+ send_and_wait(&status, &op_errno, bm,
+ replica_group_size,
+ ctx,
+ NSR_WORK_ID_RESOLUTION_DO_WORK,
+ NSR_RECON_QUEUE_TO_CONTROL, -1);
+ if (status == -1)
+ goto out;
+
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished recon work as joiner \n");
+ break;
+
+ default:
+ nsr_driver_log (this->name, GF_LOG_ERROR,
+ "bad state %d", state);
+ }
+
+
+ // free the asasociated recon_info contexts created as part of this role
+
+out:
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "sending end of reconciliation message \n");
+ nsr_recon_return_back(priv, ctx->term, status, op_errno);
+ nsr_driver_log (this->name, GF_LOG_INFO,
+ "finished sending end of reconciliation message \n");
+ }
+ list_del_init (&rr->list);
+ }
+
+ return NULL;
+}
diff --git a/xlators/cluster/nsr-recon/src/recon_driver.h b/xlators/cluster/nsr-recon/src/recon_driver.h
new file mode 100644
index 000000000..3efb26269
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/recon_driver.h
@@ -0,0 +1,325 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __RECON_DRIVER_H__
+#define __RECON_DRIVER_H__
+
+
+#include "api/src/glfs.h"
+
+#define MAX_HOSTNAME_LEN 32
+#define MAXIMUM_REPLICA_STRENGTH 8
+#define MAX_RECONCILIATION_WINDOW_SIZE 10000
+
+#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd"
+#define GLUSTERD_VOLUME_DIR_PREFIX "vols"
+#define GLUSTERD_BRICK_INFO_DIR "bricks"
+
+/*
+ * Even with the names fixed, the non-NSR_DEBUG definitions of nsr_*_log don't
+ * work because many callers don't have "this" defined.
+ *
+ * TBD: use gf_log, fix "this" problem, eliminate extra fields and newlines.
+ */
+#define NSR_DEBUG
+
+typedef enum nsr_recon_work_req_id_t {
+ NSR_WORK_ID_GET_NONE = 0,
+ NSR_WORK_ID_GET_LAST_TERM_INFO = NSR_WORK_ID_GET_NONE + 1,
+ NSR_WORK_ID_GET_GIVEN_TERM_INFO = NSR_WORK_ID_GET_LAST_TERM_INFO + 1,
+ NSR_WORK_ID_RECONCILIATOR_DO_WORK = NSR_WORK_ID_GET_GIVEN_TERM_INFO + 1,
+ NSR_WORK_ID_RESOLUTION_DO_WORK = NSR_WORK_ID_RECONCILIATOR_DO_WORK + 1,
+ NSR_WORK_ID_GET_RECONCILATION_WINDOW = NSR_WORK_ID_RESOLUTION_DO_WORK + 1,
+ NSR_WORK_ID_SINGLE_RECONCILIATION_READ = NSR_WORK_ID_GET_RECONCILATION_WINDOW + 1,
+ NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT = NSR_WORK_ID_SINGLE_RECONCILIATION_READ + 1,
+ NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH = NSR_WORK_ID_SINGLE_RECONCILIATION_COMMIT + 1,
+ NSR_WORK_ID_GET_RESOLUTION_WINDOW = NSR_WORK_ID_SINGLE_RECONCILIATION_FLUSH + 1,
+ NSR_WORK_ID_END_RECONCILIATION = NSR_WORK_ID_GET_RESOLUTION_WINDOW + 1,
+ NSR_WORK_ID_INI = NSR_WORK_ID_END_RECONCILIATION + 1,
+ NSR_WORK_ID_FINI = NSR_WORK_ID_INI + 1
+} nsr_recon_work_req_id_t;
+
+typedef enum nsr_recon_queue_type_t {
+ NSR_RECON_QUEUE_TO_CONTROL = 0,
+ NSR_RECON_QUEUE_TO_DATA =NSR_RECON_QUEUE_TO_CONTROL + 1,
+} nsr_recon_queue_type_t;
+
+typedef enum nsr_log_type_t {
+ NSR_LOG_HOLE = 0b0,
+ NSR_LOG_PSEUDO_HOLE = 0b1,
+ NSR_LOG_FILL = 0b11
+} nsr_log_type_t;
+
+typedef enum nsr_mode_t {
+ NSR_SEQ = 0,
+ NSR_USE_THREADS = 1,
+ NSR_ASYNC = 2
+} nsr_mode_t;
+
+typedef enum nsr_recon_work_type_t {
+ NSR_RECON_WORK_NONE = 0,
+ NSR_RECON_WORK_HOLE_TO_NOOP = NSR_RECON_WORK_NONE + 1,
+ NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE = NSR_RECON_WORK_HOLE_TO_NOOP + 1,
+ NSR_RECON_WORK_COMPARE_PSEUDO_HOLE = NSR_RECON_WORK_HOLE_TO_PSEUDO_HOLE + 1,
+ NSR_RECON_WORK_HOLE_TO_FILL = NSR_RECON_WORK_COMPARE_PSEUDO_HOLE + 1,
+ NSR_RECON_WORK_UNDO_FILL = NSR_RECON_WORK_HOLE_TO_FILL + 1,
+} nsr_recon_work_type_t;
+
+typedef enum nsr_recon_driver_state_t {
+ none = 0,
+ leader = 1,
+ reconciliator = 2,
+ resolutor = 3,
+ joiner = 4,
+} nsr_recon_driver_state_t;
+
+// role structure
+#pragma pack(push, 1)
+typedef struct _nsr_recon_role_s {
+ uint32_t role; // leader, reconciliator, resolutor
+ uint32_t num; // required in case state is reconciliator
+ uint32_t current_term; // current term used in case of leader
+ // In case this is reconciliator, num is set to nodes that were part
+ // of previous term.
+ // In case this is resolutor, num is set to 2.
+ // info[0] - information for this node.
+ // info[1] - information of the reconciliator.
+ // In case this is leader, num is set to this term's membership list
+ // set info.name to all members including the leader
+ struct {
+ int32_t last_term;
+ int32_t commited_ops;
+ uint32_t last_index;
+ uint32_t first_index;
+ char name[MAX_HOSTNAME_LEN];
+ } info[MAXIMUM_REPLICA_STRENGTH];
+} nsr_recon_role_t;
+#pragma pack(pop)
+
+#define ENDIAN_CONVERSION_RR(rr, is_true) \
+{ \
+ uint32_t i=0; \
+ uint32_t (*f)(uint32_t) = ((is_true == _gf_true) ? ntohl : htonl); \
+ if (is_true == _gf_true) rr.num = f(rr.num); \
+ rr.current_term = f(rr.current_term); \
+ for (i=0; i < rr.num; i++) { \
+ rr.info[i].last_term = f(rr.info[i].last_term); \
+ rr.info[i].commited_ops = f(rr.info[i].commited_ops); \
+ rr.info[i].last_index = f(rr.info[i].last_index); \
+ rr.info[i].first_index = f(rr.info[i].first_index); \
+ } \
+ if (is_true == _gf_false) rr.num = f(rr.num); \
+}
+
+// last term info structure
+#pragma pack(push, 1)
+typedef struct _nsr_recon_last_term_info_s {
+ int32_t last_term;
+ int32_t commited_ops;
+ uint32_t last_index;
+ uint32_t first_index;
+} nsr_recon_last_term_info_t;
+#pragma pack(pop)
+
+#define ENDIAN_CONVERSION_LT(lt, is_true) \
+{ \
+ uint32_t (*f)(uint32_t) = ((is_true == _gf_true) ? ntohl : htonl); \
+ lt.last_term = f(lt.last_term); \
+ lt.commited_ops = f(lt.commited_ops); \
+ lt.last_index = f(lt.last_index); \
+ lt.first_index = f(lt.first_index); \
+}
+
+// log information
+#pragma pack(push, 1)
+typedef struct _nsr_recon_log_info_s {
+ uint32_t term;
+ uint32_t first_index;
+ uint32_t last_index;
+} nsr_recon_log_info_t;
+#pragma pack(pop)
+
+#define ENDIAN_CONVERSION_LI(li, is_true) \
+{ \
+ uint32_t (*f)(uint32_t) = ((is_true == _gf_true) ? ntohl : htonl); \
+ li.term = f(li.term); \
+ li.first_index = f(li.first_index); \
+ li.last_index = f(li.last_index); \
+}
+
+#pragma pack(push, 1)
+typedef struct nsr_recon_record_details_s {
+ uint32_t type;
+ uint32_t op;
+ char gfid[36+1];
+ char pargfid[36+1];
+ char link_path[256]; // should it be PATH_MAX?
+ uint32_t offset;
+ uint32_t len;
+ char entry[128];
+ char newloc[128]; // for rename. can you overload link_path for this? TBD
+ mode_t mode;
+} nsr_recon_record_details_t;
+#pragma pack(pop)
+
+#define ENDIAN_CONVERSION_RD(rd, is_true) \
+{ \
+ uint32_t (*f)(uint32_t) = ((is_true == _gf_true) ? ntohl : htonl); \
+ rd.type = f(rd.type); \
+ rd.op = f(rd.op); \
+ rd.offset = f(rd.offset); \
+ rd.len = f(rd.len); \
+}
+
+typedef struct _nsr_role_work_s {
+ nsr_recon_role_t role;
+ uint32_t term;
+ struct list_head list;
+} nsr_role_work_t;
+
+typedef struct _nsr_recon_work_s {
+ gf_boolean_t in_use;
+ uint32_t index;
+ uint32_t req_id;
+ struct list_head list;
+} nsr_recon_work_t;
+
+typedef struct _nsr_reconciliation_work_s {
+ uint32_t term;
+ uint32_t index;
+ uint32_t type;
+ uint32_t source;
+ void *data;
+
+ uint32_t num; // used for xattr
+
+} nsr_reconciliation_work_t;
+
+typedef struct _nsr_reconciliation_record_s {
+ nsr_reconciliation_work_t work; // will store the computed work
+ nsr_recon_record_details_t rec;
+} nsr_reconciliation_record_t;
+
+typedef struct _nsr_reconciliator_info {
+ uint32_t reconcilator_index;
+ int32_t last_term;
+ int32_t commited_ops;
+ uint32_t last_index;
+ uint32_t first_index;
+ //nsr_reconciliation_record_t records[MAX_RECONCILIATION_WINDOW_SIZE];
+ nsr_reconciliation_record_t *records;
+} nsr_reconciliator_info_t;
+
+typedef struct _nsr_per_node_worker_s {
+ char *id; // identifier
+ char vol_file[256]; //volfile that will be used by this thread
+ glfs_t *fs;
+ glfs_fd_t *aux_fd;
+ uint32_t index; // index into array of workers
+ pthread_t thread_id; // thread id
+ void * context; // thread context
+ struct _nsr_recon_driver_ctxt *driver_ctx;
+ char local; // local data worker
+ //struct list_head list; //list of work items
+ nsr_recon_work_t head;
+ pthread_mutex_t mutex; //mutex to guard the state
+ pthread_cond_t cv; //condition variable for signaling the worker thread
+ gf_boolean_t is_control;
+#if defined(NSR_DEBUG)
+ FILE *fp;
+#endif
+ int32_t result; // result of latest work
+ int32_t op_errno; // errno
+} nsr_per_node_worker_t;
+
+typedef struct _nsr_replica_worker_s {
+ char name[256];
+ nsr_per_node_worker_t *control_worker;
+ nsr_per_node_worker_t *data_worker;
+ gf_boolean_t in_use;
+ nsr_reconciliator_info_t *recon_info; // Bunch of infos kept for this reconciliation
+} nsr_replica_worker_t;
+
+typedef struct _nsr_recon_driver_ctxt {
+ xlator_t *this;
+ uint32_t replica_group_size; // number of static members of replica group
+ nsr_replica_worker_t *workers; // worker info
+ int32_t reconciliator;
+ pthread_mutex_t mutex;
+ pthread_cond_t cv;
+ nsr_role_work_t role_head;
+ volatile int32_t outstanding;
+ uint32_t reconciliator_index;
+ uint32_t term;
+ uint32_t current_term;
+ nsr_mode_t mode; // default set to seq
+#if defined(NSR_DEBUG)
+ FILE *fp;
+#endif
+} nsr_recon_driver_ctx_t;
+
+void *
+nsr_reconciliation_driver(void *);
+
+gf_boolean_t
+nsr_recon_driver_set_role(nsr_recon_driver_ctx_t *ctx, nsr_recon_role_t *rr, uint32_t term);
+
+#define atomic_inc(ptr) ((void) __sync_fetch_and_add(ptr, 1))
+#define atomic_dec(ptr) ((void) __sync_fetch_and_add(ptr, -1))
+#define atomic_fetch_and __sync_fetch_and_and
+#define atomic_fetch_or __sync_fetch_and_or
+
+#if defined(NSR_DEBUG)
+
+#define NSR_LOG_DIR "/var/log/nsr-logs"
+
+extern int nsr_debug_level;
+extern FILE *recon_create_log (char *member, char *module);
+
+extern void
+_nsr_driver_log (const char *func, int line, char *member, FILE *fp,
+ char *fmt, ...);
+
+#define nsr_driver_log(dom, levl, fmt...) do { \
+ FMT_WARN (fmt); \
+ if (levl <= nsr_debug_level) { \
+ nsr_recon_private_t *priv = ctx->this->private; \
+ _nsr_driver_log (__FUNCTION__, __LINE__, \
+ priv->replica_group_members[0], \
+ ctx->fp, \
+ ##fmt); \
+ } \
+} while (0)
+
+extern void
+_nsr_worker_log (const char *func, int line, char *member,
+ char *type, uint32_t index, FILE *fp,
+ char *fmt, ...);
+
+#define nsr_worker_log(dom, levl, fmt...) do { \
+ FMT_WARN (fmt); \
+ if (levl <= nsr_debug_level) { \
+ nsr_recon_private_t *priv; \
+ priv = ctx->driver_ctx->this->private; \
+ _nsr_worker_log (__FUNCTION__, __LINE__, \
+ priv->replica_group_members[0], \
+ ctx->is_control ? "recon-con" : \
+ "recon-data", \
+ ctx->index, ctx->fp, \
+ ##fmt); \
+ } \
+} while (0)
+
+#else
+#define nsr_driver_log(dom, levl, fmt...) gf_log(dom, levl, fmt)
+#define nsr_worker_log(dom, levl, fmt...) gf_log(dom, levl, fmt)
+#endif
+
+#endif /* #ifndef __RECON_DRIVER_H__ */
diff --git a/xlators/cluster/nsr-recon/src/recon_xlator.c b/xlators/cluster/nsr-recon/src/recon_xlator.c
new file mode 100644
index 000000000..272c35dc2
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/recon_xlator.c
@@ -0,0 +1,1010 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/types.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+
+#include "recon_driver.h"
+#include "recon_xlator.h"
+
+typedef struct _nsr_recon_fd_s {
+ int32_t term;
+ nsr_recon_driver_state_t state;
+ uint32_t first_index;
+ uint32_t last_index;
+ call_frame_t *frame;
+} nsr_recon_fd_t;
+
+#if defined(NSR_DEBUG)
+
+void
+_recon_main_log (const char *func, int line, char *member, FILE *fp,
+ char *fmt, ...)
+{
+ va_list ap;
+ char *buf = NULL;
+ int retval;
+
+ if (!fp) {
+ fp = recon_create_log(member,"recon-main-log");
+ if (!fp) {
+ return;
+ }
+ }
+
+ va_start(ap,fmt);
+ retval = vasprintf(&buf,fmt,ap);
+ if (buf) {
+ fprintf(fp,"[%s:%d] %.*s\n",func,line,retval,buf);
+ free(buf);
+ }
+ va_end(ap);
+}
+
+#endif
+
+// Given fd, get back the NSR based fd context.
+static int32_t this_fd_ctx_get(fd_t *fd, xlator_t *this, nsr_recon_fd_t **rfd)
+{
+ uint64_t tmp = 0;
+ int32_t ret = -1;
+
+ if ((ret = fd_ctx_get(fd, this, &tmp)) != 0) {
+ return ret;
+ } else {
+ *rfd = (nsr_recon_fd_t *)tmp;
+ return 0;
+ }
+}
+
+// Add the frame in q after associating with term
+// term usage tbd
+static void put_frame(nsr_recon_private_t *priv,
+ call_frame_t *frame,
+ uint32_t term)
+{
+ xlator_t *this = priv->this;
+ recon_main_log (this->name, GF_LOG_INFO, "adding frame for term %d \n", term);
+ priv->frame = frame;
+ return;
+}
+
+// get the frame from the queue given the term
+// term usage tbd
+static void get_frame(nsr_recon_private_t *priv,
+ call_frame_t **frame,
+ uint32_t term)
+{
+ if (frame != NULL)
+ *frame = priv->frame;
+ priv->frame = NULL;
+ return;
+}
+
+// check if there are outstanding frames
+static gf_boolean_t is_frame(nsr_recon_private_t *priv)
+{
+ return((priv->frame != NULL) ? _gf_true : _gf_false);
+}
+
+#define ENTRY_SIZE 128
+
+long
+get_entry_count (char *path)
+{
+ int fd;
+ struct stat buf;
+ unsigned long entries = -1;
+ long min; /* last entry not known to be empty */
+ long max; /* first entry known to be empty */
+ long curr;
+ char entry[ENTRY_SIZE];
+ void *err_label = &&done;
+
+ fd = open(path,O_RDONLY);
+ if (fd < 0) {
+ goto *err_label;
+ }
+ err_label = &&close_fd;
+
+ if (fstat(fd,&buf) < 0) {
+ goto *err_label;
+ }
+
+ min = 0;
+ max = buf.st_size / ENTRY_SIZE;
+ printf("max = %ld\n",max);
+
+ while ((min+1) < max) {
+ curr = (min + max) / 2;
+ printf("trying entry %ld\n",curr);
+ if (lseek(fd,curr*ENTRY_SIZE,SEEK_SET) < 0) {
+ goto *err_label;
+ }
+ if (read(fd,entry,sizeof(entry)) != sizeof(entry)) {
+ goto *err_label;
+ }
+ if ((entry[0] == '_') && (entry[1] == 'P')) {
+ min = curr;
+ }
+ else {
+ max = curr;
+ }
+ }
+
+ entries = max;
+
+close_fd:
+ close(fd);
+done:
+ return entries;
+}
+
+// Get the term info for the term number specified
+void nsr_recon_libchangelog_get_this_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt)
+{
+ char path[PATH_MAX];
+ long entries;
+
+ bzero(lt, sizeof(nsr_recon_last_term_info_t));
+ lt->last_term = term;
+ sprintf(path,"%s/%s%d",bp,"TERM.",term);
+ entries = get_entry_count(path);
+ if (entries > 1) {
+ /* The first entry is actually a header. */
+ lt->first_index = 1;
+ /*
+ * This seems wrong, because it means that last_index*128 will
+ * be exactly at EOF and commited_ops will be one greater than
+ * it should be. Maybe some other code makes the exact
+ * opposite mistake to compensate.
+ */
+ lt->last_index = lt->commited_ops = (int)entries;
+ }
+ recon_main_log (this->name, GF_LOG_INFO, "for term=%d got first_index=%d last_index=%d commited_ops=%d\n",
+ term, lt->first_index, lt->last_index, lt->commited_ops);
+ return;
+}
+
+// Given the term number, find the last term in the changelogs
+void nsr_recon_libchangelog_get_last_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt)
+{
+ uint32_t t = term;
+ struct stat buf;
+ char path[PATH_MAX];
+ bzero(lt, sizeof(nsr_recon_last_term_info_t));
+ while(t) {
+ // journal file is of type TERM-1.jnl
+ sprintf(path,"%s/%s%d",bp,"TERM.",t);
+ if (!stat(path, &buf)) {
+ nsr_recon_libchangelog_get_this_term_info(this, bp, t, lt);
+ recon_main_log (this->name, GF_LOG_INFO, "got last term given current term %d as %d\n", term, t);
+ return;
+ }
+ t--;
+ }
+ recon_main_log (this->name, GF_LOG_INFO, "got no last term given current term %d \n", term);
+
+ return;
+}
+
+// Return back the frame stored against the term
+void nsr_recon_return_back(nsr_recon_private_t *priv, uint32_t term, int32_t status, int32_t op_errno)
+{
+ call_frame_t *old_frame = NULL;
+ xlator_t *this = priv->this;
+
+ get_frame(priv, &old_frame, term);
+ if (old_frame) {
+ recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev returns old frame \n");
+ // first return the original write for which this ack was sent
+ STACK_UNWIND_STRICT (writev, old_frame, status, op_errno, NULL, NULL, NULL);
+ } else {
+ recon_main_log (this->name, GF_LOG_ERROR, "EIII---nsr_recon_writev cnnot return old frame \n");
+ }
+}
+
+typedef enum records_type_t {
+ fop_gfid_pgfid_oldloc_newloc = 1,
+ fop_gfid_pgfid_entry = fop_gfid_pgfid_oldloc_newloc + 1,
+ fop_gfid = fop_gfid_pgfid_entry + 1 ,
+ fop_gfid_offset = fop_gfid + 1,
+ fop_gfid_offset_len = fop_gfid_offset + 1,
+} records_type_t;
+
+// Get the backend ./glusterfs/xx/xx/<...> path
+static void
+get_gfid_path(nsr_recon_private_t *priv, char *gfid, char *path)
+{
+ strcpy(path, priv->base_dir);
+ strcat(path, "/.glusterfs/");
+ strncat(path,gfid,2);
+ strcat(path,"/");
+ strncat(path,gfid+2,2);
+ strcat(path,"/");
+ strcat(path,gfid);
+}
+
+
+// Get the link to which backend points to
+static gf_boolean_t
+get_link_using_gfid(nsr_recon_private_t *priv, char *gfid, char *path)
+{
+ char lp[PATH_MAX];
+ xlator_t *this = priv->this;
+ get_gfid_path(priv,gfid, lp);
+ if (readlink(lp, path, 255) == -1) {
+ GF_ASSERT(0);
+ recon_main_log(priv->this, GF_LOG_ERROR,
+ "cannot get readlink for %s\n",lp);
+ return _gf_false;
+ }
+ return _gf_true;
+}
+
+// Get the list of changelog records given a term , first and last index.
+//
+// TBD: rewrite this hideous ball of mud in at least the following ways:
+//
+// (1) Break out the code for handling a single record into a separate
+// function, to make error handling easier and reduce "indentation
+// creep" so the code's readable.
+//
+// (2) Change all of the fop_xxx_yyy nonsense to OR together values
+// like FOP_HAS_FIELD_XXX and FOP_HAS_FIELD_YYY, to reduce code
+// duplication and facilitate the addition of new fields.
+//
+// (3) Stop making so many assumptions about the underlying formats.
+// The code as it is won't even work for the existing binary format,
+// let alone as changelog evolves over time.
+//
+// Really, 90% of this code should just GO AWAY in favor of using
+// libgfchangelog, enhanced as necessary to support our needs.
+
+/*
+ * Use this macro to skip over a field we're not using yet.
+ * NB: the body is a null statement on purpose
+ * TBD: all instances of this should be removed eventually!
+ */
+#define SKIP_FIELD do /* nothing */ ; while (*(start++) != '\0')
+
+#define SKIP_OVER
+gf_boolean_t nsr_recon_libchangelog_get_records(xlator_t *this, char *bp, int32_t term, uint32_t first, uint32_t last, void *buf)
+{
+ // do a mmap; seek into the first and read all records till last.
+ // TBD - right now all records are pseudo holes but mark them as fills.
+ // TBD - pseudo hole to be implemented when actual fsync gets done on data.
+ char *rb = NULL, *orig = NULL;
+ char path[PATH_MAX];
+ int fd;
+ uint32_t index = 0;
+
+ recon_main_log (this->name, GF_LOG_INFO,
+ "libchangelog_get_records called for term %d index from %d to %d \n",
+ term, first, last );
+
+ orig = rb = GF_CALLOC(128, ((last - first) + 1),
+ gf_mt_recon_changelog_buf_t);
+
+ sprintf(path,"%s/%s%d",bp,"TERM.",term);
+ fd = open(path, O_RDONLY);
+ if (fd == -1) {
+ return _gf_false;
+ } else {
+ char *start = NULL;
+ nsr_recon_record_details_t * rec = (nsr_recon_record_details_t *)buf;
+
+ if (first == 0)
+ lseek(fd, 128, SEEK_SET);
+ else
+ lseek(fd, first * 128, SEEK_SET);
+ if (read(fd, rb, (last - first + 1) * 128) == -1) {
+ return _gf_false;
+ }
+ start = rb;
+ index = first;
+ do {
+ recon_main_log (this->name, GF_LOG_INFO,
+ "libchangelog_get_records start inspecting records at index %d \n",
+ index );
+ if (!strncmp(start, "_PRE_", 5)) {
+ uint32_t i;
+ uint32_t opcode = 0;
+ records_type_t type;
+
+ start += 5;
+ // increment by the NULLs after the PRE
+ start += 4;
+ SKIP_FIELD; // real index
+ // now we have the opcode
+ while (*start != '\0') {
+ opcode *= 10;
+ opcode += (*(start++) - '0');
+ }
+ ++start;
+ recon_main_log (this->name, GF_LOG_ERROR,
+ "libchangelog_get_records: got opcode %d @index %d\n", opcode, index);
+ if ((opcode == GF_FOP_RENAME)) {
+ type = fop_gfid_pgfid_oldloc_newloc;
+ } else if ((opcode == GF_FOP_UNLINK) ||
+ (opcode == GF_FOP_RMDIR) ||
+ (opcode == GF_FOP_LINK) ||
+ (opcode == GF_FOP_MKDIR) ||
+ (opcode == GF_FOP_SYMLINK) ||
+ (opcode == GF_FOP_MKNOD) ||
+ (opcode == GF_FOP_CREATE)) {
+ type = fop_gfid_pgfid_entry;
+ } else if ((opcode == GF_FOP_FSETATTR) ||
+ (opcode == GF_FOP_SETATTR) ||
+ (opcode == GF_FOP_FREMOVEXATTR) ||
+ (opcode == GF_FOP_REMOVEXATTR) ||
+ (opcode == GF_FOP_SETXATTR) ||
+ (opcode == GF_FOP_FSETXATTR)) {
+ type = fop_gfid;
+ } else if ((opcode == GF_FOP_TRUNCATE) ||
+ (opcode == GF_FOP_FTRUNCATE)) {
+ type = fop_gfid_offset;
+ } else if (opcode == GF_FOP_WRITE) {
+ type = fop_gfid_offset_len;
+ } else {
+ recon_main_log (this->name,
+ GF_LOG_ERROR,
+ "libchangelog_get_records:got no proper opcode %d @index %d\n",
+ opcode, index);
+ //GF_ASSERT(0);
+ // make this as a hole.
+ // TBD - check this logic later. maybe we should raise alarm here because
+ // this means that changelog is corrupted. We are not handling changelog
+ // corruptions as of now.
+ rec->type = NSR_LOG_HOLE;
+ goto finish;
+ }
+ // TBD - handle psuedo holes once that logic is in.
+ rec->type = NSR_LOG_FILL;
+ recon_main_log (this->name, GF_LOG_ERROR,
+ "libchangelog_get_records:got type %d at index %d \n",
+ rec->type, index);
+ rec->op = opcode;
+
+ // Now get the gfid and parse it
+ // before that increment the pointer
+ for (i=0; i < 36; i++) {
+ rec->gfid[i] = (*start);
+ start++;
+ }
+ rec->gfid[i] = '\0';
+
+ GF_ASSERT(*start == 0);
+ start ++;
+
+ if (opcode == GF_FOP_SYMLINK) {
+ i = 0;
+ do {
+ if (i >= 256) {
+ goto finish;
+ }
+ rec->link_path[i++] = *start;
+ } while (*(start++) != '\0');
+ }
+
+ i = 0;
+ // If type is fop_gfid_offset+_len, get offset
+ if ((type == fop_gfid_offset) || (type == fop_gfid_offset_len)) {
+ char offset_str[128];
+ while(*start != 0) {
+ offset_str[i++] = *start;
+ start ++;
+ }
+ offset_str[i] = '\0';
+ // get over the 0
+ start++;
+ rec->offset = strtoul(offset_str, NULL, 10);
+ recon_main_log (this->name,
+ GF_LOG_ERROR,
+ "libchangelog_get_records:got offset %d @index %d \n", rec->offset, index);
+
+ }
+ i = 0;
+ if (type == fop_gfid_offset_len) {
+ char len_str[128];
+ while(*start != 0) {
+ len_str[i++] = *start;
+ start ++;
+ }
+ len_str[i] = '\0';
+ // get over the 0
+ start++;
+ rec->len = strtoul(len_str, NULL, 10);
+ recon_main_log (this->name,
+ GF_LOG_ERROR,
+ "libchangelog_get_records:got length %d @index %d \n", rec->len, index);
+ }
+ i = 0;
+ if (type == fop_gfid_pgfid_entry) {
+ switch (opcode) {
+ case GF_FOP_CREATE:
+ case GF_FOP_MKDIR:
+ case GF_FOP_MKNOD:
+ SKIP_FIELD; // mode
+ break;
+ /* TBD: handle GF_FOP_SYMLINK target */
+ default:
+ ;
+ }
+ SKIP_FIELD; // uid
+ SKIP_FIELD; // gid
+ if (opcode == GF_FOP_MKNOD) {
+ SKIP_FIELD; // dev
+ }
+ // first get the gfid and then the path
+ for (i=0; i < 36; i++) {
+ rec->pargfid[i] = (*start);
+ start++;
+ }
+ rec->pargfid[i] = '\0';
+ GF_ASSERT(*start == '/');
+ start ++;
+
+ i = 0;
+ while(*start != 0) {
+ rec->entry[i++] = *start;
+ start ++;
+ }
+ rec->entry[i] = '\0';
+ // get over the 0
+ start++;
+ /*
+ * Having to add this as a special case
+ * is awful. See the function header
+ * comment for the real solution.
+ */
+ if (opcode == GF_FOP_CREATE) {
+ rec->mode = 0;
+ while (*start != '\0') {
+ rec->mode *= 10;
+ rec->mode += *start
+ - '0';
+ ++start;
+ }
+ ++start;
+ }
+ recon_main_log (this->name,
+ GF_LOG_ERROR,
+ "libchangelog_get_records:got entry %s @index %d \n", rec->entry, index);
+
+ }
+ i = 0;
+ if (type == fop_gfid_pgfid_oldloc_newloc) {
+
+ // first get the source and then the destination
+ // source stuff gets stored in pargfid/entry
+ for (i=0; i < 36; i++) {
+ rec->pargfid[i] = (*start);
+ start++;
+ }
+ rec->pargfid[i] = '\0';
+ GF_ASSERT(*start == '/');
+ start ++;
+
+ i=0;
+ while(*start != 0) {
+ rec->entry[i++] = *start;
+ start ++;
+ }
+ rec->entry[i] = '\0';
+ // get over the 0
+ start++;
+
+ // dst stuff gets stored in gfid/newloc
+ for (i=0; i < 36; i++) {
+ rec->gfid[i] = (*start);
+ start++;
+ }
+ rec->gfid[i] = '\0';
+ GF_ASSERT(*start == '/');
+ start ++;
+ i = 0;
+ while(*start != 0) {
+ rec->newloc[i++] = *start;
+ start ++;
+ }
+ rec->newloc[i] = '\0';
+ // get over the 0
+ start++;
+
+ }
+ ENDIAN_CONVERSION_RD((*rec), _gf_false); //htonl
+ }
+finish:
+ if (index == last)
+ break;
+ index++;
+ rb += 128;
+ start = rb;
+ rec++;
+ } while(1);
+ }
+ GF_FREE(orig);
+ close(fd);
+
+ recon_main_log (this->name, GF_LOG_INFO,
+ "libchangelog_get_records finsihed inspecting records for term %d \n",
+ term);
+ return _gf_true;
+}
+
+int32_t
+nsr_recon_open (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, fd_t *fd, dict_t *xdata)
+{
+ int32_t op_ret = 0;
+ int32_t op_errno = 0;
+ nsr_recon_fd_t *rfd = NULL;
+
+ recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_open called for path %s \n",loc->path );
+ rfd = GF_CALLOC (1, sizeof (*rfd), gf_mt_recon_fd_t);
+ if (!rfd) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ }
+
+ op_ret = fd_ctx_set (fd, this, (uint64_t)(long)rfd);
+ if (op_ret) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ }
+ recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_open returns with %d for path %s \n",op_ret,loc->path );
+ STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, NULL);
+ return 0;
+}
+
+int32_t
+nsr_recon_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+ nsr_recon_fd_t *rfd = NULL;
+ nsr_recon_private_t *priv = NULL;
+ int32_t op_ret = 0;
+ int32_t op_errno = 0;
+ int32_t ret = 0;
+
+ ret = this_fd_ctx_get (fd, this, &rfd);
+ if (ret < 0) {
+ return -1;
+ }
+ priv = (nsr_recon_private_t *)this->private;
+
+ recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev called for offset %d \n",(unsigned int)offset );
+ GF_ASSERT(count == 1);
+ switch (offset) {
+ // client(brick, leader) writes the role of the node
+ case nsr_recon_xlator_sector_1 :
+ {
+ nsr_recon_role_t rr;
+ memcpy((void *)&rr, (void *)vector[0].iov_base, sizeof(rr));
+ ENDIAN_CONVERSION_RR(rr, _gf_true); //ntohl
+
+ recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_writev called to set role %d\n", rr.role);
+ if ((rr.role != leader) &&
+ (rr.role != reconciliator) &&
+ (rr.role != resolutor) &&
+ (rr.role != joiner)) {
+ recon_main_log (this->name, GF_LOG_ERROR,
+ "EIII---nsr_recon_writev cannot set state \n");
+ STACK_UNWIND_STRICT (writev, frame, -1, op_errno,
+ NULL, NULL, NULL);
+ }
+
+ GF_ASSERT(rr.num <= MAXIMUM_REPLICA_STRENGTH);
+
+ // Check if already a role play is going on. If yes return with EAGAIN.
+ // Ideally we should check if we have got a higher term number while
+ // servicing a lower term number; if so abort the older one.
+ // However the abort infrastructure needs to be sketched properly; TBD.
+ if (is_frame(priv) == _gf_true) {
+ recon_main_log (this->name, GF_LOG_ERROR,
+ "nsr_recon_writev set_role - already role play \n");
+ STACK_UNWIND_STRICT (writev, frame, -1, EAGAIN,
+ NULL, NULL, NULL);
+ } else {
+
+ // Store the stack frame so that when the actual job gets finished
+ // we send the response back to the brick.
+ put_frame(priv, frame, rr.current_term);
+ if (nsr_recon_driver_set_role(priv->driver_thread_context,
+ &rr,
+ rr.current_term) == _gf_false) {
+ get_frame(priv, NULL, rr.current_term);
+ recon_main_log (this->name, GF_LOG_ERROR,
+ "nsr_recon_writev set_role - cannot seem to set role \n");
+ STACK_UNWIND_STRICT (writev, frame, -1, op_errno,
+ NULL, NULL, NULL);
+ } else {
+ recon_main_log (this->name, GF_LOG_INFO,
+ "nsr_recon_writev set_role - set role succesfully \n");
+ }
+ }
+ break;
+ }
+ // client(reconciliator) writes how much it needs for the read
+ case nsr_recon_xlator_sector_2 :
+ {
+ nsr_recon_log_info_t li;
+ memcpy((void *)&li, (void *)vector[0].iov_base, sizeof(li));
+ ENDIAN_CONVERSION_LI(li, _gf_true); //ntohl
+
+ recon_main_log (this->name, GF_LOG_INFO,
+ "nsr_recon_writev - setting term info for reconcilation info. term=%d, first_index=%d,start_index=%d \n",
+ li.term, li.first_index, li.last_index);
+ rfd->term = li.term;
+ rfd->last_index = li.last_index;
+ rfd->first_index = li.first_index;
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno,
+ NULL, NULL, NULL);
+ break;
+ }
+ // client(reconciliator) writes term for which it needs info
+ case nsr_recon_xlator_sector_3 :
+ {
+ int32_t term;
+
+ memcpy((void *)&term, (void *)vector[0].iov_base, sizeof(term));
+ term = ntohl(term); //ntohl
+ recon_main_log (this->name, GF_LOG_INFO,
+ "nsr_recon_writev - setting term info for term info. term=%d\n",
+ term);
+ rfd->term = term;
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno,
+ NULL, NULL, NULL);
+ break;
+ }
+ // client(reconciliator) writes current term so that it gets last term info later
+ case nsr_recon_xlator_sector_4 :
+ {
+ int32_t term;
+
+ memcpy((void *)&term, (void *)vector[0].iov_base, sizeof(term));
+ term = ntohl(term); //ntohl
+ recon_main_log (this->name, GF_LOG_INFO,
+ "nsr_recon_writev - setting term info for last term info given current term=%d\n",
+ term);
+ rfd->term = term;
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno,
+ NULL, NULL, NULL);
+ break;
+ }
+ default:
+ {
+ recon_main_log (this->name, GF_LOG_ERROR,
+ "nsr_recon_writev called with wrong offset\n");
+ STACK_UNWIND_STRICT (writev, frame, -1, op_errno,
+ NULL, NULL, NULL);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+int
+nsr_recon_readv (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+{
+ nsr_recon_fd_t *rfd = NULL;
+ int32_t op_errno = 0;
+ // copied stuff from quick-read.c and posix.c
+ struct iobuf *iobuf = NULL;
+ struct iobref *iobref = NULL;
+ struct iovec iov = {0, };
+ int32_t ret = -1;
+ nsr_recon_private_t *priv = NULL;
+
+ iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+ if (!iobuf) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ iobref = iobref_new ();
+ if (!iobref) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ iobref_add (iobref, iobuf);
+
+ ret = this_fd_ctx_get (fd, this, &rfd);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+ priv = (nsr_recon_private_t *)this->private;
+
+ recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_readv called for offset %d \n",(unsigned int)offset );
+ switch (offset) {
+ // client(leader) reads from here to get info for this term on this node
+ // invole libchagelog to get the information
+ case nsr_recon_xlator_sector_3 :
+ {
+ nsr_recon_last_term_info_t lt;
+ GF_ASSERT(size == sizeof(lt));
+ nsr_recon_libchangelog_get_this_term_info(this,priv->changelog_base_path, rfd->term, &lt);
+ recon_main_log (this->name, GF_LOG_INFO,
+ "nsr_recon_readv - getting term info for term=%d, ops=%d, first=%d, last=%d\n",
+ rfd->term, lt.commited_ops, lt.first_index, lt.last_index);
+ ENDIAN_CONVERSION_LT(lt, _gf_false); //htonl
+ memcpy(iobuf->ptr, &lt, size);
+ goto out;
+ }
+ // client(reconciliator) reads individual record information
+ case nsr_recon_xlator_sector_2 :
+ {
+ uint32_t num = (rfd->last_index - rfd->first_index + 1);
+ recon_main_log (this->name, GF_LOG_INFO,
+ "nsr_recon_readv - expected size %lu got size %lu\n",
+ (num * sizeof(nsr_recon_record_details_t)), size);
+
+ GF_ASSERT(size == (num * sizeof(nsr_recon_record_details_t)));
+ bzero(iobuf->ptr, size);
+ recon_main_log (this->name, GF_LOG_INFO,
+ "nsr_recon_readv - getting records for term=%d from %d to %d\n",
+ rfd->term, rfd->first_index, rfd->last_index);
+ nsr_recon_libchangelog_get_records(this, priv->changelog_base_path,
+ rfd->term, rfd->first_index, rfd->last_index, iobuf->ptr);
+ goto out;
+ }
+ // read last term info
+ case nsr_recon_xlator_sector_4 :
+ {
+ nsr_recon_last_term_info_t lt;
+ GF_ASSERT(size == sizeof(lt));
+ nsr_recon_libchangelog_get_last_term_info(this, priv->changelog_base_path, rfd->term, &lt);
+ recon_main_log (this->name, GF_LOG_INFO,
+ "nsr_recon_readv - getting last term info given current term=%d. last term = %d ops=%d, first=%d, last=%d\n",
+ rfd->term, lt.last_term, lt.commited_ops, lt.first_index, lt.last_index);
+ ENDIAN_CONVERSION_LT(lt, _gf_false); //htonl
+ memcpy(iobuf->ptr, &lt, size);
+ goto out;
+ }
+ default:
+ {
+ recon_main_log (this->name, GF_LOG_ERROR,
+ "nsr_recon_readv called with wrong offset\n");
+ op_errno = -1;
+ break;
+ }
+ }
+
+out:
+ if (op_errno == 0) {
+ iov.iov_base = iobuf->ptr;
+ ret = iov.iov_len = size;
+ }
+
+ STACK_UNWIND_STRICT (readv, frame, ret, op_errno, &iov, 1, NULL, iobref , NULL);
+
+ if (iobref)
+ iobref_unref (iobref);
+ if (iobuf)
+ iobuf_unref (iobuf);
+ return 0;
+}
+
+int
+nsr_recon_lookup (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, dict_t *xdata)
+{
+ struct iatt buf = {0, };
+ // dirty hack to set root as regular but seems to work.
+ buf.ia_type = IA_IFREG;
+ recon_main_log (this->name, GF_LOG_INFO, "nsr_recon_lookup called \n");
+
+ STACK_UNWIND_STRICT (lookup, frame, 0, 0, this->itable->root, &buf, NULL, NULL);
+ return 0;
+}
+
+
+int32_t
+nsr_recon_flush (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, dict_t *xdata)
+{
+ STACK_UNWIND_STRICT (flush, frame, 0, 0, NULL);
+ return 0;
+}
+
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("recon", this, out);
+
+ ret = xlator_mem_acct_init (this, gf_mt_recon_end + 1);
+
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Memory accounting init" "failed");
+ return ret;
+ }
+out:
+ return ret;
+}
+
+
+int32_t
+init (xlator_t *this)
+{
+ nsr_recon_private_t *priv = NULL;
+ char *local, *members;
+ unsigned int i=0;
+
+ priv = GF_CALLOC (1, sizeof (*priv), gf_mt_recon_private_t);
+ if (!priv) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "priv allocation error\n");
+ return -1;
+ }
+ GF_OPTION_INIT ("replica-group-size", priv->replica_group_size, uint32, err);
+ GF_OPTION_INIT ("vol-name", priv->volname, str, err);
+ if (!priv->volname) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "missing volname option (required)");
+ return -1;
+ }
+ GF_OPTION_INIT ("changelog-dir", priv->changelog_base_path, str, err);
+ if (!priv->changelog_base_path) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "missing changelog directory option (required)");
+ return -1;
+ }
+ GF_OPTION_INIT ("base-dir", priv->base_dir, str, err);
+ if (!priv->base_dir) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "missing brick base directory option (required)");
+ return -1;
+ }
+ GF_OPTION_INIT ("replica-group-members", members, str, err);
+ if (!members) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "missing membership option (required)");
+ return -1;
+ }
+ GF_OPTION_INIT ("local-member", local, str, err);
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "missing local member option (required)");
+ return -1;
+ }
+
+ priv->replica_group_members = GF_CALLOC (priv->replica_group_size,
+ sizeof(char *),
+ gf_mt_recon_members_list_t);
+ priv->replica_group_members[0] = GF_CALLOC (1,
+ strlen(local),
+ gf_mt_recon_member_name_t);
+ if (!priv->replica_group_members || !(priv->replica_group_members[0])) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "str allocation error\n");
+ return -1;
+ }
+ strcpy(priv->replica_group_members[0], local);
+ for (i=1; i < priv->replica_group_size; i++) {
+ char *member;
+ if (i == 1)
+ member = strtok(members, ",");
+ else
+ member = strtok(NULL, ",");
+ priv->replica_group_members[i] = GF_CALLOC (1,
+ strlen(member) + 1, gf_mt_recon_member_name_t);
+ if (!priv->replica_group_members[i]) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "str allocation error\n");
+ return -1;
+ }
+ strcpy(priv->replica_group_members[i], member);
+ }
+
+
+ priv->this = this;
+ this->private = (void *)priv;
+
+ priv->fp = recon_create_log (priv->replica_group_members[0], "recon-main-log");
+ if (!priv->fp)
+ return -1;
+
+ recon_main_log (this->name, GF_LOG_INFO, "creating reconciliation driver \n");
+
+ if (pthread_create(&priv->thread_id, NULL, nsr_reconciliation_driver, priv)) {
+ recon_main_log (this->name, GF_LOG_ERROR,
+ "pthread creation error \n");
+ return -1;
+ }
+
+ INIT_LIST_HEAD(&(priv->list));
+
+
+ return 0;
+
+err:
+ return -1;
+}
+
+
+void
+fini (xlator_t *this)
+{
+ nsr_recon_private_t *priv = NULL;
+ void *ret = NULL;
+
+ priv = (nsr_recon_private_t *)this->private;
+
+ pthread_cancel(priv->thread_id);
+ pthread_join(priv->thread_id, &ret);
+}
+
+
+struct xlator_fops fops = {
+ .open = nsr_recon_open,
+ .readv = nsr_recon_readv,
+ .writev = nsr_recon_writev,
+ .lookup = nsr_recon_lookup,
+ .flush = nsr_recon_flush
+};
+
+struct xlator_cbks cbks = {
+};
+
+struct volume_options options[] = {
+ { .key = {"replica-group-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 2,
+ .max = INT_MAX,
+ .default_value = "2",
+ .description = "Number of bricks in replica group. can be derived but putting it here for testing."
+ },
+ {
+ .key = {"vol-name"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "volume name"
+ },
+ {
+ .key = {"local-member"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "member(brick) for which this translator is responsible."
+ },
+ {
+ .key = {"replica-group-members"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Comma seperated member names other than local."
+ },
+ {
+ .key = {"changelog-dir"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Base directory where per term changelogs are maintained."
+ },
+ {
+ .key = {"base-dir"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Base directory for this brick. This should go away once we fix gfid based lookups"
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/cluster/nsr-recon/src/recon_xlator.h b/xlators/cluster/nsr-recon/src/recon_xlator.h
new file mode 100644
index 000000000..d9692a632
--- /dev/null
+++ b/xlators/cluster/nsr-recon/src/recon_xlator.h
@@ -0,0 +1,92 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __RECON_XLATOR_H__
+#define __RECON_XLATOR_H__
+
+#include <semaphore.h>
+#include <pthread.h>
+
+enum gf_dht_mem_types_ {
+ gf_mt_recon_changelog_buf_t = gf_common_mt_end + 1,
+ gf_mt_recon_driver_ctx_t,
+ gf_mt_recon_fd_t,
+ gf_mt_recon_id_t,
+ gf_mt_recon_member_name_t,
+ gf_mt_recon_members_list_t,
+ gf_mt_recon_per_node_worker_t,
+ gf_mt_recon_private_t,
+ gf_mt_recon_reconciliator_info_t,
+ gf_mt_recon_record_t,
+ gf_mt_recon_record_details_t,
+ gf_mt_recon_role_work_t,
+ gf_mt_recon_work_t,
+ gf_mt_recon_work_data_t,
+ gf_mt_recon_worker_t,
+ gf_mt_recon_end,
+};
+
+enum nsr_recon_xlator_sector_t {
+ nsr_recon_xlator_sector_0 = 0, // to report back the status of given transaction ids
+ nsr_recon_xlator_sector_1 = 512, // to write here information about leadership changes from the brick
+ nsr_recon_xlator_sector_2 = (512 * 2), // to write here individual roles and wait for that role to be done
+ nsr_recon_xlator_sector_3 = (512 *3), // read from here to get term info for given term
+ nsr_recon_xlator_sector_4 = (512 * 4), // read from here to get last term info
+};
+
+
+typedef struct _nsr_recon_private_s {
+ xlator_t *this; //back pointer
+ unsigned int replica_group_size; // number of static members of replica group
+ char **replica_group_members; // replica group members (including itself in first slot)
+ pthread_t thread_id; // driver thread id
+ nsr_recon_driver_ctx_t *driver_thread_context; //driver thread context
+ unsigned int outstanding; // for communicating with driver thread
+ call_frame_t *frame; // old frame that is pending (just one as of now)
+ struct list_head list;
+ char *volname;
+ uint32_t txn_id;
+ char *changelog_base_path;
+ char *base_dir;
+#if defined(NSR_DEBUG)
+ FILE *fp;
+#endif
+} nsr_recon_private_t;
+
+#define atomic_cmpxchg __sync_val_compare_and_swap
+
+#if defined(NSR_DEBUG)
+
+extern void
+_recon_main_log (const char *func, int line, char *member, FILE *fp,
+ char *fmt, ...);
+
+#define recon_main_log(dom, levl, fmt...) do { \
+ FMT_WARN (fmt); \
+ if (levl <= nsr_debug_level) { \
+ nsr_recon_private_t *priv = this->private; \
+ _recon_main_log (__FUNCTION__, __LINE__, \
+ priv->replica_group_members[0], \
+ priv->fp, \
+ ##fmt); \
+ } \
+} while (0)
+
+#else
+#define recon_main_log(dom, levl, fmt...) gf_log(dom, levl, fmt)
+#endif
+
+void nsr_recon_libchangelog_get_this_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt);
+void nsr_recon_libchangelog_get_last_term_info(xlator_t *this, char *bp, int32_t term, nsr_recon_last_term_info_t *lt);
+void nsr_recon_return_back(nsr_recon_private_t *priv, uint32_t term, int32_t status, int32_t op_errno);
+gf_boolean_t nsr_recon_libchangelog_get_records(xlator_t *this, char *bp, int32_t term, uint32_t first, uint32_t last, void *buf);
+
+
+#endif /* #ifndef __RECON_XLATOR_H__ */
diff --git a/xlators/cluster/nsr-server/Makefile.am b/xlators/cluster/nsr-server/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/cluster/nsr-server/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/cluster/nsr-server/src/Makefile.am b/xlators/cluster/nsr-server/src/Makefile.am
new file mode 100644
index 000000000..0092aad4f
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/Makefile.am
@@ -0,0 +1,43 @@
+noinst_PYTHON = codegen.py gen-fops.py
+
+xlator_LTLIBRARIES = nsr.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
+
+nsr_la_LDFLAGS = -module -avoid-version -lcurl
+
+if ENABLE_ETCD_SIM
+nsr_la_SOURCES = nsr.c leader.c recon_notify.c etcd-sim.c
+else
+nsr_la_SOURCES = nsr.c leader.c recon_notify.c etcd-api.c \
+ yajl.c yajl_alloc.c yajl_buf.c yajl_encode.c yajl_gen.c \
+ yajl_lex.c yajl_parser.c yajl_tree.c yajl_version.c
+endif
+
+
+nsr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+ $(top_builddir)/api/src/libgfapi.la
+
+noinst_HEADERS = nsr-internal.h etcd-api.h all-templates.c \
+ yajl_alloc.h yajl_buf.h yajl_bytestack.h yajl_encode.h \
+ yajl_lex.h yajl_parser.h yajl/yajl_common.h yajl/yajl_gen.h \
+ yajl/yajl_parse.h yajl/yajl_tree.h yajl/yajl_version.h \
+ $(top_srcdir)/xlators/lib/src/libxlator.h \
+ $(top_srcdir)/glusterfsd/src/glusterfsd.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) \
+ -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src -DSBIN_DIR=\"$(sbindir)\"
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+XLATOR_HEADER = $(top_srcdir)/libglusterfs/src/xlator.h
+
+CLEANFILES = nsr-cg.c
+
+nsr-cg.c: gen-fops.py codegen.py $(XLATOR_HEADER) all-templates.c
+ $(PYTHON) ./gen-fops.py $(XLATOR_HEADER) all-templates.c > $@
+
+nsr.lo: nsr-cg.c
+
+uninstall-local:
+ rm -f $(DESTDIR)$(xlatordir)/nsr.so
diff --git a/xlators/cluster/nsr-server/src/all-templates.c b/xlators/cluster/nsr-server/src/all-templates.c
new file mode 100644
index 000000000..fa29de7b2
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/all-templates.c
@@ -0,0 +1,345 @@
+/*
+ * You can put anything here - it doesn't even have to be a comment - and it
+ * will be ignored until we reach the first template-name comment.
+ */
+
+
+// template-name read-fop
+$TYPE$
+nsr_$NAME$ (call_frame_t *frame, xlator_t *this,
+ $ARGS_LONG$)
+{
+ nsr_private_t *priv = this->private;
+ gf_boolean_t in_recon = _gf_false;
+ int32_t recon_term, recon_index;
+
+ // allow reads during reconciliation
+ // TBD: allow "dirty" reads on non-leaders
+ if (xdata &&
+ (dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) &&
+ (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) {
+ in_recon = _gf_true;
+ }
+
+ if ((!priv->leader) && (in_recon == _gf_false)) {
+ goto err;
+ }
+
+ STACK_WIND (frame, default_$NAME$_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$,
+ $ARGS_SHORT$);
+ return 0;
+
+err:
+ STACK_UNWIND_STRICT ($NAME$, frame, -1, EREMOTE,
+ $DEFAULTS$);
+ return 0;
+}
+
+// template-name read-dispatch
+/* No "dispatch" function needed for $NAME$ */
+
+// template-name read-fan-in
+/* No "fan-in" function needed for $NAME$ */
+
+// template-name read-continue
+/* No "continue" function needed for $NAME$ */
+
+// template-name read-complete
+/* No "complete" function needed for $NAME$ */
+
+// template-name write-fop
+$TYPE$
+nsr_$NAME$ (call_frame_t *frame, xlator_t *this,
+ $ARGS_LONG$)
+{
+ nsr_local_t *local = NULL;
+ nsr_private_t *priv = this->private;
+ int op_errno = ENOMEM;
+ int from_leader;
+ int from_recon;
+ uint32_t ti = 0;
+ double must_be_up;
+ double are_up;
+
+ /*
+ * Our first goal here is to avoid "split brain surprise" for users who
+ * specify exactly 50% with two- or three-way replication. That means
+ * either a more-than check against half the total replicas or an
+ * at-least check against half of our peers (one less). Of the two,
+ * only an at-least check supports the intuitive use of 100% to mean
+ * all replicas must be present, because "more than 100%" will never
+ * succeed regardless of which count we use. This leaves us with a
+ * slightly non-traditional definition of quorum ("at least X% of peers
+ * not including ourselves") but one that's useful enough to be worth
+ * it.
+ *
+ * Note that n_children and up_children *do* include the local
+ * subvolume, so we need to subtract one in each case.
+ */
+ must_be_up = ((double)(priv->n_children - 1)) * priv->quorum_pct;
+ are_up = ((double)(priv->up_children - 1)) * 100.0;
+ if (are_up < must_be_up) {
+ /* Emulate the AFR client-side-quorum behavior. */
+ op_errno = EROFS;
+ goto err;
+ }
+
+ local = mem_get0(this->local_pool);
+ if (!local) {
+ goto err;
+ }
+#if defined(NSR_CG_NEED_FD)
+ local->fd = fd_ref(fd);
+#else
+ local->fd = NULL;
+#endif
+ INIT_LIST_HEAD(&local->qlinks);
+ frame->local = local;
+
+ if (xdata) {
+ from_leader = !!dict_get(xdata,NSR_TERM_XATTR);
+ from_recon = !!dict_get(xdata,RECON_TERM_XATTR)
+ && !!dict_get(xdata,RECON_INDEX_XATTR);
+ }
+ else {
+ from_leader = from_recon = _gf_false;
+ }
+
+ // follower/recon path
+ // just send it to local node
+ if (from_leader || from_recon) {
+ atomic_inc(&priv->ops_in_flight);
+ STACK_WIND (frame, nsr_$NAME$_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$,
+ $ARGS_SHORT$);
+ return 0;
+ }
+
+
+ if (!priv->leader/* || priv->fence_io*/) {
+ op_errno = EREMOTE;
+ goto err;
+ }
+
+
+ if (!xdata) {
+ xdata = dict_new();
+ if (!xdata) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to allocate xdata");
+ goto err;
+ }
+ }
+
+ if (dict_set_int32(xdata,NSR_TERM_XATTR,priv->current_term) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set nsr-term");
+ goto err;
+ }
+
+ LOCK(&priv->index_lock);
+ ti = ++(priv->index);
+ UNLOCK(&priv->index_lock);
+ if (dict_set_int32(xdata,NSR_INDEX_XATTR,ti) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set index");
+ goto err;
+ }
+
+ local->stub = fop_$NAME$_stub (frame,nsr_$NAME$_continue,
+ $ARGS_SHORT$);
+ if (!local->stub) {
+ goto err;
+ }
+
+
+#if defined(NSR_CG_QUEUE)
+ nsr_inode_ctx_t *ictx = nsr_get_inode_ctx(this,fd->inode);
+ if (!ictx) {
+ op_errno = EIO;
+ goto err;
+ }
+ LOCK(&ictx->lock);
+ if (ictx->active) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "queuing request due to conflict");
+ /*
+ * TBD: enqueue only for real conflict
+ *
+ * Currently we just act like all writes are in
+ * conflict with one another. What we should really do
+ * is check the active/pending queues and defer only if
+ * there's a conflict there.
+ *
+ * It's important to check the pending queue because we
+ * might have an active request X which conflicts with
+ * a pending request Y, and this request Z might
+ * conflict with Y but not X. If we checked only the
+ * active queue then Z could jump ahead of Y, which
+ * would be incorrect.
+ */
+ local->qstub = fop_$NAME$_stub (frame,
+ nsr_$NAME$_dispatch,
+ $ARGS_SHORT$);
+ if (!local->qstub) {
+ UNLOCK(&ictx->lock);
+ goto err;
+ }
+ list_add_tail(&local->qlinks,&ictx->pqueue);
+ ++(ictx->pending);
+ UNLOCK(&ictx->lock);
+ return 0;
+ }
+ else {
+ list_add_tail(&local->qlinks,&ictx->aqueue);
+ ++(ictx->active);
+ }
+ UNLOCK(&ictx->lock);
+#endif
+
+ return nsr_$NAME$_dispatch (frame, this, $ARGS_SHORT$);
+
+err:
+ if (local) {
+ if (local->stub) {
+ call_stub_destroy(local->stub);
+ }
+ if (local->qstub) {
+ call_stub_destroy(local->qstub);
+ }
+ if (local->fd) {
+ fd_unref(local->fd);
+ }
+ mem_put(local);
+ }
+ STACK_UNWIND_STRICT ($NAME$, frame, -1, op_errno,
+ $DEFAULTS$);
+ return 0;
+}
+
+// template-name write-dispatch
+$TYPE$
+nsr_$NAME$_dispatch (call_frame_t *frame, xlator_t *this,
+ $ARGS_LONG$)
+{
+ nsr_local_t *local = frame->local;
+ nsr_private_t *priv = this->private;
+ xlator_list_t *trav;
+
+ atomic_inc(&priv->ops_in_flight);
+
+ /*
+ * TBD: unblock pending request(s) if we fail after this point but
+ * before we get to nsr_$NAME$_complete (where that code currently
+ * resides).
+ */
+
+ local->call_count = priv->n_children - 1;
+ for (trav = this->children->next; trav; trav = trav->next) {
+ STACK_WIND (frame, nsr_$NAME$_fan_in,
+ trav->xlator, trav->xlator->fops->$NAME$,
+ $ARGS_SHORT$);
+ }
+
+ // TBD: variable Issue count
+ return 0;
+}
+
+// template-name write-fan-in
+$TYPE$
+nsr_$NAME$_fan_in (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ $ARGS_LONG$)
+{
+ nsr_local_t *local = frame->local;
+ uint8_t call_count;
+
+ gf_log (this->name, GF_LOG_TRACE,
+ "op_ret = %d, op_errno = %d\n", op_ret, op_errno);
+
+ LOCK(&frame->lock);
+ call_count = --(local->call_count);
+ UNLOCK(&frame->lock);
+
+ // TBD: variable Completion count
+ if (call_count == 0) {
+ call_resume(local->stub);
+ }
+
+ return 0;
+}
+
+// template-name write-continue
+$TYPE$
+nsr_$NAME$_continue (call_frame_t *frame, xlator_t *this,
+ $ARGS_LONG$)
+{
+ STACK_WIND (frame, nsr_$NAME$_complete,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->$NAME$,
+ $ARGS_SHORT$);
+ return 0;
+}
+
+// template-name write-complete
+$TYPE$
+nsr_$NAME$_complete (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ $ARGS_LONG$)
+{
+ nsr_private_t *priv = this->private;
+#if defined(NSR_CG_NEED_FD)
+ nsr_local_t *local = frame->local;
+#endif
+
+#if defined(NSR_CG_QUEUE)
+ nsr_inode_ctx_t *ictx;
+ nsr_local_t *next;
+ if (local->qlinks.next != &local->qlinks) {
+ list_del(&local->qlinks);
+ ictx = nsr_get_inode_ctx(this,local->fd->inode);
+ if (ictx) {
+ LOCK(&ictx->lock);
+ if (ictx->pending) {
+ /*
+ * TBD: dequeue *all* non-conflicting reqs
+ *
+ * With the stub implementation there can only
+ * be one request active at a time (zero here)
+ * so it's not an issue. In a real
+ * implementation there might still be other
+ * active requests to check against, and
+ * multiple pending requests that could
+ * continue.
+ */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unblocking next request");
+ --(ictx->pending);
+ next = list_entry (ictx->pqueue.next,
+ nsr_local_t, qlinks);
+ list_del(&next->qlinks);
+ list_add_tail(&next->qlinks,&ictx->aqueue);
+ call_resume(next->qstub);
+ }
+ else {
+ --(ictx->active);
+ }
+ UNLOCK(&ictx->lock);
+ }
+ }
+#endif
+
+#if defined(NSR_CG_FSYNC)
+ nsr_mark_fd_dirty(this,local);
+#endif
+
+#if defined(NSR_CG_NEED_FD)
+ fd_unref(local->fd);
+#endif
+
+ STACK_UNWIND_STRICT ($NAME$, frame, op_ret, op_errno,
+ $ARGS_SHORT$);
+ atomic_dec(&priv->ops_in_flight);
+ return 0;
+
+}
diff --git a/xlators/cluster/nsr-server/src/codegen.py b/xlators/cluster/nsr-server/src/codegen.py
new file mode 100755
index 000000000..709f5662f
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/codegen.py
@@ -0,0 +1,174 @@
+#!/usr/bin/python
+
+# This module lets us auto-generate boilerplate versions of fops and cbks,
+# both for the client side and (eventually) on the server side as well. This
+# allows us to implement common logic (e.g. leader fan-out and sequencing)
+# once, without all the problems that come with copying and pasting the same
+# code into dozens of functions (or failing to).
+#
+# I've tried to make this code pretty generic, since it's already likely to
+# be used multiple ways within NSR. Really, we should use something like this
+# to generate defaults.[ch] as well, to avoid the same sorts of mismatches
+# that we've already seen and to which this approach makes NSR immune. That
+# would require using something other than defaults.h as the input, but that
+# format could be even simpler so that's a good thing too.
+
+
+import re
+import sys
+
+decl_re = re.compile("([a-z0-9_]+)$")
+tmpl_re = re.compile("// template-name (.*)")
+
+class CodeGenerator:
+
+ def __init__ (self):
+ self.decls = {}
+ self.skip = 0
+ self.templates = {}
+ self.make_defaults = self._make_defaults
+
+ # Redefine this to preprocess the name in a declaration, e.g.
+ # fop_lookup_t => nsrc_lookup
+ def munge_name (self, orig):
+ return orig
+
+ # By default, this will convert the argument string into a sequence of
+ # (type, name) tuples minus the first self.skip (default zero) arguments.
+ # You can redefine it to skip the conversion, do a different conversion,
+ # or rearrange the arguments however you like.
+ def munge_args (self, orig):
+ args = []
+ for decl in orig.strip("(); ").split(","):
+ m = decl_re.search(decl)
+ if m:
+ args.append((m.group(1),decl[:m.start(1)].strip()))
+ else:
+ raise RuntimeError("can't split %s into type+name"%decl)
+ return args[self.skip:]
+
+ def add_decl (self, fname, ftype, fargs):
+ self.decls[self.munge_name(fname)] = (ftype, self.munge_args(fargs))
+
+ def parse_decls (self, path, pattern):
+ regex = re.compile(pattern)
+ f = open(path,"r")
+ have_decl = False
+ while True:
+ line = f.readline()
+ if not line:
+ break
+ m = regex.search(line)
+ if m:
+ if have_decl:
+ self.add_decl(f_name,f_type,f_args)
+ f_name = m.group(2)
+ f_type = m.group(1)
+ f_args = line[m.end(0):-1].strip()
+ if f_args.rfind(")") >= 0:
+ self.add_decl(f_name,f_type,f_args)
+ else:
+ have_decl = True
+ elif have_decl:
+ if line.strip() == "":
+ self.add_decl(f_name,f_type,f_args)
+ have_decl = False
+ else:
+ f_args += " "
+ f_args += line[:-1].strip()
+ if have_decl:
+ self.add_decl(f_name,f_type,f_args)
+
+ # Legacy function (yeah, already) to load a single template. If you're
+ # using multiple templates, you're better off loading them all from one
+ # file using load_templates (note plural) instead.
+ def load_template (self, name, path):
+ self.templates[name] = open(path,"r").readlines()
+
+ # Load multiple templates. Each is introduced by a special comment of
+ # the form
+ #
+ # // template-name xyz
+ #
+ # One side effect is that the block before the first such comment will be
+ # ignored. This seems like it might be useful some day so I'll leave it
+ # in, but if people trip over it maybe it will change.
+ #
+ # It is recommended to define templates in expected execution order, to
+ # make the result more readable than the inverted order (e.g. callback
+ # then fop) common in the rest of our code.
+ def load_templates (self, path):
+ t_name = None
+ for line in open(path,"r").readlines():
+ if not line:
+ break
+ m = tmpl_re.match(line)
+ if m:
+ if t_name:
+ self.templates[t_name] = t_contents
+ t_name = m.group(1).strip()
+ t_contents = []
+ elif t_name:
+ t_contents.append(line)
+ if t_name:
+ self.templates[t_name] = t_contents
+
+ # Emit the template, with the following expansions:
+ #
+ # $NAME$ => function name (as passed in)
+ # $TYPE$ => function return value
+ # $ARGS_SHORT$ => argument list, including types
+ # $ARGS_LONG$ => argument list, *not* including types
+ # $DEFAULTS$ => default callback args (see below)
+ #
+ # The $DEFAULTS$ substitution is for the case where a fop (which has one
+ # set of arguments) needs to signal an error via STACK_UNWIND (which
+ # requires a different set of arguments). In this case we look up the
+ # argument list for the opposite direction, using self.make_defaults which
+ # the user must explicitly set to the method for the opposite direction.
+ # If an argument is a pointer, we replace it with NULL; otherwise we
+ # replace it with zero. It's a hack, but it's the only thing we do that
+ # doesn't require specific knowledge of our environment and the specific
+ # call we're handling. If this doesn't suffice, we'll have to add
+ # something like $ARG0$ which can be passed in for specific cases.
+ def emit (self, f_name, tmpl):
+ args = self.decls[f_name][1]
+ zipper = lambda x: x[0]
+ a_short = ", ".join(map(zipper,args))
+ zipper = lambda x: x[1] + " " + x[0]
+ a_long = ", ".join(map(zipper,args))
+ for line in self.templates[tmpl]:
+ line = line.replace("$NAME$",f_name)
+ line = line.replace("$TYPE$",self.decls[f_name][0])
+ line = line.replace("$ARGS_SHORT$",a_short)
+ line = line.replace("$ARGS_LONG$",a_long)
+ line = line.replace("$DEFAULTS$",self.make_defaults(f_name))
+ print(line.rstrip())
+
+ def _make_defaults (self, f_name):
+ result = []
+ for arg in self.decls[f_name][1]:
+ if arg[1][-1] == "*":
+ result.append("NULL")
+ else:
+ result.append("0")
+ return ", ".join(result)
+
+if __name__ == "__main__":
+ type_re = "([a-z_0-9]+)"
+ name_re = "\(\*fop_([a-z0-9]+)_t\)"
+ full_re = type_re + " *" + name_re
+ cg = CodeGenerator()
+ cg.skip = 2
+ cg.parse_decls(sys.argv[1],full_re)
+ """
+ for k, v in cg.decls.iteritems():
+ print("=== %s" % k)
+ print(" return type %s" % v[0])
+ for arg in v[1]:
+ print(" arg %s (type %s)" % arg)
+ """
+ cg.load_template("fop",sys.argv[2])
+ cg.emit("lookup","fop")
+ cg.emit("rename","fop")
+ cg.emit("setxattr","fop")
diff --git a/xlators/cluster/nsr-server/src/etcd-api.c b/xlators/cluster/nsr-server/src/etcd-api.c
new file mode 100644
index 000000000..a07019244
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/etcd-api.c
@@ -0,0 +1,831 @@
+/*
+ * Copyright (c) 2013, Red Hat
+ * All rights reserved.
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* For asprintf */
+#if !defined(_GNU_SOURCE)
+#define _GNU_SOURCE
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <curl/curl.h>
+#include <yajl/yajl_tree.h>
+#include "etcd-api.h"
+
+
+#define DEFAULT_ETCD_PORT 4001
+#define SL_DELIM "\n\r\t ,;"
+
+typedef struct {
+ etcd_server *servers;
+} _etcd_session;
+
+typedef struct {
+ char *key;
+ char *value;
+ int *index_in; /* pointer so NULL can be special */
+ int index_out; /* NULL would be meaningless */
+} etcd_watch_t;
+
+typedef size_t curl_callback_t (void *, size_t, size_t, void *);
+
+int g_inited = 0;
+const char *value_path[] = { "node", "value", NULL };
+const char *nodes_path[] = { "node", "nodes", NULL };
+const char *entry_path[] = { "key", NULL };
+
+/*
+ * We only call this in case where it should be safe, but gcc doesn't know
+ * that so we use this to shut it up.
+ */
+char *
+MY_YAJL_GET_STRING (yajl_val x)
+{
+ char *y = YAJL_GET_STRING(x);
+
+ return y ? y : "bogus";
+}
+
+#if defined(DEBUG)
+void
+print_curl_error (char *intro, CURLcode res)
+{
+ printf("%s: %s\n",intro,curl_easy_strerror(res));
+}
+#else
+#define print_curl_error(intro,res)
+#endif
+
+
+etcd_session
+etcd_open (etcd_server *server_list)
+{
+ _etcd_session *session;
+
+ if (!g_inited) {
+ curl_global_init(CURL_GLOBAL_ALL);
+ g_inited = 1;
+ }
+
+ session = malloc(sizeof(*session));
+ if (!session) {
+ return NULL;
+ }
+
+ /*
+ * Some day we'll set up more persistent connections, and keep track
+ * (via redirects) of which server is leader so that we can always
+ * try it first. For now we just push that to the individual request
+ * functions, which do the most brain-dead thing that can work.
+ */
+
+ session->servers = server_list;
+ return session;
+}
+
+
+void
+etcd_close (etcd_session session)
+{
+ free(session);
+}
+
+/*
+ * Normal yajl_tree_get is returning NULL for these paths even when I can
+ * verify (in gdb) that they exist. I suppose I could debug this for them, but
+ * this is way easier.
+ *
+ * TBD: see if common distros are packaging a JSON library that isn't total
+ * crap.
+ */
+yajl_val
+my_yajl_tree_get (yajl_val root, char const **path, yajl_type type)
+{
+ yajl_val obj = root;
+ int i;
+
+ for (;;) {
+ if (!*path) {
+ if (obj && (obj->type != type)) {
+ return NULL;
+ }
+ return obj;
+ }
+ if (obj->type != yajl_t_object) {
+ return NULL;
+ }
+ for (i = 0; /* nothing */; ++i) {
+ if (i >= obj->u.object.len) {
+ return NULL;
+ }
+ if (!strcmp(obj->u.object.keys[i],*path)) {
+ obj = obj->u.object.values[i];
+ ++path;
+ break;
+ }
+ }
+ }
+}
+
+
+/*
+ * Looking directly at node->u.array seems terribly un-modular, but the YAJL
+ * tree interface doesn't seem to have any exposed API for iterating over the
+ * elements of an array. I tried using yajl_tree_get with an index in the
+ * path, either as a type-casted integer or as a string, but that didn't work.
+ */
+char *
+parse_array_response (yajl_val parent)
+{
+ size_t i;
+ yajl_val item;
+ yajl_val value;
+ char *retval = NULL;
+ char *saved;
+ yajl_val node;
+
+ node = my_yajl_tree_get(parent,nodes_path,yajl_t_array);
+ if (!node) {
+ return NULL;
+ }
+
+ for (i = 0; i < node->u.array.len; ++i) {
+ item = node->u.array.values[i];
+ if (!item) {
+ break;
+ }
+ value = my_yajl_tree_get(item,entry_path,yajl_t_string);
+ if (!value) {
+ break;
+ }
+ if (retval) {
+ saved = retval;
+ retval = NULL;
+ (void)asprintf (&retval, "%s\n%s",
+ saved, MY_YAJL_GET_STRING(value));
+ free(saved);
+ }
+ else {
+ retval = strdup(MY_YAJL_GET_STRING(value));
+ }
+ if (!retval) {
+ break;
+ }
+ }
+
+ return retval;
+}
+
+size_t
+parse_get_response (void *ptr, size_t size, size_t nmemb, void *stream)
+{
+ yajl_val node;
+ yajl_val value;
+
+ node = yajl_tree_parse(ptr,NULL,0);
+ if (node) {
+ value = my_yajl_tree_get(node,value_path,yajl_t_string);
+ if (value) {
+ /*
+ * YAJL probably copied it once, now we're going to
+ * copy it again. If anybody really cares for such
+ * small and infrequently used values, we'd have to do
+ * do something much more complicated (like using the
+ * stream interface) to avoid the copy. Right now it's
+ * just not worth it.
+ */
+ *((char **)stream) = strdup(MY_YAJL_GET_STRING(value));
+ }
+ else {
+ /* Might as well try this. */
+ *((char **)stream) = parse_array_response(node);
+ }
+ yajl_tree_free(node);
+ }
+
+ return size*nmemb;
+}
+
+
+etcd_result
+etcd_get_one (_etcd_session *session, char *key, etcd_server *srv, char *prefix,
+ char *post, curl_callback_t cb, char **stream)
+{
+ char *url;
+ CURL *curl;
+ CURLcode curl_res;
+ etcd_result res = ETCD_WTF;
+ void *err_label = &&done;
+
+ if (asprintf(&url,"http://%s:%u/v2/%s%s",
+ srv->host,srv->port,prefix,key) < 0) {
+ goto *err_label;
+ }
+ printf("url = %s\n",url);
+ err_label = &&free_url;
+
+ curl = curl_easy_init();
+ if (!curl) {
+ goto *err_label;
+ }
+ err_label = &&cleanup_curl;
+
+ /* TBD: add error checking for these */
+ curl_easy_setopt(curl,CURLOPT_URL,url);
+ curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L);
+ curl_easy_setopt(curl,CURLOPT_WRITEFUNCTION,cb);
+ curl_easy_setopt(curl,CURLOPT_WRITEDATA,stream);
+ if (post) {
+ curl_easy_setopt(curl,CURLOPT_POST,1L);
+ curl_easy_setopt(curl,CURLOPT_POSTFIELDS,post);
+ }
+#if defined(DEBUG)
+ curl_easy_setopt(curl,CURLOPT_VERBOSE,1L);
+#endif
+
+ curl_res = curl_easy_perform(curl);
+ if (curl_res != CURLE_OK) {
+ print_curl_error("perform",curl_res);
+ goto *err_label;
+ }
+
+ res = ETCD_OK;
+
+cleanup_curl:
+ curl_easy_cleanup(curl);
+free_url:
+ free(url);
+done:
+ return res;
+}
+
+
+char *
+etcd_get (etcd_session session_as_void, char *key)
+{
+ _etcd_session *session = session_as_void;
+ etcd_server *srv;
+ etcd_result res;
+ char *value = NULL;
+
+ for (srv = session->servers; srv->host; ++srv) {
+ res = etcd_get_one(session,key,srv,"keys/",NULL,
+ parse_get_response,&value);
+ if ((res == ETCD_OK) && value) {
+ return value;
+ }
+ }
+
+ return NULL;
+}
+
+
+size_t
+parse_watch_response (void *ptr, size_t size, size_t nmemb, void *stream)
+{
+ yajl_val node;
+ yajl_val value;
+ etcd_watch_t *watch = stream;
+ static const char *i_path[] = { "node", "modifiedIndex", NULL };
+ static const char *k_path[] = { "node", "key", NULL };
+ static const char *v_path[] = { "node", "value", NULL };
+
+ node = yajl_tree_parse(ptr,NULL,0);
+ if (node) {
+ value = my_yajl_tree_get(node,i_path,yajl_t_number);
+ if (value) {
+ watch->index_out = strtoul(YAJL_GET_NUMBER(value),
+ NULL,10);
+ }
+ value = my_yajl_tree_get(node,k_path,yajl_t_string);
+ if (value) {
+ watch->key = strdup(MY_YAJL_GET_STRING(value));
+ }
+ value = my_yajl_tree_get(node,v_path,yajl_t_string);
+ if (value) {
+ watch->value = strdup(MY_YAJL_GET_STRING(value));
+ }
+ }
+
+ return size*nmemb;
+}
+
+
+etcd_result
+etcd_watch (etcd_session session_as_void, char *pfx,
+ char **keyp, char **valuep, int *index_in, int *index_out)
+{
+ _etcd_session *session = session_as_void;
+ etcd_server *srv;
+ etcd_result res;
+ etcd_watch_t watch;
+ char *path;
+
+ if (index_in) {
+ if (asprintf(&path,"%s?wait=true&recursive=true&waitIndex=%d",
+ pfx,*index_in) < 0) {
+ return ETCD_WTF;
+ }
+ }
+ else {
+ if (asprintf(&path,"%s?wait=true&recursive=true",pfx) < 0) {
+ return ETCD_WTF;
+ }
+ }
+
+ memset(&watch,0,sizeof(watch));
+ watch.index_in = index_in;
+
+ for (srv = session->servers; srv->host; ++srv) {
+ res = etcd_get_one(session,path,srv,"keys/",NULL,
+ parse_watch_response,(char **)&watch);
+ if (res == ETCD_OK) {
+ if (keyp) {
+ *keyp = watch.key;
+ }
+ if (valuep) {
+ *valuep = watch.value;
+ }
+ if (index_out) {
+ *index_out = watch.index_out;
+ }
+ break;
+ }
+ }
+
+ free(path);
+ return res;
+}
+
+
+size_t
+parse_set_response (void *ptr, size_t size, size_t nmemb, void *stream)
+{
+ yajl_val node;
+ yajl_val value;
+ etcd_result res = ETCD_PROTOCOL_ERROR;
+ /*
+ * Success responses contain prevValue and index. Failure responses
+ * contain errorCode and cause. Among all these, index seems to be the
+ * one we're most likely to need later, so look for that.
+ */
+ static const char *path[] = { "node", "modifiedIndex", NULL };
+
+ node = yajl_tree_parse(ptr,NULL,0);
+ if (node) {
+ value = my_yajl_tree_get(node,path,yajl_t_number);
+ if (value) {
+ res = ETCD_OK;
+ }
+ }
+
+ *((etcd_result *)stream) = res;
+ return size*nmemb;
+}
+
+
+size_t
+parse_lock_response (void *ptr, size_t size, size_t nmemb, void *stream)
+{
+ *((char **)stream) = strdup(ptr);
+ return size*nmemb;
+}
+
+
+/*
+ * There are two use cases, based on is_lock.
+ *
+ * If is_lock is null, we use the "keys" namespace. A null value means an
+ * HTTP DELETE; precond and ttl are both ignored. Otherwise we're setting a
+ * value, with *optional* precond and ttl.
+ *
+ * If is_lock is set, we use the "locks" namespace. A null value means an
+ * HTTP DELETE as before, and we still ignore ttl as before, but now precond
+ * must be set to represent the lock index. Otherwise ttl must be present,
+ * and we decide what to do based on precond. If it's null, this is an
+ * initial lock so we use an HTTP POST. Otherwise it's a renewal so we use
+ * an HTTP PUT instead.
+ */
+etcd_result
+etcd_set_one (_etcd_session *session, char *key, char *value,
+ char *precond, unsigned int ttl, etcd_server *srv,
+ char **is_lock)
+{
+ char *url;
+ char *contents = NULL;
+ CURL *curl;
+ etcd_result res = ETCD_WTF;
+ CURLcode curl_res;
+ void *err_label = &&done;
+ char *namespace;
+ char *http_cmd;
+ char *orig_index;
+
+ if (is_lock) {
+ namespace = "mod/v2/lock";
+ if (value) {
+ if (!ttl) {
+ /* Lock/renew must specify ttl. */
+ return ETCD_WTF;
+ }
+ http_cmd = precond ? "PUT" : "POST";
+ }
+ else {
+ if (!precond) {
+ /* Unlock must specify index. */
+ return ETCD_WTF;
+ }
+ http_cmd = "DELETE";
+ }
+ orig_index = *is_lock;
+ }
+ else {
+ namespace = "v2/keys";
+ http_cmd = value ? "PUT" : "DELETE";
+ }
+
+ if (asprintf(&url,"http://%s:%u/%s/%s",
+ srv->host,srv->port,namespace,key) < 0) {
+ goto *err_label;
+ }
+ err_label = &&free_url;
+
+ if (is_lock) {
+ if (precond) {
+ if (asprintf(&contents,"index=%s",precond) < 0) {
+ goto *err_label;
+ }
+ err_label = &&free_contents;
+ }
+ if (ttl) {
+ if (contents) {
+ char *c2;
+ if (asprintf(&c2,"ttl=%u;%s",ttl,contents) < 0) {
+ goto *err_label;
+ }
+ free(contents);
+ contents = c2;
+ }
+ else {
+ if (asprintf(&contents,"ttl=%u",ttl) < 0) {
+ goto *err_label;
+ }
+ }
+ err_label = &&free_contents;
+ }
+ }
+ else {
+ if (value) {
+ if (asprintf(&contents,"value=%s",value) < 0) {
+ goto *err_label;
+ }
+ err_label = &&free_contents;
+ }
+ if (precond) {
+ char *c2;
+ if (asprintf(&c2,"%s;prevValue=%s",contents,
+ precond) < 0) {
+ goto *err_label;
+ }
+ free(contents);
+ contents = c2;
+ err_label = &&free_contents;
+ }
+ if (ttl) {
+ char *c2;
+ if (asprintf(&c2,"%s;ttl=%u",contents,ttl) < 0) {
+ goto *err_label;
+ }
+ free(contents);
+ contents = c2;
+ err_label = &&free_contents;
+ }
+ }
+
+ curl = curl_easy_init();
+ if (!curl) {
+ goto *err_label;
+ }
+ err_label = &&cleanup_curl;
+
+ /* TBD: add error checking for these */
+ curl_easy_setopt(curl,CURLOPT_CUSTOMREQUEST,http_cmd);
+ curl_easy_setopt(curl,CURLOPT_URL,url);
+ curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L);
+ curl_easy_setopt(curl,CURLOPT_POSTREDIR,CURL_REDIR_POST_ALL);
+
+ if (is_lock && value && !precond) {
+ /* Only do this for an initial lock, not a renewal. */
+ curl_easy_setopt (curl, CURLOPT_WRITEFUNCTION,
+ parse_lock_response);
+ curl_easy_setopt(curl,CURLOPT_WRITEDATA,is_lock);
+ }
+ else {
+ curl_easy_setopt (curl, CURLOPT_WRITEFUNCTION,
+ parse_set_response);
+ curl_easy_setopt(curl,CURLOPT_WRITEDATA,&res);
+ }
+
+ /*
+ * CURLOPT_HTTPPOST would be easier, but it looks like etcd will barf on
+ * that. Sigh.
+ */
+ if (contents) {
+ curl_easy_setopt(curl,CURLOPT_POST,1L);
+ curl_easy_setopt(curl,CURLOPT_POSTFIELDS,contents);
+ }
+#if defined(DEBUG)
+ curl_easy_setopt(curl,CURLOPT_VERBOSE,1L);
+#endif
+
+ curl_res = curl_easy_perform(curl);
+ if (curl_res != CURLE_OK) {
+ print_curl_error("perform",curl_res);
+ goto *err_label;
+ }
+
+ if (is_lock && value) {
+ if (!precond) {
+ /*
+ * If this is an initial lock, parse_lock_response would
+ * have been unable to set "res" for us. Instead, we
+ * set it here if the index string got updated.
+ */
+ if (*is_lock != orig_index) {
+ res = ETCD_OK;
+ }
+ }
+ else {
+ /*
+ * If this is a lock renewal, then a successful call
+ * will pass through neither parse_lock_response nor
+ * parse_get_response. The curl response code alone
+ * is sufficient.
+ */
+ res = ETCD_OK;
+ }
+ }
+
+ /*
+ * If the request succeeded, or at least got to the server and failed
+ * there, parse_set_response should have set res appropriately.
+ */
+
+cleanup_curl:
+ curl_easy_cleanup(curl);
+free_contents:
+ free(contents); /* might already be NULL for delete, but that's OK */
+free_url:
+ free(url);
+done:
+ return res;
+}
+
+
+etcd_result
+etcd_set (etcd_session session_as_void, char *key, char *value,
+ char *precond, unsigned int ttl)
+{
+ _etcd_session *session = session_as_void;
+ etcd_server *srv;
+ etcd_result res;
+
+ for (srv = session->servers; srv->host; ++srv) {
+ res = etcd_set_one(session,key,value,precond,ttl,srv,NULL);
+ /*
+ * Protocol errors are likely to be things like precondition
+ * failures, which won't be helped by retrying on another
+ * server.
+ */
+ if ((res == ETCD_OK) || (res == ETCD_PROTOCOL_ERROR)) {
+ return res;
+ }
+ }
+
+ return ETCD_WTF;
+}
+
+
+/*
+ * This uses the same path and status checks as SET, but with a different HTTP
+ * command instead of data. Precondition and TTL are obviously not used in
+ * this case, though a conditional delete would be a cool feature for etcd. I
+ * think you can get a timed delete by doing a conditional set to the current
+ * value with a TTL, but I haven't actually tried it.
+ */
+etcd_result
+etcd_delete (etcd_session session_as_void, char *key)
+{
+ _etcd_session *session = session_as_void;
+ etcd_server *srv;
+ etcd_result res;
+
+ for (srv = session->servers; srv->host; ++srv) {
+ res = etcd_set_one(session,key,NULL,NULL,0,srv,NULL);
+ if (res == ETCD_OK) {
+ break;
+ }
+ }
+
+ return res;
+}
+
+
+etcd_result
+etcd_lock (etcd_session session_as_void, char *key, unsigned int ttl,
+ char *index_in, char **index_out)
+{
+ _etcd_session *session = session_as_void;
+ etcd_server *srv;
+ etcd_result res;
+ char *tmp = NULL;
+
+ for (srv = session->servers; srv->host; ++srv) {
+ res = etcd_set_one(session,key,"hack",index_in,ttl,srv,&tmp);
+ if (res == ETCD_OK) {
+ if (index_out) {
+ *index_out = tmp;
+ }
+ break;
+ }
+ }
+
+ return res;
+}
+
+
+etcd_result
+etcd_unlock (etcd_session session_as_void, char *key, char *index)
+{
+ _etcd_session *session = session_as_void;
+ etcd_server *srv;
+ etcd_result res;
+ char *tmp = NULL;
+
+ for (srv = session->servers; srv->host; ++srv) {
+ res = etcd_set_one(session,key,NULL,index,0,srv,&tmp);
+ if (res == ETCD_OK) {
+ break;
+ }
+ }
+
+ return res;
+}
+size_t
+store_leader (void *ptr, size_t size, size_t nmemb, void *stream)
+{
+ *((char **)stream) = strdup(ptr);
+ return size * nmemb;
+}
+
+
+char *
+etcd_leader (etcd_session session_as_void)
+{
+ _etcd_session *session = session_as_void;
+ etcd_server *srv;
+ etcd_result res;
+ char *value = NULL;
+
+ for (srv = session->servers; srv->host; ++srv) {
+ res = etcd_get_one(session,"leader",srv,"",NULL,
+ store_leader,&value);
+ if ((res == ETCD_OK) && value) {
+ return value;
+ }
+ }
+
+ return NULL;
+}
+
+
+void
+free_sl (etcd_server *server_list)
+{
+ size_t num_servers;
+
+ for (num_servers = 0; server_list[num_servers].host; ++num_servers) {
+ free(server_list[num_servers].host);
+ }
+ free(server_list);
+}
+
+
+int
+_count_matching (char *text, char *cset, int result)
+{
+ char *t;
+ int res = 0;
+
+ for (t = text; *t; ++t) {
+ if ((strchr(cset,*t) != NULL) != result) {
+ break;
+ }
+ ++res;
+ }
+
+ return res;
+}
+
+#define count_matching(t,cs) _count_matching(t,cs,1)
+#define count_nonmatching(t,cs) _count_matching(t,cs,0)
+
+
+etcd_session
+etcd_open_str (char *server_names)
+{
+ char *snp;
+ int run_len;
+ int host_len;
+ size_t num_servers;
+ etcd_server *server_list;
+ etcd_session *session;
+
+ /*
+ * Yeah, we iterate over the string twice so we can allocate an
+ * appropriately sized array instead of turning it into a linked list.
+ * Unfortunately this means we can't use strtok* which is destructive
+ * with no platform-independent way to reverse the destructive effects.
+ */
+
+ num_servers = 0;
+ snp = server_names;
+ while (*snp) {
+ run_len = count_nonmatching(snp,SL_DELIM);
+ if (!run_len) {
+ snp += count_matching(snp,SL_DELIM);
+ continue;
+ }
+ ++num_servers;
+ snp += run_len;
+ }
+
+ if (!num_servers) {
+ return NULL;
+ }
+
+ server_list = calloc(num_servers+1,sizeof(*server_list));
+ if (!server_list) {
+ return NULL;
+ }
+ num_servers = 0;
+
+ snp = server_names;
+ while (*snp) {
+ run_len = count_nonmatching(snp,SL_DELIM);
+ if (!run_len) {
+ snp += count_matching(snp,SL_DELIM);
+ continue;
+ }
+ host_len = count_nonmatching(snp,":");
+ if ((run_len - host_len) > 1) {
+ server_list[num_servers].host = strndup(snp,host_len);
+ server_list[num_servers].port = (unsigned short)
+ strtoul(snp+host_len+1,NULL,10);
+ }
+ else {
+ server_list[num_servers].host = strndup(snp,run_len);
+ server_list[num_servers].port = DEFAULT_ETCD_PORT;
+ }
+ ++num_servers;
+ snp += run_len;
+ }
+
+ session = etcd_open(server_list);
+ if (!session) {
+ free_sl(server_list);
+ }
+ return session;
+}
+
+
+void
+etcd_close_str (etcd_session session)
+{
+ free_sl(((_etcd_session *)session)->servers);
+ etcd_close(session);
+}
diff --git a/xlators/cluster/nsr-server/src/etcd-api.h b/xlators/cluster/nsr-server/src/etcd-api.h
new file mode 100644
index 000000000..66275d40d
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/etcd-api.h
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2013, Red Hat
+ * All rights reserved.
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Description of an etcd server. For now it just includes the name and
+ * port, but some day it might include other stuff like SSL certificate
+ * information.
+ */
+
+typedef enum {
+ ETCD_OK = 0,
+ ETCD_PROTOCOL_ERROR,
+ /* TBD: add other error categories here */
+ ETCD_WTF /* anything we can't easily categorize */
+} etcd_result;
+
+typedef struct {
+ char *host;
+ unsigned short port;
+} etcd_server;
+
+typedef void *etcd_session;
+
+/*
+ * etcd_open
+ *
+ * Establish a session to an etcd cluster, with automatic reconnection and
+ * so on.
+ *
+ * server_list
+ * Array of etcd_server structures, with the last having host=NULL. The
+ * caller is responsible for ensuring that this remains valid as long as
+ * the session exists.
+ */
+etcd_session etcd_open (etcd_server *server_list);
+
+
+/*
+ * etcd_open_str
+ *
+ * Same as etcd_open, except that the servers are specified as a list of
+ * host:port strings, separated by comma/semicolon or whitespace.
+ */
+etcd_session etcd_open_str (char *server_names);
+
+
+/*
+ * etcd_close
+ *
+ * Terminate a session, closing connections and freeing memory (or any other
+ * resources) associated with it.
+ */
+void etcd_close (etcd_session session);
+
+
+/*
+ * etcd_close
+ *
+ * Same as etcd_close, but also free the server list as etcd_open_str would
+ * have allocated it.
+ */
+void etcd_close_str (etcd_session session);
+
+
+/*
+ * etcd_get
+ *
+ * Fetch a key from one of the servers in a session. The return value is a
+ * newly allocated string, which must be freed by the caller.
+ *
+ * key
+ * The etcd key (path) to fetch.
+ */
+char * etcd_get (etcd_session session, char *key);
+
+
+/*
+ * etcd_watch
+ * Watch the set of keys matching a prefix.
+ *
+ * pfx
+ * The etcd key prefix (like a path) to watch.
+ *
+ * keyp
+ * Space for a pointer to the key that was added/modified/deleted.
+ *
+ * valuep
+ * Space for a pointer to the value if a key was added/modified. A delete
+ * is signified by this being set to NULL.
+ *
+ * index_in
+ * Pointer to an index to be used for *issuing* the watch request, or
+ * NULL for a watch without an index.
+ *
+ * index_out
+ * Pointer to space for an index *returned* by etcd, or NULL to mean don't
+ * bother.
+ *
+ * In normal usage, index_in will be NULL and index_out will be set to receive
+ * the index for the first watch. Subsequently, index_in will be set to
+ * provide the previous index (plus one) and index_out will be set to receive
+ * the next. It's entirely legitimate to point both at the same variable.
+ */
+
+etcd_result etcd_watch (etcd_session session, char *pfx,
+ char **keyp, char **valuep,
+ int *index_in, int *index_out);
+
+
+/*
+ * etcd_set
+ *
+ * Write a key, with optional TTL and/or previous value (as a precondition).
+ *
+ * key
+ * The etcd key (path) to set.
+ *
+ * value
+ * New value as a null-terminated string. Unlike etcd_get, we can derive
+ * the length ourselves instead of needing it to be passed in separately.
+ *
+ * precond
+ * Required previous value as a null-terminated string, or NULL to mean
+ * an unconditional set.
+ *
+ * ttl
+ * Time in seconds after which the value will automatically expire and be
+ * deleted, or zero to mean no auto-expiration.
+ */
+
+etcd_result etcd_set (etcd_session session, char *key, char *value,
+ char *precond, unsigned int ttl);
+
+
+/*
+ * etcd_delete
+ *
+ * Delete a key from one of the servers in a session.
+ *
+ * key
+ * The etcd key (path) to delete.
+ */
+
+etcd_result etcd_delete (etcd_session session, char *key);
+
+
+/*
+ * etcd_leader
+ *
+ * Get the identify of the current leader.
+ */
+
+char * etcd_leader (etcd_session session);
+
+/*
+ * etcd_lock
+ *
+ * Take or renew a lock - really a lease but the etcd folks call it a lock so
+ * we'll follow suit.
+ *
+ * key
+ * The path (in the "locks" namespace) for the lock.
+ *
+ * ttl
+ * Time in seconds for the lock.
+ *
+ * index_in (optional, indicates renewal)
+ * Lock index from previous lock call.
+ *
+ * index_out (only used for initial lock)
+ * Place for the new lock index. You must free this.
+ */
+
+etcd_result etcd_lock (etcd_session session_as_void, char *key,
+ unsigned int ttl, char *index_in, char **index_out);
+
+/*
+ * etcd_unlock
+ *
+ * Release a lock (see etcd_lock regarding terminology).
+ *
+ * key
+ * The path (in the "locks" namespace) for the lock.
+ *
+ * index
+ * Lock index from previous lock call.
+ */
+
+etcd_result etcd_unlock (etcd_session session_as_void, char *key,
+ char *index);
+
diff --git a/xlators/cluster/nsr-server/src/etcd-sim.c b/xlators/cluster/nsr-server/src/etcd-sim.c
new file mode 100644
index 000000000..d0bea12c7
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/etcd-sim.c
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2014, Red Hat
+ * All rights reserved.
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+
+#include "mem-pool.h"
+
+/*
+ * Mock implementation of etcd
+ * The etcd file is simulated in /tmp/<server-names>
+ * Writes from Multiple writers are protected using file lock.
+*/
+
+#include "etcd-api.h"
+#define MAX_KEY_LEN 64
+#define MAX_VALUE_LEN 64
+#define MAX_EXPIRE_LEN 16
+
+etcd_session
+etcd_open (etcd_server *server_list)
+{
+ return NULL;
+}
+
+typedef struct _etcd_sim_s {
+ char *path;
+} etcd_sim_t;
+
+void
+etcd_close (etcd_session this)
+{
+ etcd_sim_t *sim = (etcd_sim_t *)this;
+ free(sim->path);
+ free(this);
+}
+
+
+char *
+etcd_get_1 (FILE *stream, char *key)
+{
+ char *str = NULL;
+ size_t len;
+ unsigned long expires;
+ char *ret;
+
+ // Read the file
+ while(1) {
+ if(str) {
+ free(str);
+ str = NULL;
+ }
+ if (getline((char **)&str, &len,stream) == -1) {
+ break;
+ }
+ if (!strncmp(str, key, strlen(key))) {
+ char k[256], s[256];
+ sscanf(str,"%s %s %lu",k, s, &expires);
+ // check if key is expired.
+ if (time(NULL) > expires) {
+ /* Keep looking for an unexpired entry. */
+ continue;
+ }
+ ret = calloc(1, strlen(s) + 1);
+ strcpy(ret,s);
+ free(str);
+ return(ret);
+ }
+ }
+ return NULL;
+}
+
+
+char *
+etcd_get (etcd_session this, char *key)
+{
+ etcd_sim_t *sim = (etcd_sim_t *)this;
+ int fd;
+ FILE *stream;
+ char *retval;
+
+ fd = open(sim->path,O_RDONLY);
+ if (!fd) {
+ return NULL;
+ }
+
+ stream = fdopen(fd,"r");
+ (void)flock(fd,LOCK_SH);
+ retval = etcd_get_1(stream,key);
+ (void)flock(fd,LOCK_UN);
+ fclose(stream); /* closes fd as well */
+
+ return retval;
+}
+
+
+etcd_result
+etcd_set_1 (FILE *stream, char *key, char *value,
+ char *precond, unsigned int ttl)
+{
+ char *str = NULL;
+ char tp[255];
+ size_t len;
+ unsigned long expires;
+
+ while(1) {
+ if(str) {
+ free(str);
+ str = NULL;
+ }
+ if (getline((char **)&str, &len,stream) == -1) {
+ break;
+ }
+ if (!strncmp(str, key, strlen(key))) {
+ char k[256], s[256];
+ sscanf(str,"%s %s %lu",k, s, &expires);
+ // check if the present key is expired
+ if (time(NULL) > expires) {
+ /* Keep looking for an unexpired entry. */
+ continue;
+ }
+ /*
+ * The only case in which we should fail here is if a
+ * precondition was specified and does not match the
+ * current (non-expired) value.
+ */
+ if (precond && strcmp(precond, s)) {
+ free(str);
+ return ETCD_WTF;
+ }
+ fseek(stream, -strlen(str), SEEK_CUR);
+ free(str);
+ goto here;
+ }
+ }
+here:
+ memset(tp, 0, 255);
+ sprintf(tp,"%*s %*s %*lu\n",
+ -MAX_KEY_LEN, key, -MAX_VALUE_LEN, value,
+ -MAX_EXPIRE_LEN, ttl ? time(NULL) + ttl : ~0);
+ if (fwrite(tp, 1,strlen(tp), stream) != strlen(tp)) {
+ return ETCD_WTF;
+ }
+ fflush(stream);
+ fsync(fileno(stream));
+ return ETCD_OK;
+}
+
+
+etcd_result
+etcd_set (etcd_session this, char *key, char *value,
+ char *precond, unsigned int ttl)
+{
+ etcd_sim_t *sim = (etcd_sim_t *)this;
+ int fd;
+ FILE *stream;
+ etcd_result retval;
+
+ fd = open(sim->path,O_RDWR);
+ if (fd < 0) {
+ return ETCD_WTF;
+ }
+
+ stream = fdopen(fd,"r+");
+ (void)flock(fd,LOCK_EX);
+ retval = etcd_set_1(stream,key,value,precond,ttl);
+ (void)flock(fd,LOCK_UN);
+ fclose(stream); /* closes fd as well */
+
+ return retval;
+}
+
+
+etcd_session
+etcd_open_str (char *server_names)
+{
+ etcd_sim_t *sim;
+ int fd;
+
+ sim = calloc(1, sizeof(etcd_sim_t));
+ (void)asprintf(&sim->path,"/tmp/%s",server_names);
+
+ fd = open(sim->path, O_RDWR | O_CREAT, 0777);
+ if (fd == -1) {
+ free(sim->path);
+ free(sim);
+ return NULL;
+ }
+
+ close(fd);
+ return ((void *)sim);
+}
+
+
+void
+etcd_close_str (etcd_session this)
+{
+ etcd_close(this);
+}
+
+etcd_result
+etcd_delete (etcd_session this, char *key)
+{
+ return ETCD_WTF;
+}
+
+char *
+etcd_leader (etcd_session this_as_void)
+{
+ return NULL;
+}
+
+etcd_result
+etcd_watch (etcd_session this, char *pfx, char **keyp, char **valuep,
+ int *index_in, int *index_out)
+{
+ return ETCD_WTF;
+}
+
+etcd_result
+etcd_lock (etcd_session session_as_void, char *key, unsigned int ttl,
+ char *index_in, char **index_out)
+{
+ char *path;
+ int fd;
+
+ if (!index_in) {
+ if (gf_asprintf(&path,"/var/tmp/%s",key) < 0) {
+ return ETCD_WTF;
+ }
+ fd = open(path,O_RDWR|O_CREAT,0666);
+ GF_FREE(path);
+ if (fd < 0) {
+ return ETCD_WTF;
+ }
+ if (flock(fd,LOCK_EX) < 0) {
+ close(fd);
+ return ETCD_WTF;
+ }
+ *index_out = strdup("42");
+ }
+
+ /*
+ * Yes, we leak an fd by not closing it here (and nobody else even
+ * knows about it). That would be awful in any other context, but
+ * for test scripts it won't matter.
+ */
+ return ETCD_OK;
+}
+
diff --git a/xlators/cluster/nsr-server/src/gen-fops.py b/xlators/cluster/nsr-server/src/gen-fops.py
new file mode 100755
index 000000000..1639f489c
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/gen-fops.py
@@ -0,0 +1,120 @@
+#!/usr/bin/python
+
+# This script generates the boilerplate versions of most fops and cbks in the
+# server. This allows the details of leadership-status checking, sequencing
+# between leader and followers (including fan-out), and basic error checking
+# to be centralized one place, with per-operation code kept to a minimum.
+
+import sys
+import codegen
+
+type_re = "([a-z_0-9]+)"
+name_re = "\(\*fop_([a-z0-9]+)_t\)"
+full_re = type_re + " *" + name_re
+fop_cg = codegen.CodeGenerator()
+fop_cg.skip = 2
+fop_cg.parse_decls(sys.argv[1],full_re)
+fop_cg.load_templates(sys.argv[2])
+
+# Use the multi-template feature to generate multiple callbacks from the same
+# parsed declarations.
+type_re = "([a-z_0-9]+)"
+name_re = "\(\*fop_([a-z0-9]+)_cbk_t\)"
+full_re = type_re + " *" + name_re
+cbk_cg = codegen.CodeGenerator()
+cbk_cg.skip = 5
+cbk_cg.parse_decls(sys.argv[1],full_re)
+cbk_cg.load_templates(sys.argv[2])
+
+# This is a nasty little trick to handle the case where a generated fop needs
+# a set of default arguments for the corresponding callback.
+fop_cg.make_defaults = cbk_cg.make_defaults
+
+# We need two types of templates. The first, for pure read operations, just
+# needs to do a simple am-i-leader check (augmented to allow dirty reads).
+# The second, for pure writes, needs to do fan-out to followers between those
+# initial checks and local execution. There are other operations that don't
+# fit neatly into either category - e.g. lock ops or fsync - so we'll just have
+# to handle those manually. The table thus includes entries only for those we
+# can categorize. The special cases, plus any new operations we've never even
+# heard of, aren't in there.
+#
+# Various keywords can be used to define/undefine preprocessor symbols used
+# in the templates, on a per-function basis. For example, if the keyword here
+# is "fsync" (lowercase word or abbreviation) that will cause NSR_CG_FSYNC
+# (prefix plus uppercase version) to be defined above all of the generated code
+# for that fop.
+
+fop_table = {
+ "access": "read",
+ "create": "write",
+ "discard": "write",
+# "entrylk": "read",
+ "fallocate": "write",
+# "fentrylk": "read",
+ "fgetxattr": "read",
+# "finodelk": "read",
+# "flush": "read",
+ "fremovexattr": "write",
+ "fsetattr": "write",
+ "fsetxattr": "write",
+ "fstat": "read",
+# "fsync": "read",
+# "fsyncdir": "read",
+ "ftruncate": "write",
+ "fxattrop": "write",
+ "getxattr": "read",
+# "inodelk": "read",
+ "link": "write",
+# "lk": "read",
+# "lookup": "read",
+ "mkdir": "write",
+ "mknod": "write",
+ "open": "write",
+ "opendir": "read",
+ "rchecksum": "read",
+ "readdir": "read",
+ "readdirp": "read",
+ "readlink": "read",
+ "readv": "read",
+ "removexattr": "write",
+ "rename": "write",
+ "rmdir": "write",
+ "setattr": "write",
+ "setxattr": "write",
+ "stat": "read",
+ "statfs": "read",
+ "symlink": "write",
+ "truncate": "write",
+ "unlink": "write",
+ "writev": "write,fsync,queue",
+ "xattrop": "write",
+}
+
+fops_done = []
+for x in sorted(fop_cg.decls.keys()):
+ if x in fop_table.keys():
+ info = fop_table[x].split(",")
+ kind = info[0]
+ flags = info[1:]
+ if ("fsync" in flags) or ("queue" in flags):
+ flags.append("need_fd")
+ for fname in flags:
+ print "#define NSR_CG_%s" % fname.upper()
+ cbk_cg.emit(x,kind+"-complete")
+ fop_cg.emit(x,kind+"-continue")
+ cbk_cg.emit(x,kind+"-fan-in")
+ fop_cg.emit(x,kind+"-dispatch")
+ fop_cg.emit(x,kind+"-fop")
+ for fname in flags:
+ print "#undef NSR_CG_%s" % fname.upper()
+ fops_done.append(x)
+ else:
+ print("/* No code emitted for %s */"%x)
+ print("")
+
+# Just for fun, emit the fops table too.
+print("struct xlator_fops fops = {")
+for x in fops_done:
+ print(" .%s = nsr_%s,"%(x,x))
+print("};")
diff --git a/xlators/cluster/nsr-server/src/leader.c b/xlators/cluster/nsr-server/src/leader.c
new file mode 100644
index 000000000..02a2609c8
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/leader.c
@@ -0,0 +1,138 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <regex.h>
+//#include <stdlib.h>
+#include <string.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+#include "api/src/glfs.h"
+#include "api/src/glfs-internal.h"
+
+#ifndef NSR_SIM_ETCD
+#include "etcd-api.h"
+#endif
+#include "nsr-internal.h"
+#include "../../nsr-recon/src/recon_driver.h"
+#include "../../nsr-recon/src/recon_xlator.h"
+
+#define NSR_TTL 5
+
+static void
+nsr_set_leader (xlator_t *this, etcd_session etcd)
+{
+ long term = 0;
+ etcd_result res;
+ nsr_private_t *priv = this->private;
+ char n_t[sizeof(long)+1];
+ char *text = NULL;
+
+ gf_log (this->name, GF_LOG_INFO, "Just became leader");
+
+ text = etcd_get(etcd, priv->term_key);
+ if(text == NULL) {
+ term = 0;
+ } else {
+ term = strtol(text, NULL, 10);
+ }
+ sprintf(n_t,"%ld",term+1);
+ res = etcd_set(etcd, priv->term_key,n_t,text,0);
+ if(res != ETCD_OK) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to set term");
+ return;
+ }
+ priv->leader = _gf_true;
+
+ priv->current_term = term + 1;
+
+ if (priv->nsr_recon_start == _gf_false) {
+ atomic_fetch_and(&(priv->fence_io), 0);
+ return;
+ }
+
+ // Move this inside recon notify???
+ atomic_fetch_or(&(priv->fence_io), 1);
+
+ nsr_recon_notify_event_set_leader(priv);
+
+ return;
+}
+
+void *
+nsr_leader_thread (void *arg)
+{
+ xlator_t *this = (xlator_t *) arg;
+ nsr_private_t *priv = this->private;
+ etcd_result res;
+ char *index_in = NULL;
+ char *index_out = NULL;
+
+ gf_log (this->name, GF_LOG_INFO,
+ "calling etcd_open_str on servers %s", priv->etcd_servers);
+
+ priv->etcd = etcd_open_str(priv->etcd_servers);
+ if (!(priv->etcd)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to open etcd session\n");
+ return NULL;
+ }
+
+ priv->leader_inited = 1;
+
+ for (;;) {
+ /* Not leader yet. Try to become leader. */
+ for (;;) {
+ res = etcd_lock (priv->etcd, priv->leader_key, NSR_TTL,
+ index_in, &index_out);
+ if (res == ETCD_OK) {
+ break;
+ }
+ gf_log (this->name, GF_LOG_WARNING,
+ "etcd_lock failed (%d)", res);
+ sleep(1);
+ }
+ /* We're there. Notify other parts of the code. */
+ nsr_set_leader(this,priv->etcd);
+ /* Try to retain leadership. */
+ index_in = index_out;
+ index_out = NULL;
+ for (;;) {
+ res = etcd_lock (priv->etcd, priv->leader_key, NSR_TTL,
+ index_in, &index_out);
+ if (index_out && (index_in != index_out)) {
+ if (index_in) {
+ free(index_in);
+ }
+ index_in = index_out;
+ index_out = NULL;
+ }
+ if (res != ETCD_OK) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "lost leadership (%d)", res);
+ if (index_out) {
+ free(index_out);
+ }
+ break;
+ }
+ sleep(1);
+ }
+ }
+
+ etcd_close_str(priv->etcd);
+ return NULL;
+}
+
diff --git a/xlators/cluster/nsr-server/src/nsr-internal.h b/xlators/cluster/nsr-server/src/nsr-internal.h
new file mode 100644
index 000000000..72b61bfa5
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/nsr-internal.h
@@ -0,0 +1,101 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#define LEADER_XATTR "user.nsr.leader"
+#define SECOND_CHILD(xl) (xl->children->next->xlator)
+
+enum {
+ gf_mt_nsr_private_t = gf_common_mt_end + 1,
+ gf_mt_nsr_fd_ctx_t,
+ gf_mt_nsr_inode_ctx_t,
+ gf_mt_nsr_dirty_t,
+ gf_mt_nsr_end
+};
+
+typedef enum nsr_recon_notify_ev_id_t {
+ NSR_RECON_SET_LEADER = 1,
+ NSR_RECON_ADD_CHILD = 2
+} nsr_recon_notify_ev_id_t;
+
+typedef struct _nsr_recon_notify_ev_s {
+ nsr_recon_notify_ev_id_t id;
+ uint32_t index; // in case of add
+ struct list_head list;
+} nsr_recon_notify_ev_t;
+
+typedef struct {
+ char *etcd_servers;
+ char *subvol_uuid;
+ char *leader_key;
+ char *term_key;
+ char *brick_uuid;
+ gf_boolean_t leader;
+ uint8_t up_children;
+ uint8_t n_children;
+ char *vol_file;
+ etcd_session etcd;
+ volatile unsigned int fence_io;
+ uint32_t current_term;
+#ifdef NSR_DEBUG
+ uint32_t leader_log_fd;
+#endif
+ volatile int recon_notify_inited;
+ volatile int leader_inited;
+ uint32_t kid_state;
+ gf_lock_t dirty_lock;
+ struct list_head dirty_fds;
+ gf_boolean_t nsr_recon_start;
+ void * recon_ctx;
+ volatile uint32_t ops_in_flight;
+ uint32_t index;
+ gf_lock_t index_lock;
+ double quorum_pct;
+} nsr_private_t;
+
+typedef struct {
+ call_stub_t *stub;
+ call_stub_t *qstub;
+ uint8_t call_count;
+ fd_t *fd;
+ struct list_head qlinks;
+} nsr_local_t;
+
+/*
+ * This should match whatever changelog returns on the pre-op for us to pass
+ * when we're ready for our post-op.
+ */
+typedef uint32_t log_id_t;
+
+typedef struct {
+ struct list_head links;
+ log_id_t id;
+} nsr_dirty_list_t;
+
+typedef struct {
+ fd_t *fd;
+ struct list_head dirty_list;
+ struct list_head fd_list;
+} nsr_fd_ctx_t;
+
+typedef struct {
+ gf_lock_t lock;
+ uint32_t active;
+ struct list_head aqueue;
+ uint32_t pending;
+ struct list_head pqueue;
+} nsr_inode_ctx_t;
+
+void nsr_recon_notify_event_set_leader(nsr_private_t *priv);
+void nsr_recon_notify_event_add_child(nsr_private_t *priv, uint32_t index);
+void* nsr_recon_notify_thread (void *this);
+
diff --git a/xlators/cluster/nsr-server/src/nsr.c b/xlators/cluster/nsr-server/src/nsr.c
new file mode 100644
index 000000000..85eba09b5
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/nsr.c
@@ -0,0 +1,812 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+#include "api/src/glfs.h"
+#include "api/src/glfs-internal.h"
+#include "run.h"
+#include "common-utils.h"
+#include "syncop.h"
+
+#include "etcd-api.h"
+#include "nsr-internal.h"
+#include "../../nsr-recon/src/recon_driver.h"
+#include "../../nsr-recon/src/recon_xlator.h"
+
+
+#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd"
+#define GLUSTERD_VOLUME_DIR_PREFIX "vols"
+#define GLUSTERD_BRICK_INFO_DIR "bricks"
+
+#define NSR_FLUSH_INTERVAL 5
+
+nsr_inode_ctx_t *
+nsr_get_inode_ctx (xlator_t *this, inode_t *inode)
+{
+ uint64_t ctx_int = 0LL;
+ nsr_inode_ctx_t *ctx_ptr;
+
+ if (__inode_ctx_get(inode,this,&ctx_int) == 0) {
+ ctx_ptr = (nsr_inode_ctx_t *)(long)ctx_int;
+ }
+ else {
+ ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr),
+ gf_mt_nsr_inode_ctx_t);
+ if (ctx_ptr) {
+ ctx_int = (uint64_t)(long)ctx_ptr;
+ if (__inode_ctx_set(inode,this,&ctx_int) == 0) {
+ LOCK_INIT(&ctx_ptr->lock);
+ INIT_LIST_HEAD(&ctx_ptr->aqueue);
+ INIT_LIST_HEAD(&ctx_ptr->pqueue);
+ }
+ else {
+ GF_FREE(ctx_ptr);
+ ctx_ptr = NULL;
+ }
+ }
+
+ }
+
+ return ctx_ptr;
+}
+
+nsr_fd_ctx_t *
+nsr_get_fd_ctx (xlator_t *this, fd_t *fd)
+{
+ uint64_t ctx_int = 0LL;
+ nsr_fd_ctx_t *ctx_ptr;
+
+ if (__fd_ctx_get(fd,this,&ctx_int) == 0) {
+ ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int;
+ }
+ else {
+ ctx_ptr = GF_CALLOC (1, sizeof(*ctx_ptr), gf_mt_nsr_fd_ctx_t);
+ if (ctx_ptr) {
+ if (__fd_ctx_set(fd,this,(uint64_t)ctx_ptr) == 0) {
+ INIT_LIST_HEAD(&ctx_ptr->dirty_list);
+ INIT_LIST_HEAD(&ctx_ptr->fd_list);
+ }
+ else {
+ GF_FREE(ctx_ptr);
+ ctx_ptr = NULL;
+ }
+ }
+
+ }
+
+ return ctx_ptr;
+}
+
+void
+nsr_mark_fd_dirty (xlator_t *this, nsr_local_t *local)
+{
+ fd_t *fd = local->fd;
+ nsr_fd_ctx_t *ctx_ptr;
+ nsr_dirty_list_t *dirty;
+ nsr_private_t *priv = this->private;
+
+ /*
+ * TBD: don't do any of this for O_SYNC/O_DIRECT writes.
+ * Unfortunately, that optimization requires that we distinguish
+ * between writev and other "write" calls, saving the original flags
+ * and checking them in the callback. Too much work for too little
+ * gain right now.
+ */
+
+ LOCK(&fd->lock);
+ ctx_ptr = nsr_get_fd_ctx(this,fd);
+ dirty = GF_CALLOC(1,sizeof(*dirty),gf_mt_nsr_dirty_t);
+ if (ctx_ptr && dirty) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "marking fd %p as dirty (%p)", fd, dirty);
+ /* TBD: fill dirty->id from what changelog gave us */
+ list_add_tail(&dirty->links,&ctx_ptr->dirty_list);
+ if (list_empty(&ctx_ptr->fd_list)) {
+ /* Add a ref so _release doesn't get called. */
+ ctx_ptr->fd = fd_ref(fd);
+ LOCK(&priv->dirty_lock);
+ list_add_tail (&ctx_ptr->fd_list,
+ &priv->dirty_fds);
+ UNLOCK(&priv->dirty_lock);
+ }
+ }
+ else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not mark %p dirty", fd);
+ if (ctx_ptr) {
+ GF_FREE(ctx_ptr);
+ }
+ if (dirty) {
+ GF_FREE(dirty);
+ }
+ }
+ UNLOCK(&fd->lock);
+}
+
+#define NSR_TERM_XATTR "trusted.nsr.term"
+#define NSR_INDEX_XATTR "trusted.nsr.index"
+#define RECON_TERM_XATTR "trusted.nsr.recon-term"
+#define RECON_INDEX_XATTR "trusted.nsr.recon-index"
+#define NSR_REP_COUNT_XATTR "trusted.nsr.rep-count"
+#include "nsr-cg.c"
+
+uint8_t
+nsr_count_up_kids (nsr_private_t *priv)
+{
+ uint8_t retval = 0;
+ uint8_t i;
+
+ for (i = 0; i < priv->n_children; ++i) {
+ if (priv->kid_state & (1 << i)) {
+ ++retval;
+ }
+ }
+
+ return retval;
+}
+
+/*
+ * The fsync machinery looks a lot like that for any write call, but there are
+ * some important differences that are easy to miss. First, we don't care
+ * about the xdata that shows whether the call came from a leader or
+ * reconciliation process. If we're the leader we fan out; if we're not we
+ * don't. Second, we don't wait for followers before we issue the local call.
+ * The code generation system could be updated to handle this, and still might
+ * if we need to implement other "almost identical" paths (e.g. for open), but
+ * a copy is more readable as long as it's just one.
+ */
+
+int32_t
+nsr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ nsr_local_t *local = frame->local;
+ gf_boolean_t unwind;
+
+ LOCK(&frame->lock);
+ unwind = !--(local->call_count);
+ UNLOCK(&frame->lock);
+
+ if (unwind) {
+ STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ }
+ return 0;
+}
+
+int32_t
+nsr_fsync_local_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ nsr_dirty_list_t *dirty;
+ nsr_dirty_list_t *dtmp;
+ nsr_local_t *local = frame->local;
+
+ list_for_each_entry_safe (dirty, dtmp, &local->qlinks, links) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "sending post-op on %p (%p)", local->fd, dirty);
+ GF_FREE(dirty);
+ }
+
+ return nsr_fsync_cbk (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+}
+
+int32_t
+nsr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ dict_t *xdata)
+{
+ nsr_private_t *priv = this->private;
+ nsr_local_t *local;
+ uint64_t ctx_int = 0LL;
+ nsr_fd_ctx_t *ctx_ptr;
+ xlator_list_t *trav;
+
+ local = mem_get0(this->local_pool);
+ if (!local) {
+ STACK_UNWIND_STRICT(fsync,frame,-1,ENOMEM,NULL,NULL,xdata);
+ return 0;
+ }
+ INIT_LIST_HEAD(&local->qlinks);
+ frame->local = local;
+
+ /* Move the dirty list from the fd to the fsync request. */
+ LOCK(&fd->lock);
+ if (__fd_ctx_get(fd,this,&ctx_int) == 0) {
+ ctx_ptr = (nsr_fd_ctx_t *)(long)ctx_int;
+ list_splice_init (&ctx_ptr->dirty_list,
+ &local->qlinks);
+ }
+ UNLOCK(&fd->lock);
+
+ /* Issue the local call. */
+ local->call_count = priv->leader ? priv->n_children : 1;
+ STACK_WIND (frame, nsr_fsync_local_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync,
+ fd, flags, xdata);
+
+ /* Issue remote calls if we're the leader. */
+ if (priv->leader) {
+ for (trav = this->children->next; trav; trav = trav->next) {
+ STACK_WIND (frame, nsr_fsync_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsync,
+ fd, flags, xdata);
+ }
+ }
+
+ return 0;
+}
+
+int32_t
+nsr_getxattr_special (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ dict_t *result;
+ nsr_private_t *priv = this->private;
+
+ if (!priv->leader) {
+ STACK_UNWIND_STRICT (getxattr, frame, -1, EREMOTE, NULL, NULL);
+ return 0;
+ }
+
+ if (!name || (strcmp(name,NSR_REP_COUNT_XATTR) != 0)) {
+ STACK_WIND_TAIL (frame,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr,
+ loc, name, xdata);
+ return 0;
+ }
+
+ result = dict_new();
+ if (!result) {
+ goto dn_failed;
+ }
+
+ priv->up_children = nsr_count_up_kids(this->private);
+ if (dict_set_uint32(result,NSR_REP_COUNT_XATTR,priv->up_children) != 0) {
+ goto dsu_failed;
+ }
+
+ STACK_UNWIND_STRICT (getxattr, frame, 0, 0, result, NULL);
+ dict_destroy(result);
+ return 0;
+
+dsu_failed:
+ dict_destroy(result);
+dn_failed:
+ STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL);
+ return 0;
+}
+
+void
+nsr_flush_fd (xlator_t *this, nsr_fd_ctx_t *fd_ctx)
+{
+ nsr_dirty_list_t *dirty;
+ nsr_dirty_list_t *dtmp;
+
+ list_for_each_entry_safe (dirty, dtmp, &fd_ctx->dirty_list, links) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "sending post-op on %p (%p)", fd_ctx->fd, dirty);
+ GF_FREE(dirty);
+ }
+
+ INIT_LIST_HEAD(&fd_ctx->dirty_list);
+}
+
+void *
+nsr_flush_thread (void *ctx)
+{
+ xlator_t *this = ctx;
+ nsr_private_t *priv = this->private;
+ struct list_head dirty_fds;
+ nsr_fd_ctx_t *fd_ctx;
+ nsr_fd_ctx_t *fd_tmp;
+ int ret;
+
+ for (;;) {
+ /*
+ * We have to be very careful to avoid lock inversions here, so
+ * we can't just hold priv->dirty_lock while we take and
+ * release locks for each fd. Instead, we only hold dirty_lock
+ * at the beginning of each iteration, as we (effectively) make
+ * a copy of the current list head and then clear the original.
+ * This leads to four scenarios for adding the first entry to
+ * an fd and potentially putting it on the global list.
+ *
+ * (1) While we're asleep. No lock contention, it just gets
+ * added and will be processed on the next iteration.
+ *
+ * (2) After we've made a local copy, but before we've started
+ * processing that fd. The new entry will be added to the
+ * fd (under its lock), and we'll process it on the current
+ * iteration.
+ *
+ * (3) While we're processing the fd. They'll block on the fd
+ * lock, then see that the list is empty and put it on the
+ * global list. We'll process it here on the next
+ * iteration.
+ *
+ * (4) While we're working, but after we've processed that fd.
+ * Same as (1) as far as that fd is concerned.
+ */
+ INIT_LIST_HEAD(&dirty_fds);
+ LOCK(&priv->dirty_lock);
+ list_splice_init(&priv->dirty_fds,&dirty_fds);
+ UNLOCK(&priv->dirty_lock);
+
+ list_for_each_entry_safe (fd_ctx, fd_tmp, &dirty_fds, fd_list) {
+ ret = syncop_fsync(FIRST_CHILD(this),fd_ctx->fd,0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to fsync %p (%d)",
+ fd_ctx->fd, -ret);
+ }
+
+ LOCK(&fd_ctx->fd->lock);
+ nsr_flush_fd(this,fd_ctx);
+ list_del_init(&fd_ctx->fd_list);
+ UNLOCK(&fd_ctx->fd->lock);
+ fd_unref(fd_ctx->fd);
+ }
+
+ sleep(NSR_FLUSH_INTERVAL);
+ }
+
+ return NULL;
+}
+
+int32_t
+nsr_forget (xlator_t *this, inode_t *inode)
+{
+ uint64_t ctx = 0LL;
+
+ if ((inode_ctx_del(inode,this,&ctx) == 0) && ctx) {
+ GF_FREE((void *)(long)ctx);
+ }
+
+ return 0;
+}
+
+int32_t
+nsr_release (xlator_t *this, fd_t *fd)
+{
+ uint64_t ctx = 0LL;
+
+ if ((fd_ctx_del(fd,this,&ctx) == 0) && ctx) {
+ GF_FREE((void *)(long)ctx);
+ }
+
+ return 0;
+}
+
+struct xlator_cbks cbks = {
+ .forget = nsr_forget,
+ .release = nsr_release,
+};
+
+int
+nsr_reconfigure (xlator_t *this, dict_t *options)
+{
+ nsr_private_t *priv = this->private;
+
+ GF_OPTION_RECONF ("leader", priv->leader, options, bool, err);
+ gf_log (this->name, GF_LOG_INFO,
+ "reconfigure called. setting priv->leader to %d\n", priv->leader);
+ return 0;
+
+err:
+ return -1;
+}
+
+int
+nsr_get_child_index (xlator_t *this, xlator_t *kid)
+{
+ xlator_list_t *trav;
+ int retval = -1;
+
+ for (trav = this->children; trav; trav = trav->next) {
+ ++retval;
+ if (trav->xlator == kid) {
+ return retval;
+ }
+ }
+
+ return -1;
+}
+
+/*
+ * Child notify handling is unreasonably FUBAR. Sometimes we'll get a
+ * CHILD_DOWN for a protocol/client child before we ever got a CHILD_UP for it.
+ * Other times we won't. Because it's effectively random (probably racy), we
+ * can't just maintain a count. We actually have to keep track of the state
+ * for each child separately, to filter out the bogus CHILD_DOWN events, and
+ * then generate counts on demand.
+ */
+int
+nsr_notify (xlator_t *this, int event, void *data, ...)
+{
+ nsr_private_t *priv = this->private;
+ int index;
+
+ switch (event) {
+ case GF_EVENT_CHILD_UP:
+ index = nsr_get_child_index(this,data);
+ if (index >= 0) {
+ priv->kid_state |= (1 << index);
+ priv->up_children = nsr_count_up_kids(priv);
+ gf_log (this->name, GF_LOG_INFO,
+ "got CHILD_UP for %s, now %u kids",
+ ((xlator_t *)data)->name,
+ priv->up_children);
+ if (priv->nsr_recon_start == _gf_true) {
+ nsr_recon_notify_event_add_child(priv, index);
+ }
+ }
+ break;
+ case GF_EVENT_CHILD_DOWN:
+ index = nsr_get_child_index(this,data);
+ if (index >= 0) {
+ priv->kid_state &= ~(1 << index);
+ priv->up_children = nsr_count_up_kids(priv);
+ gf_log (this->name, GF_LOG_INFO,
+ "got CHILD_DOWN for %s, now %u kids",
+ ((xlator_t *)data)->name,
+ priv->up_children);
+ }
+ break;
+ default:
+ ;
+ }
+
+ return default_notify(this,event,data);
+}
+
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO ("nsr", this, out);
+
+ ret = xlator_mem_acct_init (this, gf_mt_nsr_end + 1);
+
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Memory accounting init" "failed");
+ return ret;
+ }
+out:
+ return ret;
+}
+
+
+extern void *nsr_leader_thread (void *);
+
+void
+nsr_deallocate_priv (nsr_private_t *priv)
+{
+ if (!priv) {
+ return;
+ }
+
+ if (priv->leader_key) {
+ GF_FREE(priv->leader_key);
+ }
+
+ if (priv->term_key) {
+ GF_FREE(priv->term_key);
+ }
+
+ GF_FREE(priv);
+}
+
+
+int32_t
+nsr_init (xlator_t *this)
+{
+ xlator_list_t *remote;
+ xlator_list_t *local;
+ nsr_private_t *priv = NULL;
+ xlator_list_t *trav;
+ pthread_t kid;
+ uuid_t tmp_uuid;
+ char *my_name = NULL, *morph_name = NULL, *recon_file = NULL, *recon_pid_file = NULL, *ptr = NULL;
+ char *volname;
+ extern xlator_t global_xlator;
+ glusterfs_ctx_t *oldctx = global_xlator.ctx;
+ runner_t runner = {0,};
+ int32_t ret = -1;
+ struct stat buf;
+ char *recon_log = NULL, *recon_log_dir = NULL;
+
+ /*
+ * Any fop that gets special treatment has to be patched in here,
+ * because the compiled-in table is produced by the code generator and
+ * only contains generated functions. Note that we have to go through
+ * this->fops because of some dynamic-linking strangeness; modifying
+ * the static table doesn't work.
+ */
+ this->fops->getxattr = nsr_getxattr_special;
+ this->fops->fsync = nsr_fsync;
+
+ local = this->children;
+ if (!local) {
+ gf_log (this->name, GF_LOG_ERROR, "no local subvolume");
+ goto err;
+ }
+
+ remote = local->next;
+ if (!remote) {
+ gf_log (this->name, GF_LOG_ERROR, "no remote subvolumes");
+ goto err;
+ }
+
+ this->local_pool = mem_pool_new (nsr_local_t, 128);
+ if (!this->local_pool) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to create nsr_local_t pool");
+ goto err;
+ }
+
+ priv = GF_CALLOC (1, sizeof(*priv), gf_mt_nsr_private_t);
+ if (!priv) {
+ gf_log (this->name, GF_LOG_ERROR, "could not allocate priv");
+ goto err;
+ }
+
+ // set this so that unless leader election is done, IO is fenced
+ priv->fence_io = 1;
+
+ for (trav = this->children; trav; trav = trav->next) {
+ ++(priv->n_children);
+ }
+
+ LOCK_INIT(&priv->dirty_lock);
+ LOCK_INIT(&priv->index_lock);
+ INIT_LIST_HEAD(&priv->dirty_fds);
+
+ this->private = priv;
+
+ GF_OPTION_INIT ("etcd-servers", priv->etcd_servers, str, err);
+ if (!priv->etcd_servers) {
+ gf_log (this->name, GF_LOG_ERROR, "etcd servers not generated. ???");
+ goto err;
+ }
+
+
+ GF_OPTION_INIT ("quorum-percent", priv->quorum_pct, percent, err);
+
+ GF_OPTION_INIT ("subvol-uuid", priv->subvol_uuid, str, err);
+ gf_log (this->name, GF_LOG_INFO, "subvol_uuid = %s", priv->subvol_uuid);
+ if (gf_asprintf(&priv->leader_key,"%s:leader",priv->subvol_uuid) <= 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not generate leader key");
+ goto err;
+ }
+ if (gf_asprintf(&priv->term_key,"%s:term",priv->subvol_uuid) <= 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not generate term key");
+ goto err;
+ }
+ uuid_generate(tmp_uuid);
+ priv->brick_uuid = strdup(uuid_utoa(tmp_uuid));
+ gf_log (this->name, GF_LOG_INFO, "brick_uuid = %s\n", priv->brick_uuid);
+
+ GF_OPTION_INIT ("my-name", my_name, str, err);
+ if (!my_name) {
+ gf_log (this->name, GF_LOG_ERROR, "brick name not generated. ???");
+ goto err;
+ }
+ GF_OPTION_INIT ("vol-name", volname, str, err);
+ if (!volname) {
+ gf_log (this->name, GF_LOG_ERROR, "vol name not generated. ???");
+ goto err;
+ }
+
+ morph_name = GF_CALLOC (1, strlen(my_name) + 1, gf_mt_nsr_private_t);
+ strcpy(morph_name, my_name);
+ recon_file = GF_CALLOC (1,PATH_MAX + strlen(morph_name) + strlen("con") +1, gf_mt_nsr_private_t);
+ recon_pid_file = GF_CALLOC (1,PATH_MAX + strlen(morph_name) + strlen("recon") +1, gf_mt_nsr_private_t);
+ if ((!recon_file) || (!recon_pid_file)) {
+ gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name");
+ goto err;
+ }
+ ptr = strchr (morph_name, '/');
+ while (ptr) {
+ *ptr = '-';
+ ptr = strchr (morph_name, '/');
+ }
+
+ sprintf(recon_file,"/%s/%s/%s/%s/",GLUSTERD_DEFAULT_WORKDIR,
+ GLUSTERD_VOLUME_DIR_PREFIX,
+ volname,
+ GLUSTERD_BRICK_INFO_DIR);
+ strcat(recon_file, morph_name);
+ strcat(recon_file, "-nsr-recon.vol");
+
+ sprintf(recon_pid_file,"/%s/%s/%s/%s/",GLUSTERD_DEFAULT_WORKDIR,
+ GLUSTERD_VOLUME_DIR_PREFIX,
+ volname,
+ "run");
+ strcat(recon_pid_file, morph_name);
+ strcat(recon_pid_file, "-recon.pid");
+
+ priv->vol_file = GF_CALLOC (1,PATH_MAX + strlen(morph_name) + strlen("con") +1, gf_mt_nsr_private_t);
+ if (!priv->vol_file) {
+ gf_log (this->name, GF_LOG_ERROR, "could not allocate reconciliation file name");
+ goto err;
+ }
+ sprintf(priv->vol_file,"%s/%s/%s/%s/",
+ GLUSTERD_DEFAULT_WORKDIR,
+ GLUSTERD_VOLUME_DIR_PREFIX,
+ volname,
+ GLUSTERD_BRICK_INFO_DIR);
+ strcat(priv->vol_file, "con:");
+ strcat(priv->vol_file, morph_name);
+
+ if (pthread_create(&kid,NULL,nsr_flush_thread,this) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not start flush thread");
+ /* TBD: treat this as a fatal error? */
+ }
+
+ // Start the recon process. Then start the leader thread.
+ /*
+ * REVIEW
+ * Logs belong in /var/log not /tmp.
+ */
+
+ ret = mkdir (NSR_LOG_DIR, 0777);
+ if (ret != 0) {
+ if (errno != EEXIST) {
+ gf_log (this->name, GF_LOG_ERROR, "Couldn't create"
+ " nsr log directory (%s)", strerror (errno));
+ goto err;
+ }
+ }
+
+ recon_log_dir = GF_CALLOC (1, strlen (NSR_LOG_DIR) + strlen(morph_name)
+ + 2, gf_mt_nsr_private_t);
+ if (!recon_log_dir) {
+ gf_log (this->name, GF_LOG_ERROR, "Couldn't allocate recon log "
+ "dir name");
+ goto err;
+ }
+ sprintf (recon_log_dir, "%s/%s", NSR_LOG_DIR, morph_name);
+ ret = mkdir (recon_log_dir, 0777);
+
+ if (ret != 0){
+ if (errno != EEXIST) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Couldn't create brick log dir (%s)",
+ strerror (errno));
+ goto err;
+ }
+ }
+
+ recon_log = GF_CALLOC (1, strlen (recon_log_dir)+
+ strlen ("reconciliation.log") + 2,
+ gf_mt_nsr_private_t);
+ if (!recon_log) {
+ gf_log (this->name, GF_LOG_ERROR, "Couldn't allocate recon log"
+ " file name");
+ goto err;
+ }
+ sprintf (recon_log, "%s/reconciliation.log", recon_log_dir);
+
+ if (!stat(priv->vol_file, &buf)) {
+
+ runinit (&runner);
+ runner_add_args(&runner, SBIN_DIR "/glusterfs",
+ "-f", recon_file,
+ "-p", recon_pid_file,
+ "-l", recon_log,
+ NULL);
+ ret = runner_run (&runner);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not exec reconciliation process %s",
+ SBIN_DIR "/glusterfs");
+ goto err;
+ }
+
+ // TBD - convert this to make sure recon process runs
+ sleep(2);
+ priv->nsr_recon_start = _gf_true;
+ }
+
+
+ (void)pthread_create(&kid,NULL,nsr_recon_notify_thread,this);
+ while (priv->recon_notify_inited == 0) {
+ sleep(1);
+ }
+
+ if (pthread_create(&kid,NULL,nsr_leader_thread,this) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to start leader thread");
+ }
+ while (priv->leader_inited == 0) {
+ sleep(1);
+ }
+
+
+ /*
+ * Calling glfs_new changes old->ctx, even if THIS still points
+ * to global_xlator. That causes problems later in the main
+ * thread, when gf_log_dump_graph tries to use the FILE after
+ * we've mucked with it and gets a segfault in __fprintf_chk.
+ * We can avoid all that by undoing the damage before we
+ * continue.
+ */
+ global_xlator.ctx = oldctx;
+
+ return 0;
+
+err:
+ nsr_deallocate_priv(priv);
+ return -1;
+}
+
+
+void
+nsr_fini (xlator_t *this)
+{
+ nsr_deallocate_priv(this->private);
+}
+
+class_methods_t class_methods = {
+ .init = nsr_init,
+ .fini = nsr_fini,
+ .reconfigure = nsr_reconfigure,
+ .notify = nsr_notify,
+};
+
+struct volume_options options[] = {
+ { .key = {"leader"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "false",
+ .description = "Start in the leader role. This is only for "
+ "bootstrapping the code, and should go away when we "
+ "have real leader election."
+ },
+ { .key ={"vol-name"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "volume name"
+ },
+ { .key = {"my-name"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "brick name in form of host:/path"
+ },
+ { .key = {"etcd-servers"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "list of comma seperated etc servers"
+ },
+ { .key = {"subvol-uuid"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "UUID for this NSR (sub)volume"
+ },
+ { .key = {"quorum-percent"},
+ .type = GF_OPTION_TYPE_PERCENT,
+ .default_value = "50.0",
+ .description = "percentage of rep_count-1 that must be up"
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/cluster/nsr-server/src/recon_notify.c b/xlators/cluster/nsr-server/src/recon_notify.c
new file mode 100644
index 000000000..1c50de234
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/recon_notify.c
@@ -0,0 +1,389 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <string.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "call-stub.h"
+#include "defaults.h"
+#include "xlator.h"
+#include "api/src/glfs.h"
+#include "api/src/glfs-internal.h"
+#include "etcd-api.h"
+#include "nsr-internal.h"
+#include "../../nsr-recon/src/recon_driver.h"
+#include "../../nsr-recon/src/recon_xlator.h"
+
+
+
+typedef struct _nsr_recon_notify_ctx_t {
+ nsr_recon_notify_ev_t recon_head;
+ pthread_mutex_t recon_mutex;
+ pthread_cond_t recon_cv;
+ char **hosts; // list of hosts ordered depending on child indices
+ uint32_t current_term;
+ uint32_t last_reconciled_term;
+ glfs_t *fs;
+ glfs_fd_t *fd;
+} nsr_recon_notify_ctx_t;
+
+static int
+xlator_get_option (xlator_t *xl, char *key, char **value)
+{
+ GF_ASSERT (xl);
+ return dict_get_str (xl->options, key, value);
+}
+
+void nsr_recon_notify_event_set_leader(nsr_private_t *priv)
+{
+ nsr_recon_notify_ev_t *ev;
+ nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx;
+
+ ev = GF_CALLOC (1, sizeof (nsr_recon_notify_ev_t), 0);
+ ev->id = NSR_RECON_SET_LEADER;
+ INIT_LIST_HEAD(&(ev->list));
+ pthread_mutex_lock(&ctx->recon_mutex);
+ list_add_tail(&ev->list, &ctx->recon_head.list);
+ pthread_cond_signal(&ctx->recon_cv);
+ pthread_mutex_unlock(&ctx->recon_mutex);
+}
+
+void nsr_recon_notify_event_add_child(nsr_private_t *priv, uint32_t index)
+{
+ nsr_recon_notify_ev_t *ev;
+ nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx;
+
+ ev = GF_CALLOC (1, sizeof (nsr_recon_notify_ev_t), 0);
+ ev->id = NSR_RECON_ADD_CHILD;
+ ev->index = index;
+ INIT_LIST_HEAD(&(ev->list));
+ pthread_mutex_lock(&ctx->recon_mutex);
+ list_add_tail(&ev->list, &ctx->recon_head.list);
+ pthread_cond_signal(&ctx->recon_cv);
+ pthread_mutex_unlock(&ctx->recon_mutex);
+}
+
+
+static void
+nsr_recon_set_leader (xlator_t *this)
+{
+
+ nsr_private_t *priv = this->private;
+ nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx;
+ nsr_recon_role_t role;
+ xlator_t *old = this;
+ uint32_t i=0;
+
+ if (priv->leader != _gf_true)
+ return;
+
+ if (ctx->last_reconciled_term == priv->current_term)
+ return;
+
+ /*
+ * Quorum for reconciliation is not the same as quorum for I/O. Here,
+ * we require a true majority. The +1 is because we don't count
+ * ourselves as part of n_children or up_children.
+ *
+ * TBD: re-evaluate when to reconcile (including partial)
+ */
+ if (priv->up_children <= (priv->n_children / 2))
+ return;
+
+ gf_log (this->name, GF_LOG_INFO,
+ "Sending message to do recon with %d nodes\n",
+ priv->up_children);
+
+ role.num = 0;
+ role.role = leader;
+ for (i = 0; i < priv->n_children; ++i) {
+ if (priv->kid_state & (1 << i)) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Recon using host %s",
+ ctx->hosts[i]);
+ strcpy(role.info[role.num].name, ctx->hosts[i]);
+ (role.num)++;
+ }
+ }
+
+ gf_log (this->name, GF_LOG_INFO,
+ "setting current term as %d", priv->current_term);
+ role.current_term = priv->current_term;
+ ENDIAN_CONVERSION_RR(role, _gf_false);
+
+ // inform the reconciliator that this is leader
+ // in the callback (once reconciliation is done),
+ // we will unfence the IOs.
+ // TBD - error handling later.
+ if (glfs_lseek(ctx->fd, nsr_recon_xlator_sector_1, SEEK_SET) == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "doing lseek failed\n");
+ return;
+ }
+
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_INFO,
+ "Writing to local node to set leader");
+ do {
+ if (priv->leader != _gf_true) {
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_ERROR, "no longer leader\n");
+ return;
+ }
+ if (glfs_write(ctx->fd, &role, sizeof(role), 0) == -1) {
+ if (errno == EAGAIN) {
+ // Wait for old reconciliation to bail out.
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_ERROR,
+ "write failed with retry. retrying after some time\n");
+ sleep(5);
+ continue;
+ }
+ else{
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_ERROR,
+ "doing write failed\n");
+ // This is because reconciliation has returned with error
+ // because some node has died in between.
+ // What should be done? Either we retry being leader
+ // or hook to CHILD_DOWN notification.
+ // Put that logic later. As of now we will just retry.
+ // This is easier.
+ sleep(5);
+ continue;
+ }
+ } else {
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_INFO, "doing write with success\n");
+ break;
+ }
+ } while(1);
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_INFO,
+ "glfs_write returned. unfencing IO\n");
+
+ // TBD - error handling
+
+ ctx->last_reconciled_term = priv->current_term;
+ priv->index = 0; // reset changelog index
+ atomic_fetch_and(&(priv->fence_io), 0);
+
+ return;
+}
+
+static void
+nsr_recon_add_child (xlator_t *this, uint32_t index)
+{
+ nsr_private_t *priv = this->private;
+ nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx;
+ nsr_recon_role_t role;
+ xlator_t *old = this;
+
+ if (priv->leader != _gf_true)
+ return;
+
+ // reconciliation still pending.
+ // Check if we have majority
+ if (ctx->last_reconciled_term != priv->current_term) {
+ nsr_recon_set_leader(this);
+ } else {
+ // Reconciliation done.
+ // new child joining the majority/
+ // Do reconciliation only fot this child but after fencing new IO and draining old IO
+ role.num = 1;
+ role.role = joiner;
+
+ atomic_fetch_or(&(priv->fence_io), 1);
+ while(priv->ops_in_flight) {
+ sleep(1);
+ }
+
+ strcpy(role.info[0].name, ctx->hosts[index]);
+ role.current_term = priv->current_term;
+ ENDIAN_CONVERSION_RR(role, _gf_false);
+ glfs_lseek(ctx->fd, nsr_recon_xlator_sector_1, SEEK_SET);
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_INFO,
+ "Writing to local node to join %s\n", role.info[0].name);
+ glfs_write(ctx->fd, &role,
+ sizeof(role), 0);
+ glusterfs_this_set(old);
+ gf_log (this->name, GF_LOG_INFO,
+ "Write to local node to set joiner returned\n");
+
+ // TBD - error handling
+ atomic_fetch_and(&(priv->fence_io), 0);
+ }
+
+ return;
+}
+
+static uint32_t
+nsr_setup_recon (xlator_t *this)
+{
+ nsr_private_t *priv = this->private;
+ xlator_t *old = this;
+ uint32_t ret = 0;
+ nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx;
+
+ if (priv->nsr_recon_start == _gf_false)
+ return 0;
+
+ ctx->fs = glfs_new(priv->subvol_uuid);
+ if (!ctx->fs) {
+ ret = 1;
+ gf_log (this->name, GF_LOG_ERROR, "failed to initialise glfs \n");
+ goto done;
+ }
+
+ glusterfs_this_set(old);
+ ret = glfs_set_volfile(ctx->fs, priv->vol_file);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to set volfile \n");
+ goto done;
+ }
+
+ glusterfs_this_set(old);
+ /*
+ * REVIEW
+ * Logs belong in /var/log not /tmp.
+ */
+ glfs_set_logging (ctx->fs,"/tmp/glfs-log", 7);
+ if (glfs_init(ctx->fs) < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to init volfile \n");
+ ret = 1;
+ goto done;
+ }
+
+ glusterfs_this_set(old);
+ ctx->fd = glfs_open (ctx->fs, "/", O_RDWR);
+ if (ctx->fd == NULL) {
+ ret = 1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to open fd to communicate with recon process \n");
+ goto done;
+ }
+
+
+done:
+ glusterfs_this_set(old);
+ return ret;
+}
+
+
+static void
+nsr_setup_hosts(xlator_t *this)
+{
+ xlator_list_t *trav;
+ nsr_private_t *priv = this->private;
+ uint32_t i = 0;
+ nsr_recon_notify_ctx_t *ctx = (nsr_recon_notify_ctx_t *)priv->recon_ctx;
+
+ ctx->hosts = GF_CALLOC(sizeof(char *), priv->n_children, gf_mt_nsr_private_t);
+ // Iterate thru all the children
+ for (trav = this->children; trav; trav = trav->next) {
+ char *hostname = NULL, *vol = NULL;
+ int ret1 = 0, ret2 = 0, ret = 0;
+ xlator_t *xl = trav->xlator;
+ // If the child type is that of protocol/client
+ if (!strcmp(trav->xlator->type, "protocol/client")) {
+ ret1 = xlator_get_option (xl, "remote-host", &hostname);
+ ret2 = xlator_get_option (xl, "remote-subvolume", &vol);
+ if (!ret1 && !ret2) {
+ // add the name of that host to the hosts
+ ctx->hosts[i] = GF_CALLOC(sizeof(char), strlen(hostname) + strlen(vol) + 2, 0);
+ strcpy(ctx->hosts[i], hostname);
+ strcat(ctx->hosts[i], ":");
+ strcat(ctx->hosts[i], vol);
+ gf_log (this->name, GF_LOG_INFO,
+ "adding hosts %s to recon notfiy list", ctx->hosts[i]);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "CANNOT FIND HOSTNAME FOR A CHILD");
+ GF_ASSERT(0);
+ }
+ // local brick
+ } else {
+ ret = xlator_get_option (this, "my-name", &hostname);
+ if (!ret) {
+ uint32_t len = strlen(hostname);
+ ctx->hosts[i] = GF_CALLOC(sizeof(char),
+ len+1,
+ gf_mt_nsr_private_t);
+ strcpy(ctx->hosts[i], hostname);
+ gf_log (this->name, GF_LOG_INFO,
+ "adding my host %s to recon notfiy list", ctx->hosts[i]);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "CANNOT FIND MY HOSTNAME");
+ GF_ASSERT(0);
+ }
+ }
+ i++;
+ }
+}
+
+void *
+nsr_recon_notify_thread (void *arg)
+{
+ xlator_t *this = (xlator_t *)arg;
+ nsr_private_t *priv = this->private;
+ nsr_recon_notify_ev_t *ev;
+ nsr_recon_notify_ctx_t *ctx;
+
+ priv->recon_ctx = GF_CALLOC(1, sizeof(nsr_recon_notify_ctx_t), gf_mt_nsr_private_t);
+ if (!priv->recon_ctx) {
+ gf_log (this->name, GF_LOG_ERROR, "calloc error");
+ return NULL;
+ }
+ ctx = priv->recon_ctx;
+
+ pthread_mutex_init(&(ctx->recon_mutex), NULL);
+ pthread_cond_init(&(ctx->recon_cv), NULL);
+ INIT_LIST_HEAD(&(ctx->recon_head.list));
+
+ nsr_setup_hosts(this);
+
+ if (nsr_setup_recon(this)) {
+ gf_log (this->name, GF_LOG_ERROR, "recon notify thread : initing glfs error");
+ return NULL;
+ }
+
+ priv->recon_notify_inited = 1;
+
+ while(1) {
+ pthread_mutex_lock(&ctx->recon_mutex);
+ while (list_empty(&(ctx->recon_head.list))) {
+ pthread_cond_wait(&ctx->recon_cv, &ctx->recon_mutex);
+ }
+ pthread_mutex_unlock(&ctx->recon_mutex);
+
+ list_for_each_entry(ev, &(ctx->recon_head.list), list) {
+
+ if (ev->id == NSR_RECON_SET_LEADER) {
+ gf_log (this->name, GF_LOG_INFO,
+ "got add leader notfiy event");
+ nsr_recon_set_leader(this);
+ } else if (ev->id == NSR_RECON_ADD_CHILD) {
+ gf_log (this->name, GF_LOG_INFO,
+ "got add child notify event");
+ nsr_recon_add_child(this, ev->index);
+ }
+ }
+ list_del_init (&ev->list);
+ }
+
+ return NULL;
+}
+
diff --git a/xlators/cluster/nsr-server/src/yajl.c b/xlators/cluster/nsr-server/src/yajl.c
new file mode 100644
index 000000000..54e6474fc
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl/yajl_parse.h"
+#include "yajl_lex.h"
+#include "yajl_parser.h"
+#include "yajl_alloc.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+
+const char *
+yajl_status_to_string(yajl_status stat)
+{
+ const char * statStr = "unknown";
+ switch (stat) {
+ case yajl_status_ok:
+ statStr = "ok, no error";
+ break;
+ case yajl_status_client_canceled:
+ statStr = "client canceled parse";
+ break;
+ case yajl_status_error:
+ statStr = "parse error";
+ break;
+ }
+ return statStr;
+}
+
+yajl_handle
+yajl_alloc(const yajl_callbacks * callbacks,
+ yajl_alloc_funcs * afs,
+ void * ctx)
+{
+ yajl_handle hand = NULL;
+ yajl_alloc_funcs afsBuffer;
+
+ /* first order of business is to set up memory allocation routines */
+ if (afs != NULL) {
+ if (afs->malloc == NULL || afs->realloc == NULL || afs->free == NULL)
+ {
+ return NULL;
+ }
+ } else {
+ yajl_set_default_alloc_funcs(&afsBuffer);
+ afs = &afsBuffer;
+ }
+
+ hand = (yajl_handle) YA_MALLOC(afs, sizeof(struct yajl_handle_t));
+
+ /* copy in pointers to allocation routines */
+ memcpy((void *) &(hand->alloc), (void *) afs, sizeof(yajl_alloc_funcs));
+
+ hand->callbacks = callbacks;
+ hand->ctx = ctx;
+ hand->lexer = NULL;
+ hand->bytesConsumed = 0;
+ hand->decodeBuf = yajl_buf_alloc(&(hand->alloc));
+ hand->flags = 0;
+ yajl_bs_init(hand->stateStack, &(hand->alloc));
+ yajl_bs_push(hand->stateStack, yajl_state_start);
+
+ return hand;
+}
+
+int
+yajl_config(yajl_handle h, yajl_option opt, ...)
+{
+ int rv = 1;
+ va_list ap;
+ va_start(ap, opt);
+
+ switch(opt) {
+ case yajl_allow_comments:
+ case yajl_dont_validate_strings:
+ case yajl_allow_trailing_garbage:
+ case yajl_allow_multiple_values:
+ case yajl_allow_partial_values:
+ if (va_arg(ap, int)) h->flags |= opt;
+ else h->flags &= ~opt;
+ break;
+ default:
+ rv = 0;
+ }
+ va_end(ap);
+
+ return rv;
+}
+
+void
+yajl_free(yajl_handle handle)
+{
+ yajl_bs_free(handle->stateStack);
+ yajl_buf_free(handle->decodeBuf);
+ if (handle->lexer) {
+ yajl_lex_free(handle->lexer);
+ handle->lexer = NULL;
+ }
+ YA_FREE(&(handle->alloc), handle);
+}
+
+yajl_status
+yajl_parse(yajl_handle hand, const unsigned char * jsonText,
+ size_t jsonTextLen)
+{
+ yajl_status status;
+
+ /* lazy allocation of the lexer */
+ if (hand->lexer == NULL) {
+ hand->lexer = yajl_lex_alloc(&(hand->alloc),
+ hand->flags & yajl_allow_comments,
+ !(hand->flags & yajl_dont_validate_strings));
+ }
+
+ status = yajl_do_parse(hand, jsonText, jsonTextLen);
+ return status;
+}
+
+
+yajl_status
+yajl_complete_parse(yajl_handle hand)
+{
+ /* The lexer is lazy allocated in the first call to parse. if parse is
+ * never called, then no data was provided to parse at all. This is a
+ * "premature EOF" error unless yajl_allow_partial_values is specified.
+ * allocating the lexer now is the simplest possible way to handle this
+ * case while preserving all the other semantics of the parser
+ * (multiple values, partial values, etc). */
+ if (hand->lexer == NULL) {
+ hand->lexer = yajl_lex_alloc(&(hand->alloc),
+ hand->flags & yajl_allow_comments,
+ !(hand->flags & yajl_dont_validate_strings));
+ }
+
+ return yajl_do_finish(hand);
+}
+
+unsigned char *
+yajl_get_error(yajl_handle hand, int verbose,
+ const unsigned char * jsonText, size_t jsonTextLen)
+{
+ return yajl_render_error_string(hand, jsonText, jsonTextLen, verbose);
+}
+
+size_t
+yajl_get_bytes_consumed(yajl_handle hand)
+{
+ if (!hand) return 0;
+ else return hand->bytesConsumed;
+}
+
+
+void
+yajl_free_error(yajl_handle hand, unsigned char * str)
+{
+ /* use memory allocation functions if set */
+ YA_FREE(&(hand->alloc), str);
+}
+
+/* XXX: add utility routines to parse from file */
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_common.h b/xlators/cluster/nsr-server/src/yajl/yajl_common.h
new file mode 100644
index 000000000..49ca3a5cb
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_common.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_COMMON_H__
+#define __YAJL_COMMON_H__
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define YAJL_MAX_DEPTH 128
+
+/* msft dll export gunk. To build a DLL on windows, you
+ * must define WIN32, YAJL_SHARED, and YAJL_BUILD. To use a shared
+ * DLL, you must define YAJL_SHARED and WIN32 */
+#if defined(WIN32) && defined(YAJL_SHARED)
+# ifdef YAJL_BUILD
+# define YAJL_API __declspec(dllexport)
+# else
+# define YAJL_API __declspec(dllimport)
+# endif
+#else
+# if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303
+# define YAJL_API __attribute__ ((visibility("default")))
+# else
+# define YAJL_API
+# endif
+#endif
+
+/** pointer to a malloc function, supporting client overriding memory
+ * allocation routines */
+typedef void * (*yajl_malloc_func)(void *ctx, size_t sz);
+
+/** pointer to a free function, supporting client overriding memory
+ * allocation routines */
+typedef void (*yajl_free_func)(void *ctx, void * ptr);
+
+/** pointer to a realloc function which can resize an allocation. */
+typedef void * (*yajl_realloc_func)(void *ctx, void * ptr, size_t sz);
+
+/** A structure which can be passed to yajl_*_alloc routines to allow the
+ * client to specify memory allocation functions to be used. */
+typedef struct
+{
+ /** pointer to a function that can allocate uninitialized memory */
+ yajl_malloc_func malloc;
+ /** pointer to a function that can resize memory allocations */
+ yajl_realloc_func realloc;
+ /** pointer to a function that can free memory allocated using
+ * reallocFunction or mallocFunction */
+ yajl_free_func free;
+ /** a context pointer that will be passed to above allocation routines */
+ void * ctx;
+} yajl_alloc_funcs;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_gen.h b/xlators/cluster/nsr-server/src/yajl/yajl_gen.h
new file mode 100644
index 000000000..52fa99fc2
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_gen.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_gen.h
+ * Interface to YAJL's JSON generation facilities.
+ */
+
+#include <yajl/yajl_common.h>
+
+#ifndef __YAJL_GEN_H__
+#define __YAJL_GEN_H__
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ /** generator status codes */
+ typedef enum {
+ /** no error */
+ yajl_gen_status_ok = 0,
+ /** at a point where a map key is generated, a function other than
+ * yajl_gen_string was called */
+ yajl_gen_keys_must_be_strings,
+ /** YAJL's maximum generation depth was exceeded. see
+ * YAJL_MAX_DEPTH */
+ yajl_max_depth_exceeded,
+ /** A generator function (yajl_gen_XXX) was called while in an error
+ * state */
+ yajl_gen_in_error_state,
+ /** A complete JSON document has been generated */
+ yajl_gen_generation_complete,
+ /** yajl_gen_double was passed an invalid floating point value
+ * (infinity or NaN). */
+ yajl_gen_invalid_number,
+ /** A print callback was passed in, so there is no internal
+ * buffer to get from */
+ yajl_gen_no_buf,
+ /** returned from yajl_gen_string() when the yajl_gen_validate_utf8
+ * option is enabled and an invalid was passed by client code.
+ */
+ yajl_gen_invalid_string
+ } yajl_gen_status;
+
+ /** an opaque handle to a generator */
+ typedef struct yajl_gen_t * yajl_gen;
+
+ /** a callback used for "printing" the results. */
+ typedef void (*yajl_print_t)(void * ctx,
+ const char * str,
+ size_t len);
+
+ /** configuration parameters for the parser, these may be passed to
+ * yajl_gen_config() along with option specific argument(s). In general,
+ * all configuration parameters default to *off*. */
+ typedef enum {
+ /** generate indented (beautiful) output */
+ yajl_gen_beautify = 0x01,
+ /**
+ * Set an indent string which is used when yajl_gen_beautify
+ * is enabled. Maybe something like \\t or some number of
+ * spaces. The default is four spaces ' '.
+ */
+ yajl_gen_indent_string = 0x02,
+ /**
+ * Set a function and context argument that should be used to
+ * output generated json. the function should conform to the
+ * yajl_print_t prototype while the context argument is a
+ * void * of your choosing.
+ *
+ * example:
+ * yajl_gen_config(g, yajl_gen_print_callback, myFunc, myVoidPtr);
+ */
+ yajl_gen_print_callback = 0x04,
+ /**
+ * Normally the generator does not validate that strings you
+ * pass to it via yajl_gen_string() are valid UTF8. Enabling
+ * this option will cause it to do so.
+ */
+ yajl_gen_validate_utf8 = 0x08,
+ /**
+ * the forward solidus (slash or '/' in human) is not required to be
+ * escaped in json text. By default, YAJL will not escape it in the
+ * iterest of saving bytes. Setting this flag will cause YAJL to
+ * always escape '/' in generated JSON strings.
+ */
+ yajl_gen_escape_solidus = 0x10
+ } yajl_gen_option;
+
+ /** allow the modification of generator options subsequent to handle
+ * allocation (via yajl_alloc)
+ * \returns zero in case of errors, non-zero otherwise
+ */
+ YAJL_API int yajl_gen_config(yajl_gen g, yajl_gen_option opt, ...);
+
+ /** allocate a generator handle
+ * \param allocFuncs an optional pointer to a structure which allows
+ * the client to overide the memory allocation
+ * used by yajl. May be NULL, in which case
+ * malloc/free/realloc will be used.
+ *
+ * \returns an allocated handle on success, NULL on failure (bad params)
+ */
+ YAJL_API yajl_gen yajl_gen_alloc(const yajl_alloc_funcs * allocFuncs);
+
+ /** free a generator handle */
+ YAJL_API void yajl_gen_free(yajl_gen handle);
+
+ YAJL_API yajl_gen_status yajl_gen_integer(yajl_gen hand, long long int number);
+ /** generate a floating point number. number may not be infinity or
+ * NaN, as these have no representation in JSON. In these cases the
+ * generator will return 'yajl_gen_invalid_number' */
+ YAJL_API yajl_gen_status yajl_gen_double(yajl_gen hand, double number);
+ YAJL_API yajl_gen_status yajl_gen_number(yajl_gen hand,
+ const char * num,
+ size_t len);
+ YAJL_API yajl_gen_status yajl_gen_string(yajl_gen hand,
+ const unsigned char * str,
+ size_t len);
+ YAJL_API yajl_gen_status yajl_gen_null(yajl_gen hand);
+ YAJL_API yajl_gen_status yajl_gen_bool(yajl_gen hand, int boolean);
+ YAJL_API yajl_gen_status yajl_gen_map_open(yajl_gen hand);
+ YAJL_API yajl_gen_status yajl_gen_map_close(yajl_gen hand);
+ YAJL_API yajl_gen_status yajl_gen_array_open(yajl_gen hand);
+ YAJL_API yajl_gen_status yajl_gen_array_close(yajl_gen hand);
+
+ /** access the null terminated generator buffer. If incrementally
+ * outputing JSON, one should call yajl_gen_clear to clear the
+ * buffer. This allows stream generation. */
+ YAJL_API yajl_gen_status yajl_gen_get_buf(yajl_gen hand,
+ const unsigned char ** buf,
+ size_t * len);
+
+ /** clear yajl's output buffer, but maintain all internal generation
+ * state. This function will not "reset" the generator state, and is
+ * intended to enable incremental JSON outputing. */
+ YAJL_API void yajl_gen_clear(yajl_gen hand);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_parse.h b/xlators/cluster/nsr-server/src/yajl/yajl_parse.h
new file mode 100644
index 000000000..55c831101
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_parse.h
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_parse.h
+ * Interface to YAJL's JSON stream parsing facilities.
+ */
+
+#include <yajl/yajl_common.h>
+
+#ifndef __YAJL_PARSE_H__
+#define __YAJL_PARSE_H__
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ /** error codes returned from this interface */
+ typedef enum {
+ /** no error was encountered */
+ yajl_status_ok,
+ /** a client callback returned zero, stopping the parse */
+ yajl_status_client_canceled,
+ /** An error occured during the parse. Call yajl_get_error for
+ * more information about the encountered error */
+ yajl_status_error
+ } yajl_status;
+
+ /** attain a human readable, english, string for an error */
+ YAJL_API const char * yajl_status_to_string(yajl_status code);
+
+ /** an opaque handle to a parser */
+ typedef struct yajl_handle_t * yajl_handle;
+
+ /** yajl is an event driven parser. this means as json elements are
+ * parsed, you are called back to do something with the data. The
+ * functions in this table indicate the various events for which
+ * you will be called back. Each callback accepts a "context"
+ * pointer, this is a void * that is passed into the yajl_parse
+ * function which the client code may use to pass around context.
+ *
+ * All callbacks return an integer. If non-zero, the parse will
+ * continue. If zero, the parse will be canceled and
+ * yajl_status_client_canceled will be returned from the parse.
+ *
+ * \attention {
+ * A note about the handling of numbers:
+ *
+ * yajl will only convert numbers that can be represented in a
+ * double or a 64 bit (long long) int. All other numbers will
+ * be passed to the client in string form using the yajl_number
+ * callback. Furthermore, if yajl_number is not NULL, it will
+ * always be used to return numbers, that is yajl_integer and
+ * yajl_double will be ignored. If yajl_number is NULL but one
+ * of yajl_integer or yajl_double are defined, parsing of a
+ * number larger than is representable in a double or 64 bit
+ * integer will result in a parse error.
+ * }
+ */
+ typedef struct {
+ int (* yajl_null)(void * ctx);
+ int (* yajl_boolean)(void * ctx, int boolVal);
+ int (* yajl_integer)(void * ctx, long long integerVal);
+ int (* yajl_double)(void * ctx, double doubleVal);
+ /** A callback which passes the string representation of the number
+ * back to the client. Will be used for all numbers when present */
+ int (* yajl_number)(void * ctx, const char * numberVal,
+ size_t numberLen);
+
+ /** strings are returned as pointers into the JSON text when,
+ * possible, as a result, they are _not_ null padded */
+ int (* yajl_string)(void * ctx, const unsigned char * stringVal,
+ size_t stringLen);
+
+ int (* yajl_start_map)(void * ctx);
+ int (* yajl_map_key)(void * ctx, const unsigned char * key,
+ size_t stringLen);
+ int (* yajl_end_map)(void * ctx);
+
+ int (* yajl_start_array)(void * ctx);
+ int (* yajl_end_array)(void * ctx);
+ } yajl_callbacks;
+
+ /** allocate a parser handle
+ * \param callbacks a yajl callbacks structure specifying the
+ * functions to call when different JSON entities
+ * are encountered in the input text. May be NULL,
+ * which is only useful for validation.
+ * \param afs memory allocation functions, may be NULL for to use
+ * C runtime library routines (malloc and friends)
+ * \param ctx a context pointer that will be passed to callbacks.
+ */
+ YAJL_API yajl_handle yajl_alloc(const yajl_callbacks * callbacks,
+ yajl_alloc_funcs * afs,
+ void * ctx);
+
+
+ /** configuration parameters for the parser, these may be passed to
+ * yajl_config() along with option specific argument(s). In general,
+ * all configuration parameters default to *off*. */
+ typedef enum {
+ /** Ignore javascript style comments present in
+ * JSON input. Non-standard, but rather fun
+ * arguments: toggled off with integer zero, on otherwise.
+ *
+ * example:
+ * yajl_config(h, yajl_allow_comments, 1); // turn comment support on
+ */
+ yajl_allow_comments = 0x01,
+ /**
+ * When set the parser will verify that all strings in JSON input are
+ * valid UTF8 and will emit a parse error if this is not so. When set,
+ * this option makes parsing slightly more expensive (~7% depending
+ * on processor and compiler in use)
+ *
+ * example:
+ * yajl_config(h, yajl_dont_validate_strings, 1); // disable utf8 checking
+ */
+ yajl_dont_validate_strings = 0x02,
+ /**
+ * By default, upon calls to yajl_complete_parse(), yajl will
+ * ensure the entire input text was consumed and will raise an error
+ * otherwise. Enabling this flag will cause yajl to disable this
+ * check. This can be useful when parsing json out of a that contains more
+ * than a single JSON document.
+ */
+ yajl_allow_trailing_garbage = 0x04,
+ /**
+ * Allow multiple values to be parsed by a single handle. The
+ * entire text must be valid JSON, and values can be seperated
+ * by any kind of whitespace. This flag will change the
+ * behavior of the parser, and cause it continue parsing after
+ * a value is parsed, rather than transitioning into a
+ * complete state. This option can be useful when parsing multiple
+ * values from an input stream.
+ */
+ yajl_allow_multiple_values = 0x08,
+ /**
+ * When yajl_complete_parse() is called the parser will
+ * check that the top level value was completely consumed. I.E.,
+ * if called whilst in the middle of parsing a value
+ * yajl will enter an error state (premature EOF). Setting this
+ * flag suppresses that check and the corresponding error.
+ */
+ yajl_allow_partial_values = 0x10
+ } yajl_option;
+
+ /** allow the modification of parser options subsequent to handle
+ * allocation (via yajl_alloc)
+ * \returns zero in case of errors, non-zero otherwise
+ */
+ YAJL_API int yajl_config(yajl_handle h, yajl_option opt, ...);
+
+ /** free a parser handle */
+ YAJL_API void yajl_free(yajl_handle handle);
+
+ /** Parse some json!
+ * \param hand - a handle to the json parser allocated with yajl_alloc
+ * \param jsonText - a pointer to the UTF8 json text to be parsed
+ * \param jsonTextLength - the length, in bytes, of input text
+ */
+ YAJL_API yajl_status yajl_parse(yajl_handle hand,
+ const unsigned char * jsonText,
+ size_t jsonTextLength);
+
+ /** Parse any remaining buffered json.
+ * Since yajl is a stream-based parser, without an explicit end of
+ * input, yajl sometimes can't decide if content at the end of the
+ * stream is valid or not. For example, if "1" has been fed in,
+ * yajl can't know whether another digit is next or some character
+ * that would terminate the integer token.
+ *
+ * \param hand - a handle to the json parser allocated with yajl_alloc
+ */
+ YAJL_API yajl_status yajl_complete_parse(yajl_handle hand);
+
+ /** get an error string describing the state of the
+ * parse.
+ *
+ * If verbose is non-zero, the message will include the JSON
+ * text where the error occured, along with an arrow pointing to
+ * the specific char.
+ *
+ * \returns A dynamically allocated string will be returned which should
+ * be freed with yajl_free_error
+ */
+ YAJL_API unsigned char * yajl_get_error(yajl_handle hand, int verbose,
+ const unsigned char * jsonText,
+ size_t jsonTextLength);
+
+ /**
+ * get the amount of data consumed from the last chunk passed to YAJL.
+ *
+ * In the case of a successful parse this can help you understand if
+ * the entire buffer was consumed (which will allow you to handle
+ * "junk at end of input").
+ *
+ * In the event an error is encountered during parsing, this function
+ * affords the client a way to get the offset into the most recent
+ * chunk where the error occured. 0 will be returned if no error
+ * was encountered.
+ */
+ YAJL_API size_t yajl_get_bytes_consumed(yajl_handle hand);
+
+ /** free an error returned from yajl_get_error */
+ YAJL_API void yajl_free_error(yajl_handle hand, unsigned char * str);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_tree.h b/xlators/cluster/nsr-server/src/yajl/yajl_tree.h
new file mode 100644
index 000000000..8b377f636
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_tree.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2010-2011 Florian Forster <ff at octo.it>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_tree.h
+ *
+ * Parses JSON data and returns the data in tree form.
+ *
+ * \author Florian Forster
+ * \date August 2010
+ *
+ * This interface makes quick parsing and extraction of
+ * smallish JSON docs trivial:
+ *
+ * \include example/parse_config.c
+ */
+
+#ifndef YAJL_TREE_H
+#define YAJL_TREE_H 1
+
+#include <yajl/yajl_common.h>
+
+/** possible data types that a yajl_val_s can hold */
+typedef enum {
+ yajl_t_string = 1,
+ yajl_t_number = 2,
+ yajl_t_object = 3,
+ yajl_t_array = 4,
+ yajl_t_true = 5,
+ yajl_t_false = 6,
+ yajl_t_null = 7,
+ /** The any type isn't valid for yajl_val_s.type, but can be
+ * used as an argument to routines like yajl_tree_get().
+ */
+ yajl_t_any = 8
+} yajl_type;
+
+#define YAJL_NUMBER_INT_VALID 0x01
+#define YAJL_NUMBER_DOUBLE_VALID 0x02
+
+/** A pointer to a node in the parse tree */
+typedef struct yajl_val_s * yajl_val;
+
+/**
+ * A JSON value representation capable of holding one of the seven
+ * types above. For "string", "number", "object", and "array"
+ * additional data is available in the union. The "YAJL_IS_*"
+ * and "YAJL_GET_*" macros below allow type checking and convenient
+ * value extraction.
+ */
+struct yajl_val_s
+{
+ /** Type of the value contained. Use the "YAJL_IS_*" macors to check for a
+ * specific type. */
+ yajl_type type;
+ /** Type-specific data. You may use the "YAJL_GET_*" macros to access these
+ * members. */
+ union
+ {
+ char * string;
+ struct {
+ long long i; /*< integer value, if representable. */
+ double d; /*< double value, if representable. */
+ /** Signals whether the \em i and \em d members are
+ * valid. See \c YAJL_NUMBER_INT_VALID and
+ * \c YAJL_NUMBER_DOUBLE_VALID. */
+ char *r; /*< unparsed number in string form. */
+ unsigned int flags;
+ } number;
+ struct {
+ const char **keys; /*< Array of keys */
+ yajl_val *values; /*< Array of values. */
+ size_t len; /*< Number of key-value-pairs. */
+ } object;
+ struct {
+ yajl_val *values; /*< Array of elements. */
+ size_t len; /*< Number of elements. */
+ } array;
+ } u;
+};
+
+/**
+ * Parse a string.
+ *
+ * Parses an null-terminated string containing JSON data and returns a pointer
+ * to the top-level value (root of the parse tree).
+ *
+ * \param input Pointer to a null-terminated utf8 string containing
+ * JSON data.
+ * \param error_buffer Pointer to a buffer in which an error message will
+ * be stored if \em yajl_tree_parse fails, or
+ * \c NULL. The buffer will be initialized before
+ * parsing, so its content will be destroyed even if
+ * \em yajl_tree_parse succeeds.
+ * \param error_buffer_size Size of the memory area pointed to by
+ * \em error_buffer_size. If \em error_buffer_size is
+ * \c NULL, this argument is ignored.
+ *
+ * \returns Pointer to the top-level value or \c NULL on error. The memory
+ * pointed to must be freed using \em yajl_tree_free. In case of an error, a
+ * null terminated message describing the error in more detail is stored in
+ * \em error_buffer if it is not \c NULL.
+ */
+YAJL_API yajl_val yajl_tree_parse (const char *input,
+ char *error_buffer, size_t error_buffer_size);
+
+/**
+ * Free a parse tree returned by "yajl_tree_parse".
+ *
+ * \param v Pointer to a JSON value returned by "yajl_tree_parse". Passing NULL
+ * is valid and results in a no-op.
+ */
+YAJL_API void yajl_tree_free (yajl_val v);
+
+/**
+ * Access a nested value inside a tree.
+ *
+ * \param parent the node under which you'd like to extract values.
+ * \param path A null terminated array of strings, each the name of an object key
+ * \param type the yajl_type of the object you seek, or yajl_t_any if any will do.
+ *
+ * \returns a pointer to the found value, or NULL if we came up empty.
+ *
+ * Future Ideas: it'd be nice to move path to a string and implement support for
+ * a teeny tiny micro language here, so you can extract array elements, do things
+ * like .first and .last, even .length. Inspiration from JSONPath and css selectors?
+ * No it wouldn't be fast, but that's not what this API is about.
+ */
+YAJL_API yajl_val yajl_tree_get(yajl_val parent, const char ** path, yajl_type type);
+
+/* Various convenience macros to check the type of a `yajl_val` */
+#define YAJL_IS_STRING(v) (((v) != NULL) && ((v)->type == yajl_t_string))
+#define YAJL_IS_NUMBER(v) (((v) != NULL) && ((v)->type == yajl_t_number))
+#define YAJL_IS_INTEGER(v) (YAJL_IS_NUMBER(v) && ((v)->u.flags & YAJL_NUMBER_INT_VALID))
+#define YAJL_IS_DOUBLE(v) (YAJL_IS_NUMBER(v) && ((v)->u.flags & YAJL_NUMBER_DOUBLE_VALID))
+#define YAJL_IS_OBJECT(v) (((v) != NULL) && ((v)->type == yajl_t_object))
+#define YAJL_IS_ARRAY(v) (((v) != NULL) && ((v)->type == yajl_t_array ))
+#define YAJL_IS_TRUE(v) (((v) != NULL) && ((v)->type == yajl_t_true ))
+#define YAJL_IS_FALSE(v) (((v) != NULL) && ((v)->type == yajl_t_false ))
+#define YAJL_IS_NULL(v) (((v) != NULL) && ((v)->type == yajl_t_null ))
+
+/** Given a yajl_val_string return a ptr to the bare string it contains,
+ * or NULL if the value is not a string. */
+#define YAJL_GET_STRING(v) (YAJL_IS_STRING(v) ? (v)->u.string : NULL)
+
+/** Get the string representation of a number. You should check type first,
+ * perhaps using YAJL_IS_NUMBER */
+#define YAJL_GET_NUMBER(v) ((v)->u.number.r)
+
+/** Get the double representation of a number. You should check type first,
+ * perhaps using YAJL_IS_DOUBLE */
+#define YAJL_GET_DOUBLE(v) ((v)->u.number.d)
+
+/** Get the 64bit (long long) integer representation of a number. You should
+ * check type first, perhaps using YAJL_IS_INTEGER */
+#define YAJL_GET_INTEGER(v) ((v)->u.number.i)
+
+/** Get a pointer to a yajl_val_object or NULL if the value is not an object. */
+#define YAJL_GET_OBJECT(v) (YAJL_IS_OBJECT(v) ? &(v)->u.object : NULL)
+
+/** Get a pointer to a yajl_val_array or NULL if the value is not an object. */
+#define YAJL_GET_ARRAY(v) (YAJL_IS_ARRAY(v) ? &(v)->u.array : NULL)
+
+#endif /* YAJL_TREE_H */
diff --git a/xlators/cluster/nsr-server/src/yajl/yajl_version.h b/xlators/cluster/nsr-server/src/yajl/yajl_version.h
new file mode 100644
index 000000000..0fba9b8fc
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl/yajl_version.h
@@ -0,0 +1,23 @@
+#ifndef YAJL_VERSION_H_
+#define YAJL_VERSION_H_
+
+#include <yajl/yajl_common.h>
+
+#define YAJL_MAJOR 2
+#define YAJL_MINOR 0
+#define YAJL_MICRO 1
+
+#define YAJL_VERSION ((YAJL_MAJOR * 10000) + (YAJL_MINOR * 100) + YAJL_MICRO)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int YAJL_API yajl_version(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* YAJL_VERSION_H_ */
+
diff --git a/xlators/cluster/nsr-server/src/yajl_alloc.c b/xlators/cluster/nsr-server/src/yajl_alloc.c
new file mode 100644
index 000000000..276315af7
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_alloc.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_alloc.h
+ * default memory allocation routines for yajl which use malloc/realloc and
+ * free
+ */
+
+#include "yajl_alloc.h"
+#include <stdlib.h>
+
+static void * yajl_internal_malloc(void *ctx, size_t sz)
+{
+ return malloc(sz);
+}
+
+static void * yajl_internal_realloc(void *ctx, void * previous,
+ size_t sz)
+{
+ return realloc(previous, sz);
+}
+
+static void yajl_internal_free(void *ctx, void * ptr)
+{
+ free(ptr);
+}
+
+void yajl_set_default_alloc_funcs(yajl_alloc_funcs * yaf)
+{
+ yaf->malloc = yajl_internal_malloc;
+ yaf->free = yajl_internal_free;
+ yaf->realloc = yajl_internal_realloc;
+ yaf->ctx = NULL;
+}
+
diff --git a/xlators/cluster/nsr-server/src/yajl_alloc.h b/xlators/cluster/nsr-server/src/yajl_alloc.h
new file mode 100644
index 000000000..a8a9e45e6
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_alloc.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/**
+ * \file yajl_alloc.h
+ * default memory allocation routines for yajl which use malloc/realloc and
+ * free
+ */
+
+#ifndef __YAJL_ALLOC_H__
+#define __YAJL_ALLOC_H__
+
+#include "yajl/yajl_common.h"
+
+#define YA_MALLOC(afs, sz) (afs)->malloc((afs)->ctx, (sz))
+#define YA_FREE(afs, ptr) (afs)->free((afs)->ctx, (ptr))
+#define YA_REALLOC(afs, ptr, sz) (afs)->realloc((afs)->ctx, (ptr), (sz))
+
+void yajl_set_default_alloc_funcs(yajl_alloc_funcs * yaf);
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_buf.c b/xlators/cluster/nsr-server/src/yajl_buf.c
new file mode 100644
index 000000000..0d249d364
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_buf.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl_buf.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define YAJL_BUF_INIT_SIZE 2048
+
+struct yajl_buf_t {
+ size_t len;
+ size_t used;
+ unsigned char * data;
+ yajl_alloc_funcs * alloc;
+};
+
+static
+void yajl_buf_ensure_available(yajl_buf buf, size_t want)
+{
+ size_t need;
+
+ assert(buf != NULL);
+
+ /* first call */
+ if (buf->data == NULL) {
+ buf->len = YAJL_BUF_INIT_SIZE;
+ buf->data = (unsigned char *) YA_MALLOC(buf->alloc, buf->len);
+ buf->data[0] = 0;
+ }
+
+ need = buf->len;
+
+ while (want >= (need - buf->used)) need <<= 1;
+
+ if (need != buf->len) {
+ buf->data = (unsigned char *) YA_REALLOC(buf->alloc, buf->data, need);
+ buf->len = need;
+ }
+}
+
+yajl_buf yajl_buf_alloc(yajl_alloc_funcs * alloc)
+{
+ yajl_buf b = YA_MALLOC(alloc, sizeof(struct yajl_buf_t));
+ memset((void *) b, 0, sizeof(struct yajl_buf_t));
+ b->alloc = alloc;
+ return b;
+}
+
+void yajl_buf_free(yajl_buf buf)
+{
+ assert(buf != NULL);
+ if (buf->data) YA_FREE(buf->alloc, buf->data);
+ YA_FREE(buf->alloc, buf);
+}
+
+void yajl_buf_append(yajl_buf buf, const void * data, size_t len)
+{
+ yajl_buf_ensure_available(buf, len);
+ if (len > 0) {
+ assert(data != NULL);
+ memcpy(buf->data + buf->used, data, len);
+ buf->used += len;
+ buf->data[buf->used] = 0;
+ }
+}
+
+void yajl_buf_clear(yajl_buf buf)
+{
+ buf->used = 0;
+ if (buf->data) buf->data[buf->used] = 0;
+}
+
+const unsigned char * yajl_buf_data(yajl_buf buf)
+{
+ return buf->data;
+}
+
+size_t yajl_buf_len(yajl_buf buf)
+{
+ return buf->used;
+}
+
+void
+yajl_buf_truncate(yajl_buf buf, size_t len)
+{
+ assert(len <= buf->used);
+ buf->used = len;
+}
diff --git a/xlators/cluster/nsr-server/src/yajl_buf.h b/xlators/cluster/nsr-server/src/yajl_buf.h
new file mode 100644
index 000000000..94929a519
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_buf.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_BUF_H__
+#define __YAJL_BUF_H__
+
+#include "yajl/yajl_common.h"
+#include "yajl_alloc.h"
+
+/*
+ * Implementation/performance notes. If this were moved to a header
+ * only implementation using #define's where possible we might be
+ * able to sqeeze a little performance out of the guy by killing function
+ * call overhead. YMMV.
+ */
+
+/**
+ * yajl_buf is a buffer with exponential growth. the buffer ensures that
+ * you are always null padded.
+ */
+typedef struct yajl_buf_t * yajl_buf;
+
+/* allocate a new buffer */
+yajl_buf yajl_buf_alloc(yajl_alloc_funcs * alloc);
+
+/* free the buffer */
+void yajl_buf_free(yajl_buf buf);
+
+/* append a number of bytes to the buffer */
+void yajl_buf_append(yajl_buf buf, const void * data, size_t len);
+
+/* empty the buffer */
+void yajl_buf_clear(yajl_buf buf);
+
+/* get a pointer to the beginning of the buffer */
+const unsigned char * yajl_buf_data(yajl_buf buf);
+
+/* get the length of the buffer */
+size_t yajl_buf_len(yajl_buf buf);
+
+/* truncate the buffer */
+void yajl_buf_truncate(yajl_buf buf, size_t len);
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_bytestack.h b/xlators/cluster/nsr-server/src/yajl_bytestack.h
new file mode 100644
index 000000000..1fc50c470
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_bytestack.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * A header only implementation of a simple stack of bytes, used in YAJL
+ * to maintain parse state.
+ */
+
+#ifndef __YAJL_BYTESTACK_H__
+#define __YAJL_BYTESTACK_H__
+
+#include "yajl/yajl_common.h"
+
+#define YAJL_BS_INC 128
+
+typedef struct yajl_bytestack_t
+{
+ unsigned char * stack;
+ size_t size;
+ size_t used;
+ yajl_alloc_funcs * yaf;
+} yajl_bytestack;
+
+/* initialize a bytestack */
+#define yajl_bs_init(obs, _yaf) { \
+ (obs).stack = NULL; \
+ (obs).size = 0; \
+ (obs).used = 0; \
+ (obs).yaf = (_yaf); \
+ } \
+
+
+/* initialize a bytestack */
+#define yajl_bs_free(obs) \
+ if ((obs).stack) (obs).yaf->free((obs).yaf->ctx, (obs).stack);
+
+#define yajl_bs_current(obs) \
+ (assert((obs).used > 0), (obs).stack[(obs).used - 1])
+
+#define yajl_bs_push(obs, byte) { \
+ if (((obs).size - (obs).used) == 0) { \
+ (obs).size += YAJL_BS_INC; \
+ (obs).stack = (obs).yaf->realloc((obs).yaf->ctx,\
+ (void *) (obs).stack, (obs).size);\
+ } \
+ (obs).stack[((obs).used)++] = (byte); \
+}
+
+/* removes the top item of the stack, returns nothing */
+#define yajl_bs_pop(obs) { ((obs).used)--; }
+
+#define yajl_bs_set(obs, byte) \
+ (obs).stack[((obs).used) - 1] = (byte);
+
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_encode.c b/xlators/cluster/nsr-server/src/yajl_encode.c
new file mode 100644
index 000000000..9dc9a3e81
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_encode.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl_encode.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+static void CharToHex(unsigned char c, char * hexBuf)
+{
+ const char * hexchar = "0123456789ABCDEF";
+ hexBuf[0] = hexchar[c >> 4];
+ hexBuf[1] = hexchar[c & 0x0F];
+}
+
+void
+yajl_string_encode(const yajl_print_t print,
+ void * ctx,
+ const unsigned char * str,
+ size_t len,
+ int escape_solidus)
+{
+ size_t beg = 0;
+ size_t end = 0;
+ char hexBuf[7];
+ hexBuf[0] = '\\'; hexBuf[1] = 'u'; hexBuf[2] = '0'; hexBuf[3] = '0';
+ hexBuf[6] = 0;
+
+ while (end < len) {
+ const char * escaped = NULL;
+ switch (str[end]) {
+ case '\r': escaped = "\\r"; break;
+ case '\n': escaped = "\\n"; break;
+ case '\\': escaped = "\\\\"; break;
+ /* it is not required to escape a solidus in JSON:
+ * read sec. 2.5: http://www.ietf.org/rfc/rfc4627.txt
+ * specifically, this production from the grammar:
+ * unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
+ */
+ case '/': if (escape_solidus) escaped = "\\/"; break;
+ case '"': escaped = "\\\""; break;
+ case '\f': escaped = "\\f"; break;
+ case '\b': escaped = "\\b"; break;
+ case '\t': escaped = "\\t"; break;
+ default:
+ if ((unsigned char) str[end] < 32) {
+ CharToHex(str[end], hexBuf + 4);
+ escaped = hexBuf;
+ }
+ break;
+ }
+ if (escaped != NULL) {
+ print(ctx, (const char *) (str + beg), end - beg);
+ print(ctx, escaped, (unsigned int)strlen(escaped));
+ beg = ++end;
+ } else {
+ ++end;
+ }
+ }
+ print(ctx, (const char *) (str + beg), end - beg);
+}
+
+static void hexToDigit(unsigned int * val, const unsigned char * hex)
+{
+ unsigned int i;
+ for (i=0;i<4;i++) {
+ unsigned char c = hex[i];
+ if (c >= 'A') c = (c & ~0x20) - 7;
+ c -= '0';
+ assert(!(c & 0xF0));
+ *val = (*val << 4) | c;
+ }
+}
+
+static void Utf32toUtf8(unsigned int codepoint, char * utf8Buf)
+{
+ if (codepoint < 0x80) {
+ utf8Buf[0] = (char) codepoint;
+ utf8Buf[1] = 0;
+ } else if (codepoint < 0x0800) {
+ utf8Buf[0] = (char) ((codepoint >> 6) | 0xC0);
+ utf8Buf[1] = (char) ((codepoint & 0x3F) | 0x80);
+ utf8Buf[2] = 0;
+ } else if (codepoint < 0x10000) {
+ utf8Buf[0] = (char) ((codepoint >> 12) | 0xE0);
+ utf8Buf[1] = (char) (((codepoint >> 6) & 0x3F) | 0x80);
+ utf8Buf[2] = (char) ((codepoint & 0x3F) | 0x80);
+ utf8Buf[3] = 0;
+ } else if (codepoint < 0x200000) {
+ utf8Buf[0] =(char)((codepoint >> 18) | 0xF0);
+ utf8Buf[1] =(char)(((codepoint >> 12) & 0x3F) | 0x80);
+ utf8Buf[2] =(char)(((codepoint >> 6) & 0x3F) | 0x80);
+ utf8Buf[3] =(char)((codepoint & 0x3F) | 0x80);
+ utf8Buf[4] = 0;
+ } else {
+ utf8Buf[0] = '?';
+ utf8Buf[1] = 0;
+ }
+}
+
+void yajl_string_decode(yajl_buf buf, const unsigned char * str,
+ size_t len)
+{
+ size_t beg = 0;
+ size_t end = 0;
+
+ while (end < len) {
+ if (str[end] == '\\') {
+ char utf8Buf[5];
+ const char * unescaped = "?";
+ yajl_buf_append(buf, str + beg, end - beg);
+ switch (str[++end]) {
+ case 'r': unescaped = "\r"; break;
+ case 'n': unescaped = "\n"; break;
+ case '\\': unescaped = "\\"; break;
+ case '/': unescaped = "/"; break;
+ case '"': unescaped = "\""; break;
+ case 'f': unescaped = "\f"; break;
+ case 'b': unescaped = "\b"; break;
+ case 't': unescaped = "\t"; break;
+ case 'u': {
+ unsigned int codepoint = 0;
+ hexToDigit(&codepoint, str + ++end);
+ end+=3;
+ /* check if this is a surrogate */
+ if ((codepoint & 0xFC00) == 0xD800) {
+ end++;
+ if (str[end] == '\\' && str[end + 1] == 'u') {
+ unsigned int surrogate = 0;
+ hexToDigit(&surrogate, str + end + 2);
+ codepoint =
+ (((codepoint & 0x3F) << 10) |
+ ((((codepoint >> 6) & 0xF) + 1) << 16) |
+ (surrogate & 0x3FF));
+ end += 5;
+ } else {
+ unescaped = "?";
+ break;
+ }
+ }
+
+ Utf32toUtf8(codepoint, utf8Buf);
+ unescaped = utf8Buf;
+
+ if (codepoint == 0) {
+ yajl_buf_append(buf, unescaped, 1);
+ beg = ++end;
+ continue;
+ }
+
+ break;
+ }
+ default:
+ assert("this should never happen" == NULL);
+ }
+ yajl_buf_append(buf, unescaped, (unsigned int)strlen(unescaped));
+ beg = ++end;
+ } else {
+ end++;
+ }
+ }
+ yajl_buf_append(buf, str + beg, end - beg);
+}
+
+#define ADV_PTR s++; if (!(len--)) return 0;
+
+int yajl_string_validate_utf8(const unsigned char * s, size_t len)
+{
+ if (!len) return 1;
+ if (!s) return 0;
+
+ while (len--) {
+ /* single byte */
+ if (*s <= 0x7f) {
+ /* noop */
+ }
+ /* two byte */
+ else if ((*s >> 5) == 0x6) {
+ ADV_PTR;
+ if (!((*s >> 6) == 0x2)) return 0;
+ }
+ /* three byte */
+ else if ((*s >> 4) == 0x0e) {
+ ADV_PTR;
+ if (!((*s >> 6) == 0x2)) return 0;
+ ADV_PTR;
+ if (!((*s >> 6) == 0x2)) return 0;
+ }
+ /* four byte */
+ else if ((*s >> 3) == 0x1e) {
+ ADV_PTR;
+ if (!((*s >> 6) == 0x2)) return 0;
+ ADV_PTR;
+ if (!((*s >> 6) == 0x2)) return 0;
+ ADV_PTR;
+ if (!((*s >> 6) == 0x2)) return 0;
+ } else {
+ return 0;
+ }
+
+ s++;
+ }
+
+ return 1;
+}
diff --git a/xlators/cluster/nsr-server/src/yajl_encode.h b/xlators/cluster/nsr-server/src/yajl_encode.h
new file mode 100644
index 000000000..af1e8bbde
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_encode.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_ENCODE_H__
+#define __YAJL_ENCODE_H__
+
+#include "yajl_buf.h"
+#include "yajl/yajl_gen.h"
+
+void yajl_string_encode(const yajl_print_t printer,
+ void * ctx,
+ const unsigned char * str,
+ size_t length,
+ int escape_solidus);
+
+void yajl_string_decode(yajl_buf buf, const unsigned char * str,
+ size_t length);
+
+int yajl_string_validate_utf8(const unsigned char * s, size_t len);
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_gen.c b/xlators/cluster/nsr-server/src/yajl_gen.c
new file mode 100644
index 000000000..73763a9e0
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_gen.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl/yajl_gen.h"
+#include "yajl_buf.h"
+#include "yajl_encode.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+#include <stdarg.h>
+
+typedef enum {
+ yajl_gen_start,
+ yajl_gen_map_start,
+ yajl_gen_map_key,
+ yajl_gen_map_val,
+ yajl_gen_array_start,
+ yajl_gen_in_array,
+ yajl_gen_complete,
+ yajl_gen_error
+} yajl_gen_state;
+
+struct yajl_gen_t
+{
+ unsigned int flags;
+ unsigned int depth;
+ const char * indentString;
+ yajl_gen_state state[YAJL_MAX_DEPTH];
+ yajl_print_t print;
+ void * ctx; /* yajl_buf */
+ /* memory allocation routines */
+ yajl_alloc_funcs alloc;
+};
+
+int
+yajl_gen_config(yajl_gen g, yajl_gen_option opt, ...)
+{
+ int rv = 1;
+ va_list ap;
+ va_start(ap, opt);
+
+ switch(opt) {
+ case yajl_gen_beautify:
+ case yajl_gen_validate_utf8:
+ if (va_arg(ap, int)) g->flags |= opt;
+ else g->flags &= ~opt;
+ break;
+ case yajl_gen_indent_string: {
+ const char *indent = va_arg(ap, const char *);
+ g->indentString = indent;
+ for (; *indent; indent++) {
+ if (*indent != '\n'
+ && *indent != '\v'
+ && *indent != '\f'
+ && *indent != '\t'
+ && *indent != '\r'
+ && *indent != ' ')
+ {
+ g->indentString = NULL;
+ rv = 0;
+ }
+ }
+ break;
+ }
+ case yajl_gen_print_callback:
+ yajl_buf_free(g->ctx);
+ g->print = va_arg(ap, const yajl_print_t);
+ g->ctx = va_arg(ap, void *);
+ break;
+ default:
+ rv = 0;
+ }
+
+ va_end(ap);
+
+ return rv;
+}
+
+
+
+yajl_gen
+yajl_gen_alloc(const yajl_alloc_funcs * afs)
+{
+ yajl_gen g = NULL;
+ yajl_alloc_funcs afsBuffer;
+
+ /* first order of business is to set up memory allocation routines */
+ if (afs != NULL) {
+ if (afs->malloc == NULL || afs->realloc == NULL || afs->free == NULL)
+ {
+ return NULL;
+ }
+ } else {
+ yajl_set_default_alloc_funcs(&afsBuffer);
+ afs = &afsBuffer;
+ }
+
+ g = (yajl_gen) YA_MALLOC(afs, sizeof(struct yajl_gen_t));
+ if (!g) return NULL;
+
+ memset((void *) g, 0, sizeof(struct yajl_gen_t));
+ /* copy in pointers to allocation routines */
+ memcpy((void *) &(g->alloc), (void *) afs, sizeof(yajl_alloc_funcs));
+
+ g->print = (yajl_print_t)&yajl_buf_append;
+ g->ctx = yajl_buf_alloc(&(g->alloc));
+ g->indentString = " ";
+
+ return g;
+}
+
+void
+yajl_gen_free(yajl_gen g)
+{
+ if (g->print == (yajl_print_t)&yajl_buf_append) yajl_buf_free((yajl_buf)g->ctx);
+ YA_FREE(&(g->alloc), g);
+}
+
+#define INSERT_SEP \
+ if (g->state[g->depth] == yajl_gen_map_key || \
+ g->state[g->depth] == yajl_gen_in_array) { \
+ g->print(g->ctx, ",", 1); \
+ if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1); \
+ } else if (g->state[g->depth] == yajl_gen_map_val) { \
+ g->print(g->ctx, ":", 1); \
+ if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, " ", 1); \
+ }
+
+#define INSERT_WHITESPACE \
+ if ((g->flags & yajl_gen_beautify)) { \
+ if (g->state[g->depth] != yajl_gen_map_val) { \
+ unsigned int _i; \
+ for (_i=0;_i<g->depth;_i++) \
+ g->print(g->ctx, \
+ g->indentString, \
+ (unsigned int)strlen(g->indentString)); \
+ } \
+ }
+
+#define ENSURE_NOT_KEY \
+ if (g->state[g->depth] == yajl_gen_map_key || \
+ g->state[g->depth] == yajl_gen_map_start) { \
+ return yajl_gen_keys_must_be_strings; \
+ } \
+
+/* check that we're not complete, or in error state. in a valid state
+ * to be generating */
+#define ENSURE_VALID_STATE \
+ if (g->state[g->depth] == yajl_gen_error) { \
+ return yajl_gen_in_error_state;\
+ } else if (g->state[g->depth] == yajl_gen_complete) { \
+ return yajl_gen_generation_complete; \
+ }
+
+#define INCREMENT_DEPTH \
+ if (++(g->depth) >= YAJL_MAX_DEPTH) return yajl_max_depth_exceeded;
+
+#define DECREMENT_DEPTH \
+ if (--(g->depth) >= YAJL_MAX_DEPTH) return yajl_gen_error;
+
+#define APPENDED_ATOM \
+ switch (g->state[g->depth]) { \
+ case yajl_gen_start: \
+ g->state[g->depth] = yajl_gen_complete; \
+ break; \
+ case yajl_gen_map_start: \
+ case yajl_gen_map_key: \
+ g->state[g->depth] = yajl_gen_map_val; \
+ break; \
+ case yajl_gen_array_start: \
+ g->state[g->depth] = yajl_gen_in_array; \
+ break; \
+ case yajl_gen_map_val: \
+ g->state[g->depth] = yajl_gen_map_key; \
+ break; \
+ default: \
+ break; \
+ } \
+
+#define FINAL_NEWLINE \
+ if ((g->flags & yajl_gen_beautify) && g->state[g->depth] == yajl_gen_complete) \
+ g->print(g->ctx, "\n", 1);
+
+yajl_gen_status
+yajl_gen_integer(yajl_gen g, long long int number)
+{
+ char i[32];
+ ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE;
+ sprintf(i, "%lld", number);
+ g->print(g->ctx, i, (unsigned int)strlen(i));
+ APPENDED_ATOM;
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+#ifdef WIN32
+#include <float.h>
+#define isnan _isnan
+#define isinf !_finite
+#endif
+
+yajl_gen_status
+yajl_gen_double(yajl_gen g, double number)
+{
+ char i[32];
+ ENSURE_VALID_STATE; ENSURE_NOT_KEY;
+ if (isnan(number) || isinf(number)) return yajl_gen_invalid_number;
+ INSERT_SEP; INSERT_WHITESPACE;
+ sprintf(i, "%.20g", number);
+ g->print(g->ctx, i, (unsigned int)strlen(i));
+ APPENDED_ATOM;
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+yajl_gen_status
+yajl_gen_number(yajl_gen g, const char * s, size_t l)
+{
+ ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE;
+ g->print(g->ctx, s, l);
+ APPENDED_ATOM;
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+yajl_gen_status
+yajl_gen_string(yajl_gen g, const unsigned char * str,
+ size_t len)
+{
+ // if validation is enabled, check that the string is valid utf8
+ // XXX: This checking could be done a little faster, in the same pass as
+ // the string encoding
+ if (g->flags & yajl_gen_validate_utf8) {
+ if (!yajl_string_validate_utf8(str, len)) {
+ return yajl_gen_invalid_string;
+ }
+ }
+ ENSURE_VALID_STATE; INSERT_SEP; INSERT_WHITESPACE;
+ g->print(g->ctx, "\"", 1);
+ yajl_string_encode(g->print, g->ctx, str, len, g->flags & yajl_gen_escape_solidus);
+ g->print(g->ctx, "\"", 1);
+ APPENDED_ATOM;
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+yajl_gen_status
+yajl_gen_null(yajl_gen g)
+{
+ ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE;
+ g->print(g->ctx, "null", strlen("null"));
+ APPENDED_ATOM;
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+yajl_gen_status
+yajl_gen_bool(yajl_gen g, int boolean)
+{
+ const char * val = boolean ? "true" : "false";
+
+ ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE;
+ g->print(g->ctx, val, (unsigned int)strlen(val));
+ APPENDED_ATOM;
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+yajl_gen_status
+yajl_gen_map_open(yajl_gen g)
+{
+ ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE;
+ INCREMENT_DEPTH;
+
+ g->state[g->depth] = yajl_gen_map_start;
+ g->print(g->ctx, "{", 1);
+ if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1);
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+yajl_gen_status
+yajl_gen_map_close(yajl_gen g)
+{
+ ENSURE_VALID_STATE;
+ DECREMENT_DEPTH;
+
+ if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1);
+ APPENDED_ATOM;
+ INSERT_WHITESPACE;
+ g->print(g->ctx, "}", 1);
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+yajl_gen_status
+yajl_gen_array_open(yajl_gen g)
+{
+ ENSURE_VALID_STATE; ENSURE_NOT_KEY; INSERT_SEP; INSERT_WHITESPACE;
+ INCREMENT_DEPTH;
+ g->state[g->depth] = yajl_gen_array_start;
+ g->print(g->ctx, "[", 1);
+ if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1);
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+yajl_gen_status
+yajl_gen_array_close(yajl_gen g)
+{
+ ENSURE_VALID_STATE;
+ DECREMENT_DEPTH;
+ if ((g->flags & yajl_gen_beautify)) g->print(g->ctx, "\n", 1);
+ APPENDED_ATOM;
+ INSERT_WHITESPACE;
+ g->print(g->ctx, "]", 1);
+ FINAL_NEWLINE;
+ return yajl_gen_status_ok;
+}
+
+yajl_gen_status
+yajl_gen_get_buf(yajl_gen g, const unsigned char ** buf,
+ size_t * len)
+{
+ if (g->print != (yajl_print_t)&yajl_buf_append) return yajl_gen_no_buf;
+ *buf = yajl_buf_data((yajl_buf)g->ctx);
+ *len = yajl_buf_len((yajl_buf)g->ctx);
+ return yajl_gen_status_ok;
+}
+
+void
+yajl_gen_clear(yajl_gen g)
+{
+ if (g->print == (yajl_print_t)&yajl_buf_append) yajl_buf_clear((yajl_buf)g->ctx);
+}
diff --git a/xlators/cluster/nsr-server/src/yajl_lex.c b/xlators/cluster/nsr-server/src/yajl_lex.c
new file mode 100644
index 000000000..b098e6a99
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_lex.c
@@ -0,0 +1,763 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl_lex.h"
+#include "yajl_buf.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+
+#ifdef YAJL_LEXER_DEBUG
+static const char *
+tokToStr(yajl_tok tok)
+{
+ switch (tok) {
+ case yajl_tok_bool: return "bool";
+ case yajl_tok_colon: return "colon";
+ case yajl_tok_comma: return "comma";
+ case yajl_tok_eof: return "eof";
+ case yajl_tok_error: return "error";
+ case yajl_tok_left_brace: return "brace";
+ case yajl_tok_left_bracket: return "bracket";
+ case yajl_tok_null: return "null";
+ case yajl_tok_integer: return "integer";
+ case yajl_tok_double: return "double";
+ case yajl_tok_right_brace: return "brace";
+ case yajl_tok_right_bracket: return "bracket";
+ case yajl_tok_string: return "string";
+ case yajl_tok_string_with_escapes: return "string_with_escapes";
+ }
+ return "unknown";
+}
+#endif
+
+/* Impact of the stream parsing feature on the lexer:
+ *
+ * YAJL support stream parsing. That is, the ability to parse the first
+ * bits of a chunk of JSON before the last bits are available (still on
+ * the network or disk). This makes the lexer more complex. The
+ * responsibility of the lexer is to handle transparently the case where
+ * a chunk boundary falls in the middle of a token. This is
+ * accomplished is via a buffer and a character reading abstraction.
+ *
+ * Overview of implementation
+ *
+ * When we lex to end of input string before end of token is hit, we
+ * copy all of the input text composing the token into our lexBuf.
+ *
+ * Every time we read a character, we do so through the readChar function.
+ * readChar's responsibility is to handle pulling all chars from the buffer
+ * before pulling chars from input text
+ */
+
+struct yajl_lexer_t {
+ /* the overal line and char offset into the data */
+ size_t lineOff;
+ size_t charOff;
+
+ /* error */
+ yajl_lex_error error;
+
+ /* a input buffer to handle the case where a token is spread over
+ * multiple chunks */
+ yajl_buf buf;
+
+ /* in the case where we have data in the lexBuf, bufOff holds
+ * the current offset into the lexBuf. */
+ size_t bufOff;
+
+ /* are we using the lex buf? */
+ unsigned int bufInUse;
+
+ /* shall we allow comments? */
+ unsigned int allowComments;
+
+ /* shall we validate utf8 inside strings? */
+ unsigned int validateUTF8;
+
+ yajl_alloc_funcs * alloc;
+};
+
+#define readChar(lxr, txt, off) \
+ (((lxr)->bufInUse && yajl_buf_len((lxr)->buf) && lxr->bufOff < yajl_buf_len((lxr)->buf)) ? \
+ (*((const unsigned char *) yajl_buf_data((lxr)->buf) + ((lxr)->bufOff)++)) : \
+ ((txt)[(*(off))++]))
+
+#define unreadChar(lxr, off) ((*(off) > 0) ? (*(off))-- : ((lxr)->bufOff--))
+
+yajl_lexer
+yajl_lex_alloc(yajl_alloc_funcs * alloc,
+ unsigned int allowComments, unsigned int validateUTF8)
+{
+ yajl_lexer lxr = (yajl_lexer) YA_MALLOC(alloc, sizeof(struct yajl_lexer_t));
+ memset((void *) lxr, 0, sizeof(struct yajl_lexer_t));
+ lxr->buf = yajl_buf_alloc(alloc);
+ lxr->allowComments = allowComments;
+ lxr->validateUTF8 = validateUTF8;
+ lxr->alloc = alloc;
+ return lxr;
+}
+
+void
+yajl_lex_free(yajl_lexer lxr)
+{
+ yajl_buf_free(lxr->buf);
+ YA_FREE(lxr->alloc, lxr);
+ return;
+}
+
+/* a lookup table which lets us quickly determine three things:
+ * VEC - valid escaped control char
+ * note. the solidus '/' may be escaped or not.
+ * IJC - invalid json char
+ * VHC - valid hex char
+ * NFP - needs further processing (from a string scanning perspective)
+ * NUC - needs utf8 checking when enabled (from a string scanning perspective)
+ */
+#define VEC 0x01
+#define IJC 0x02
+#define VHC 0x04
+#define NFP 0x08
+#define NUC 0x10
+
+static const char charLookupTable[256] =
+{
+/*00*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+/*08*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+/*10*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+/*18*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
+
+/*20*/ 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 , 0 , 0 ,
+/*28*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , VEC ,
+/*30*/ VHC , VHC , VHC , VHC , VHC , VHC , VHC , VHC ,
+/*38*/ VHC , VHC , 0 , 0 , 0 , 0 , 0 , 0 ,
+
+/*40*/ 0 , VHC , VHC , VHC , VHC , VHC , VHC , 0 ,
+/*48*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*50*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+/*58*/ 0 , 0 , 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 ,
+
+/*60*/ 0 , VHC , VEC|VHC, VHC , VHC , VHC , VEC|VHC, 0 ,
+/*68*/ 0 , 0 , 0 , 0 , 0 , 0 , VEC , 0 ,
+/*70*/ 0 , 0 , VEC , 0 , VEC , 0 , 0 , 0 ,
+/*78*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC
+};
+
+/** process a variable length utf8 encoded codepoint.
+ *
+ * returns:
+ * yajl_tok_string - if valid utf8 char was parsed and offset was
+ * advanced
+ * yajl_tok_eof - if end of input was hit before validation could
+ * complete
+ * yajl_tok_error - if invalid utf8 was encountered
+ *
+ * NOTE: on error the offset will point to the first char of the
+ * invalid utf8 */
+#define UTF8_CHECK_EOF if (*offset >= jsonTextLen) { return yajl_tok_eof; }
+
+static yajl_tok
+yajl_lex_utf8_char(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset,
+ unsigned char curChar)
+{
+ if (curChar <= 0x7f) {
+ /* single byte */
+ return yajl_tok_string;
+ } else if ((curChar >> 5) == 0x6) {
+ /* two byte */
+ UTF8_CHECK_EOF;
+ curChar = readChar(lexer, jsonText, offset);
+ if ((curChar >> 6) == 0x2) return yajl_tok_string;
+ } else if ((curChar >> 4) == 0x0e) {
+ /* three byte */
+ UTF8_CHECK_EOF;
+ curChar = readChar(lexer, jsonText, offset);
+ if ((curChar >> 6) == 0x2) {
+ UTF8_CHECK_EOF;
+ curChar = readChar(lexer, jsonText, offset);
+ if ((curChar >> 6) == 0x2) return yajl_tok_string;
+ }
+ } else if ((curChar >> 3) == 0x1e) {
+ /* four byte */
+ UTF8_CHECK_EOF;
+ curChar = readChar(lexer, jsonText, offset);
+ if ((curChar >> 6) == 0x2) {
+ UTF8_CHECK_EOF;
+ curChar = readChar(lexer, jsonText, offset);
+ if ((curChar >> 6) == 0x2) {
+ UTF8_CHECK_EOF;
+ curChar = readChar(lexer, jsonText, offset);
+ if ((curChar >> 6) == 0x2) return yajl_tok_string;
+ }
+ }
+ }
+
+ return yajl_tok_error;
+}
+
+/* lex a string. input is the lexer, pointer to beginning of
+ * json text, and start of string (offset).
+ * a token is returned which has the following meanings:
+ * yajl_tok_string: lex of string was successful. offset points to
+ * terminating '"'.
+ * yajl_tok_eof: end of text was encountered before we could complete
+ * the lex.
+ * yajl_tok_error: embedded in the string were unallowable chars. offset
+ * points to the offending char
+ */
+#define STR_CHECK_EOF \
+if (*offset >= jsonTextLen) { \
+ tok = yajl_tok_eof; \
+ goto finish_string_lex; \
+}
+
+/** scan a string for interesting characters that might need further
+ * review. return the number of chars that are uninteresting and can
+ * be skipped.
+ * (lth) hi world, any thoughts on how to make this routine faster? */
+static size_t
+yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
+{
+ unsigned char mask = IJC|NFP|(utf8check ? NUC : 0);
+ size_t skip = 0;
+ while (skip < len && !(charLookupTable[*buf] & mask))
+ {
+ skip++;
+ buf++;
+ }
+ return skip;
+}
+
+static yajl_tok
+yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset)
+{
+ yajl_tok tok = yajl_tok_error;
+ int hasEscapes = 0;
+
+ for (;;) {
+ unsigned char curChar;
+
+ /* now jump into a faster scanning routine to skip as much
+ * of the buffers as possible */
+ {
+ const unsigned char * p;
+ size_t len;
+
+ if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
+ lexer->bufOff < yajl_buf_len(lexer->buf)))
+ {
+ p = ((const unsigned char *) yajl_buf_data(lexer->buf) +
+ (lexer->bufOff));
+ len = yajl_buf_len(lexer->buf) - lexer->bufOff;
+ lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
+ }
+ else if (*offset < jsonTextLen)
+ {
+ p = jsonText + *offset;
+ len = jsonTextLen - *offset;
+ *offset += yajl_string_scan(p, len, lexer->validateUTF8);
+ }
+ }
+
+ STR_CHECK_EOF;
+
+ curChar = readChar(lexer, jsonText, offset);
+
+ /* quote terminates */
+ if (curChar == '"') {
+ tok = yajl_tok_string;
+ break;
+ }
+ /* backslash escapes a set of control chars, */
+ else if (curChar == '\\') {
+ hasEscapes = 1;
+ STR_CHECK_EOF;
+
+ /* special case \u */
+ curChar = readChar(lexer, jsonText, offset);
+ if (curChar == 'u') {
+ unsigned int i = 0;
+
+ for (i=0;i<4;i++) {
+ STR_CHECK_EOF;
+ curChar = readChar(lexer, jsonText, offset);
+ if (!(charLookupTable[curChar] & VHC)) {
+ /* back up to offending char */
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_string_invalid_hex_char;
+ goto finish_string_lex;
+ }
+ }
+ } else if (!(charLookupTable[curChar] & VEC)) {
+ /* back up to offending char */
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_string_invalid_escaped_char;
+ goto finish_string_lex;
+ }
+ }
+ /* when not validating UTF8 it's a simple table lookup to determine
+ * if the present character is invalid */
+ else if(charLookupTable[curChar] & IJC) {
+ /* back up to offending char */
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_string_invalid_json_char;
+ goto finish_string_lex;
+ }
+ /* when in validate UTF8 mode we need to do some extra work */
+ else if (lexer->validateUTF8) {
+ yajl_tok t = yajl_lex_utf8_char(lexer, jsonText, jsonTextLen,
+ offset, curChar);
+
+ if (t == yajl_tok_eof) {
+ tok = yajl_tok_eof;
+ goto finish_string_lex;
+ } else if (t == yajl_tok_error) {
+ lexer->error = yajl_lex_string_invalid_utf8;
+ goto finish_string_lex;
+ }
+ }
+ /* accept it, and move on */
+ }
+ finish_string_lex:
+ /* tell our buddy, the parser, wether he needs to process this string
+ * again */
+ if (hasEscapes && tok == yajl_tok_string) {
+ tok = yajl_tok_string_with_escapes;
+ }
+
+ return tok;
+}
+
+#define RETURN_IF_EOF if (*offset >= jsonTextLen) return yajl_tok_eof;
+
+static yajl_tok
+yajl_lex_number(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset)
+{
+ /** XXX: numbers are the only entities in json that we must lex
+ * _beyond_ in order to know that they are complete. There
+ * is an ambiguous case for integers at EOF. */
+
+ unsigned char c;
+
+ yajl_tok tok = yajl_tok_integer;
+
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+
+ /* optional leading minus */
+ if (c == '-') {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ }
+
+ /* a single zero, or a series of integers */
+ if (c == '0') {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ } else if (c >= '1' && c <= '9') {
+ do {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ } while (c >= '0' && c <= '9');
+ } else {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_missing_integer_after_minus;
+ return yajl_tok_error;
+ }
+
+ /* optional fraction (indicates this is floating point) */
+ if (c == '.') {
+ int numRd = 0;
+
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+
+ while (c >= '0' && c <= '9') {
+ numRd++;
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ }
+
+ if (!numRd) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_missing_integer_after_decimal;
+ return yajl_tok_error;
+ }
+ tok = yajl_tok_double;
+ }
+
+ /* optional exponent (indicates this is floating point) */
+ if (c == 'e' || c == 'E') {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+
+ /* optional sign */
+ if (c == '+' || c == '-') {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ }
+
+ if (c >= '0' && c <= '9') {
+ do {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ } while (c >= '0' && c <= '9');
+ } else {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_missing_integer_after_exponent;
+ return yajl_tok_error;
+ }
+ tok = yajl_tok_double;
+ }
+
+ /* we always go "one too far" */
+ unreadChar(lexer, offset);
+
+ return tok;
+}
+
+static yajl_tok
+yajl_lex_comment(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset)
+{
+ unsigned char c;
+
+ yajl_tok tok = yajl_tok_comment;
+
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+
+ /* either slash or star expected */
+ if (c == '/') {
+ /* now we throw away until end of line */
+ do {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ } while (c != '\n');
+ } else if (c == '*') {
+ /* now we throw away until end of comment */
+ for (;;) {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ if (c == '*') {
+ RETURN_IF_EOF;
+ c = readChar(lexer, jsonText, offset);
+ if (c == '/') {
+ break;
+ } else {
+ unreadChar(lexer, offset);
+ }
+ }
+ }
+ } else {
+ lexer->error = yajl_lex_invalid_char;
+ tok = yajl_tok_error;
+ }
+
+ return tok;
+}
+
+yajl_tok
+yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset,
+ const unsigned char ** outBuf, size_t * outLen)
+{
+ yajl_tok tok = yajl_tok_error;
+ unsigned char c;
+ size_t startOffset = *offset;
+
+ *outBuf = NULL;
+ *outLen = 0;
+
+ for (;;) {
+ assert(*offset <= jsonTextLen);
+
+ if (*offset >= jsonTextLen) {
+ tok = yajl_tok_eof;
+ goto lexed;
+ }
+
+ c = readChar(lexer, jsonText, offset);
+
+ switch (c) {
+ case '{':
+ tok = yajl_tok_left_bracket;
+ goto lexed;
+ case '}':
+ tok = yajl_tok_right_bracket;
+ goto lexed;
+ case '[':
+ tok = yajl_tok_left_brace;
+ goto lexed;
+ case ']':
+ tok = yajl_tok_right_brace;
+ goto lexed;
+ case ',':
+ tok = yajl_tok_comma;
+ goto lexed;
+ case ':':
+ tok = yajl_tok_colon;
+ goto lexed;
+ case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
+ startOffset++;
+ break;
+ case 't': {
+ const char * want = "rue";
+ do {
+ if (*offset >= jsonTextLen) {
+ tok = yajl_tok_eof;
+ goto lexed;
+ }
+ c = readChar(lexer, jsonText, offset);
+ if (c != *want) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_invalid_string;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ } while (*(++want));
+ tok = yajl_tok_bool;
+ goto lexed;
+ }
+ case 'f': {
+ const char * want = "alse";
+ do {
+ if (*offset >= jsonTextLen) {
+ tok = yajl_tok_eof;
+ goto lexed;
+ }
+ c = readChar(lexer, jsonText, offset);
+ if (c != *want) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_invalid_string;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ } while (*(++want));
+ tok = yajl_tok_bool;
+ goto lexed;
+ }
+ case 'n': {
+ const char * want = "ull";
+ do {
+ if (*offset >= jsonTextLen) {
+ tok = yajl_tok_eof;
+ goto lexed;
+ }
+ c = readChar(lexer, jsonText, offset);
+ if (c != *want) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_invalid_string;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ } while (*(++want));
+ tok = yajl_tok_null;
+ goto lexed;
+ }
+ case '"': {
+ tok = yajl_lex_string(lexer, (const unsigned char *) jsonText,
+ jsonTextLen, offset);
+ goto lexed;
+ }
+ case '-':
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9': {
+ /* integer parsing wants to start from the beginning */
+ unreadChar(lexer, offset);
+ tok = yajl_lex_number(lexer, (const unsigned char *) jsonText,
+ jsonTextLen, offset);
+ goto lexed;
+ }
+ case '/':
+ /* hey, look, a probable comment! If comments are disabled
+ * it's an error. */
+ if (!lexer->allowComments) {
+ unreadChar(lexer, offset);
+ lexer->error = yajl_lex_unallowed_comment;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ /* if comments are enabled, then we should try to lex
+ * the thing. possible outcomes are
+ * - successful lex (tok_comment, which means continue),
+ * - malformed comment opening (slash not followed by
+ * '*' or '/') (tok_error)
+ * - eof hit. (tok_eof) */
+ tok = yajl_lex_comment(lexer, (const unsigned char *) jsonText,
+ jsonTextLen, offset);
+ if (tok == yajl_tok_comment) {
+ /* "error" is silly, but that's the initial
+ * state of tok. guilty until proven innocent. */
+ tok = yajl_tok_error;
+ yajl_buf_clear(lexer->buf);
+ lexer->bufInUse = 0;
+ startOffset = *offset;
+ break;
+ }
+ /* hit error or eof, bail */
+ goto lexed;
+ default:
+ lexer->error = yajl_lex_invalid_char;
+ tok = yajl_tok_error;
+ goto lexed;
+ }
+ }
+
+
+ lexed:
+ /* need to append to buffer if the buffer is in use or
+ * if it's an EOF token */
+ if (tok == yajl_tok_eof || lexer->bufInUse) {
+ if (!lexer->bufInUse) yajl_buf_clear(lexer->buf);
+ lexer->bufInUse = 1;
+ yajl_buf_append(lexer->buf, jsonText + startOffset, *offset - startOffset);
+ lexer->bufOff = 0;
+
+ if (tok != yajl_tok_eof) {
+ *outBuf = yajl_buf_data(lexer->buf);
+ *outLen = yajl_buf_len(lexer->buf);
+ lexer->bufInUse = 0;
+ }
+ } else if (tok != yajl_tok_error) {
+ *outBuf = jsonText + startOffset;
+ *outLen = *offset - startOffset;
+ }
+
+ /* special case for strings. skip the quotes. */
+ if (tok == yajl_tok_string || tok == yajl_tok_string_with_escapes)
+ {
+ assert(*outLen >= 2);
+ (*outBuf)++;
+ *outLen -= 2;
+ }
+
+
+#ifdef YAJL_LEXER_DEBUG
+ if (tok == yajl_tok_error) {
+ printf("lexical error: %s\n",
+ yajl_lex_error_to_string(yajl_lex_get_error(lexer)));
+ } else if (tok == yajl_tok_eof) {
+ printf("EOF hit\n");
+ } else {
+ printf("lexed %s: '", tokToStr(tok));
+ fwrite(*outBuf, 1, *outLen, stdout);
+ printf("'\n");
+ }
+#endif
+
+ return tok;
+}
+
+const char *
+yajl_lex_error_to_string(yajl_lex_error error)
+{
+ switch (error) {
+ case yajl_lex_e_ok:
+ return "ok, no error";
+ case yajl_lex_string_invalid_utf8:
+ return "invalid bytes in UTF8 string.";
+ case yajl_lex_string_invalid_escaped_char:
+ return "inside a string, '\\' occurs before a character "
+ "which it may not.";
+ case yajl_lex_string_invalid_json_char:
+ return "invalid character inside string.";
+ case yajl_lex_string_invalid_hex_char:
+ return "invalid (non-hex) character occurs after '\\u' inside "
+ "string.";
+ case yajl_lex_invalid_char:
+ return "invalid char in json text.";
+ case yajl_lex_invalid_string:
+ return "invalid string in json text.";
+ case yajl_lex_missing_integer_after_exponent:
+ return "malformed number, a digit is required after the exponent.";
+ case yajl_lex_missing_integer_after_decimal:
+ return "malformed number, a digit is required after the "
+ "decimal point.";
+ case yajl_lex_missing_integer_after_minus:
+ return "malformed number, a digit is required after the "
+ "minus sign.";
+ case yajl_lex_unallowed_comment:
+ return "probable comment found in input text, comments are "
+ "not enabled.";
+ }
+ return "unknown error code";
+}
+
+
+/** allows access to more specific information about the lexical
+ * error when yajl_lex_lex returns yajl_tok_error. */
+yajl_lex_error
+yajl_lex_get_error(yajl_lexer lexer)
+{
+ if (lexer == NULL) return (yajl_lex_error) -1;
+ return lexer->error;
+}
+
+size_t yajl_lex_current_line(yajl_lexer lexer)
+{
+ return lexer->lineOff;
+}
+
+size_t yajl_lex_current_char(yajl_lexer lexer)
+{
+ return lexer->charOff;
+}
+
+yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t offset)
+{
+ const unsigned char * outBuf;
+ size_t outLen;
+ size_t bufLen = yajl_buf_len(lexer->buf);
+ size_t bufOff = lexer->bufOff;
+ unsigned int bufInUse = lexer->bufInUse;
+ yajl_tok tok;
+
+ tok = yajl_lex_lex(lexer, jsonText, jsonTextLen, &offset,
+ &outBuf, &outLen);
+
+ lexer->bufOff = bufOff;
+ lexer->bufInUse = bufInUse;
+ yajl_buf_truncate(lexer->buf, bufLen);
+
+ return tok;
+}
diff --git a/xlators/cluster/nsr-server/src/yajl_lex.h b/xlators/cluster/nsr-server/src/yajl_lex.h
new file mode 100644
index 000000000..cbaae0c13
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_lex.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_LEX_H__
+#define __YAJL_LEX_H__
+
+#include "yajl/yajl_common.h"
+
+typedef enum {
+ yajl_tok_bool,
+ yajl_tok_colon,
+ yajl_tok_comma,
+ yajl_tok_eof,
+ yajl_tok_error,
+ yajl_tok_left_brace,
+ yajl_tok_left_bracket,
+ yajl_tok_null,
+ yajl_tok_right_brace,
+ yajl_tok_right_bracket,
+
+ /* we differentiate between integers and doubles to allow the
+ * parser to interpret the number without re-scanning */
+ yajl_tok_integer,
+ yajl_tok_double,
+
+ /* we differentiate between strings which require further processing,
+ * and strings that do not */
+ yajl_tok_string,
+ yajl_tok_string_with_escapes,
+
+ /* comment tokens are not currently returned to the parser, ever */
+ yajl_tok_comment
+} yajl_tok;
+
+typedef struct yajl_lexer_t * yajl_lexer;
+
+yajl_lexer yajl_lex_alloc(yajl_alloc_funcs * alloc,
+ unsigned int allowComments,
+ unsigned int validateUTF8);
+
+void yajl_lex_free(yajl_lexer lexer);
+
+/**
+ * run/continue a lex. "offset" is an input/output parameter.
+ * It should be initialized to zero for a
+ * new chunk of target text, and upon subsetquent calls with the same
+ * target text should passed with the value of the previous invocation.
+ *
+ * the client may be interested in the value of offset when an error is
+ * returned from the lexer. This allows the client to render useful
+n * error messages.
+ *
+ * When you pass the next chunk of data, context should be reinitialized
+ * to zero.
+ *
+ * Finally, the output buffer is usually just a pointer into the jsonText,
+ * however in cases where the entity being lexed spans multiple chunks,
+ * the lexer will buffer the entity and the data returned will be
+ * a pointer into that buffer.
+ *
+ * This behavior is abstracted from client code except for the performance
+ * implications which require that the client choose a reasonable chunk
+ * size to get adequate performance.
+ */
+yajl_tok yajl_lex_lex(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t * offset,
+ const unsigned char ** outBuf, size_t * outLen);
+
+/** have a peek at the next token, but don't move the lexer forward */
+yajl_tok yajl_lex_peek(yajl_lexer lexer, const unsigned char * jsonText,
+ size_t jsonTextLen, size_t offset);
+
+
+typedef enum {
+ yajl_lex_e_ok = 0,
+ yajl_lex_string_invalid_utf8,
+ yajl_lex_string_invalid_escaped_char,
+ yajl_lex_string_invalid_json_char,
+ yajl_lex_string_invalid_hex_char,
+ yajl_lex_invalid_char,
+ yajl_lex_invalid_string,
+ yajl_lex_missing_integer_after_decimal,
+ yajl_lex_missing_integer_after_exponent,
+ yajl_lex_missing_integer_after_minus,
+ yajl_lex_unallowed_comment
+} yajl_lex_error;
+
+const char * yajl_lex_error_to_string(yajl_lex_error error);
+
+/** allows access to more specific information about the lexical
+ * error when yajl_lex_lex returns yajl_tok_error. */
+yajl_lex_error yajl_lex_get_error(yajl_lexer lexer);
+
+/** get the current offset into the most recently lexed json string. */
+size_t yajl_lex_current_offset(yajl_lexer lexer);
+
+/** get the number of lines lexed by this lexer instance */
+size_t yajl_lex_current_line(yajl_lexer lexer);
+
+/** get the number of chars lexed by this lexer instance since the last
+ * \n or \r */
+size_t yajl_lex_current_char(yajl_lexer lexer);
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_parser.c b/xlators/cluster/nsr-server/src/yajl_parser.c
new file mode 100644
index 000000000..bf9ef24ef
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_parser.c
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "yajl/yajl_parse.h"
+#include "yajl_lex.h"
+#include "yajl_parser.h"
+#include "yajl_encode.h"
+#include "yajl_bytestack.h"
+
+#include <stdlib.h>
+#include <limits.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include <math.h>
+
+#define MAX_VALUE_TO_MULTIPLY ((LLONG_MAX / 10) + (LLONG_MAX % 10))
+
+ /* same semantics as strtol */
+long long
+yajl_parse_integer(const unsigned char *number, unsigned int length)
+{
+ long long ret = 0;
+ long sign = 1;
+ const unsigned char *pos = number;
+ if (*pos == '-') { pos++; sign = -1; }
+ if (*pos == '+') { pos++; }
+
+ while (pos < number + length) {
+ if ( ret > MAX_VALUE_TO_MULTIPLY ) {
+ errno = ERANGE;
+ return sign == 1 ? LLONG_MAX : LLONG_MIN;
+ }
+ ret *= 10;
+ if (LLONG_MAX - ret < (*pos - '0')) {
+ errno = ERANGE;
+ return sign == 1 ? LLONG_MAX : LLONG_MIN;
+ }
+ ret += (*pos++ - '0');
+ }
+
+ return sign * ret;
+}
+
+unsigned char *
+yajl_render_error_string(yajl_handle hand, const unsigned char * jsonText,
+ size_t jsonTextLen, int verbose)
+{
+ size_t offset = hand->bytesConsumed;
+ unsigned char * str;
+ const char * errorType = NULL;
+ const char * errorText = NULL;
+ char text[72];
+ const char * arrow = " (right here) ------^\n";
+
+ if (yajl_bs_current(hand->stateStack) == yajl_state_parse_error) {
+ errorType = "parse";
+ errorText = hand->parseError;
+ } else if (yajl_bs_current(hand->stateStack) == yajl_state_lexical_error) {
+ errorType = "lexical";
+ errorText = yajl_lex_error_to_string(yajl_lex_get_error(hand->lexer));
+ } else {
+ errorType = "unknown";
+ }
+
+ {
+ size_t memneeded = 0;
+ memneeded += strlen(errorType);
+ memneeded += strlen(" error");
+ if (errorText != NULL) {
+ memneeded += strlen(": ");
+ memneeded += strlen(errorText);
+ }
+ str = (unsigned char *) YA_MALLOC(&(hand->alloc), memneeded + 2);
+ if (!str) return NULL;
+ str[0] = 0;
+ strcat((char *) str, errorType);
+ strcat((char *) str, " error");
+ if (errorText != NULL) {
+ strcat((char *) str, ": ");
+ strcat((char *) str, errorText);
+ }
+ strcat((char *) str, "\n");
+ }
+
+ /* now we append as many spaces as needed to make sure the error
+ * falls at char 41, if verbose was specified */
+ if (verbose) {
+ size_t start, end, i;
+ size_t spacesNeeded;
+
+ spacesNeeded = (offset < 30 ? 40 - offset : 10);
+ start = (offset >= 30 ? offset - 30 : 0);
+ end = (offset + 30 > jsonTextLen ? jsonTextLen : offset + 30);
+
+ for (i=0;i<spacesNeeded;i++) text[i] = ' ';
+
+ for (;start < end;start++, i++) {
+ if (jsonText[start] != '\n' && jsonText[start] != '\r')
+ {
+ text[i] = jsonText[start];
+ }
+ else
+ {
+ text[i] = ' ';
+ }
+ }
+ assert(i <= 71);
+ text[i++] = '\n';
+ text[i] = 0;
+ {
+ char * newStr = (char *)
+ YA_MALLOC(&(hand->alloc), (unsigned int)(strlen((char *) str) +
+ strlen((char *) text) +
+ strlen(arrow) + 1));
+ if (newStr) {
+ newStr[0] = 0;
+ strcat((char *) newStr, (char *) str);
+ strcat((char *) newStr, text);
+ strcat((char *) newStr, arrow);
+ }
+ YA_FREE(&(hand->alloc), str);
+ str = (unsigned char *) newStr;
+ }
+ }
+ return str;
+}
+
+/* check for client cancelation */
+#define _CC_CHK(x) \
+ if (!(x)) { \
+ yajl_bs_set(hand->stateStack, yajl_state_parse_error); \
+ hand->parseError = \
+ "client cancelled parse via callback return value"; \
+ return yajl_status_client_canceled; \
+ }
+
+
+yajl_status
+yajl_do_finish(yajl_handle hand)
+{
+ yajl_status stat;
+ stat = yajl_do_parse(hand,(const unsigned char *) " ",1);
+
+ if (stat != yajl_status_ok) return stat;
+
+ switch(yajl_bs_current(hand->stateStack))
+ {
+ case yajl_state_parse_error:
+ case yajl_state_lexical_error:
+ return yajl_status_error;
+ case yajl_state_got_value:
+ case yajl_state_parse_complete:
+ return yajl_status_ok;
+ default:
+ if (!(hand->flags & yajl_allow_partial_values))
+ {
+ yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+ hand->parseError = "premature EOF";
+ return yajl_status_error;
+ }
+ return yajl_status_ok;
+ }
+}
+
+yajl_status
+yajl_do_parse(yajl_handle hand, const unsigned char * jsonText,
+ size_t jsonTextLen)
+{
+ yajl_tok tok;
+ const unsigned char * buf;
+ size_t bufLen;
+ size_t * offset = &(hand->bytesConsumed);
+
+ *offset = 0;
+
+ around_again:
+ switch (yajl_bs_current(hand->stateStack)) {
+ case yajl_state_parse_complete:
+ if (hand->flags & yajl_allow_multiple_values) {
+ yajl_bs_set(hand->stateStack, yajl_state_got_value);
+ goto around_again;
+ }
+ if (!(hand->flags & yajl_allow_trailing_garbage)) {
+ if (*offset != jsonTextLen) {
+ tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen,
+ offset, &buf, &bufLen);
+ if (tok != yajl_tok_eof) {
+ yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+ hand->parseError = "trailing garbage";
+ }
+ goto around_again;
+ }
+ }
+ return yajl_status_ok;
+ case yajl_state_lexical_error:
+ case yajl_state_parse_error:
+ return yajl_status_error;
+ case yajl_state_start:
+ case yajl_state_got_value:
+ case yajl_state_map_need_val:
+ case yajl_state_array_need_val:
+ case yajl_state_array_start: {
+ /* for arrays and maps, we advance the state for this
+ * depth, then push the state of the next depth.
+ * If an error occurs during the parsing of the nesting
+ * enitity, the state at this level will not matter.
+ * a state that needs pushing will be anything other
+ * than state_start */
+
+ yajl_state stateToPush = yajl_state_start;
+
+ tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen,
+ offset, &buf, &bufLen);
+
+ switch (tok) {
+ case yajl_tok_eof:
+ return yajl_status_ok;
+ case yajl_tok_error:
+ yajl_bs_set(hand->stateStack, yajl_state_lexical_error);
+ goto around_again;
+ case yajl_tok_string:
+ if (hand->callbacks && hand->callbacks->yajl_string) {
+ _CC_CHK(hand->callbacks->yajl_string(hand->ctx,
+ buf, bufLen));
+ }
+ break;
+ case yajl_tok_string_with_escapes:
+ if (hand->callbacks && hand->callbacks->yajl_string) {
+ yajl_buf_clear(hand->decodeBuf);
+ yajl_string_decode(hand->decodeBuf, buf, bufLen);
+ _CC_CHK(hand->callbacks->yajl_string(
+ hand->ctx, yajl_buf_data(hand->decodeBuf),
+ yajl_buf_len(hand->decodeBuf)));
+ }
+ break;
+ case yajl_tok_bool:
+ if (hand->callbacks && hand->callbacks->yajl_boolean) {
+ _CC_CHK(hand->callbacks->yajl_boolean(hand->ctx,
+ *buf == 't'));
+ }
+ break;
+ case yajl_tok_null:
+ if (hand->callbacks && hand->callbacks->yajl_null) {
+ _CC_CHK(hand->callbacks->yajl_null(hand->ctx));
+ }
+ break;
+ case yajl_tok_left_bracket:
+ if (hand->callbacks && hand->callbacks->yajl_start_map) {
+ _CC_CHK(hand->callbacks->yajl_start_map(hand->ctx));
+ }
+ stateToPush = yajl_state_map_start;
+ break;
+ case yajl_tok_left_brace:
+ if (hand->callbacks && hand->callbacks->yajl_start_array) {
+ _CC_CHK(hand->callbacks->yajl_start_array(hand->ctx));
+ }
+ stateToPush = yajl_state_array_start;
+ break;
+ case yajl_tok_integer:
+ if (hand->callbacks) {
+ if (hand->callbacks->yajl_number) {
+ _CC_CHK(hand->callbacks->yajl_number(
+ hand->ctx,(const char *) buf, bufLen));
+ } else if (hand->callbacks->yajl_integer) {
+ long long int i = 0;
+ i = yajl_parse_integer(buf, bufLen);
+ if ((i == LLONG_MIN || i == LLONG_MAX) &&
+ errno == ERANGE)
+ {
+ yajl_bs_set(hand->stateStack,
+ yajl_state_parse_error);
+ hand->parseError = "integer overflow" ;
+ /* try to restore error offset */
+ if (*offset >= bufLen) *offset -= bufLen;
+ else *offset = 0;
+ goto around_again;
+ }
+ _CC_CHK(hand->callbacks->yajl_integer(hand->ctx,
+ i));
+ }
+ }
+ break;
+ case yajl_tok_double:
+ if (hand->callbacks) {
+ if (hand->callbacks->yajl_number) {
+ _CC_CHK(hand->callbacks->yajl_number(
+ hand->ctx, (const char *) buf, bufLen));
+ } else if (hand->callbacks->yajl_double) {
+ double d = 0.0;
+ yajl_buf_clear(hand->decodeBuf);
+ yajl_buf_append(hand->decodeBuf, buf, bufLen);
+ buf = yajl_buf_data(hand->decodeBuf);
+ d = strtod((char *) buf, NULL);
+ if ((d == HUGE_VAL || d == -HUGE_VAL) &&
+ errno == ERANGE)
+ {
+ yajl_bs_set(hand->stateStack,
+ yajl_state_parse_error);
+ hand->parseError = "numeric (floating point) "
+ "overflow";
+ /* try to restore error offset */
+ if (*offset >= bufLen) *offset -= bufLen;
+ else *offset = 0;
+ goto around_again;
+ }
+ _CC_CHK(hand->callbacks->yajl_double(hand->ctx,
+ d));
+ }
+ }
+ break;
+ case yajl_tok_right_brace: {
+ if (yajl_bs_current(hand->stateStack) ==
+ yajl_state_array_start)
+ {
+ if (hand->callbacks &&
+ hand->callbacks->yajl_end_array)
+ {
+ _CC_CHK(hand->callbacks->yajl_end_array(hand->ctx));
+ }
+ yajl_bs_pop(hand->stateStack);
+ goto around_again;
+ }
+ /* intentional fall-through */
+ }
+ case yajl_tok_colon:
+ case yajl_tok_comma:
+ case yajl_tok_right_bracket:
+ yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+ hand->parseError =
+ "unallowed token at this point in JSON text";
+ goto around_again;
+ default:
+ yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+ hand->parseError = "invalid token, internal error";
+ goto around_again;
+ }
+ /* got a value. transition depends on the state we're in. */
+ {
+ yajl_state s = yajl_bs_current(hand->stateStack);
+ if (s == yajl_state_start || s == yajl_state_got_value) {
+ yajl_bs_set(hand->stateStack, yajl_state_parse_complete);
+ } else if (s == yajl_state_map_need_val) {
+ yajl_bs_set(hand->stateStack, yajl_state_map_got_val);
+ } else {
+ yajl_bs_set(hand->stateStack, yajl_state_array_got_val);
+ }
+ }
+ if (stateToPush != yajl_state_start) {
+ yajl_bs_push(hand->stateStack, stateToPush);
+ }
+
+ goto around_again;
+ }
+ case yajl_state_map_start:
+ case yajl_state_map_need_key: {
+ /* only difference between these two states is that in
+ * start '}' is valid, whereas in need_key, we've parsed
+ * a comma, and a string key _must_ follow */
+ tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen,
+ offset, &buf, &bufLen);
+ switch (tok) {
+ case yajl_tok_eof:
+ return yajl_status_ok;
+ case yajl_tok_error:
+ yajl_bs_set(hand->stateStack, yajl_state_lexical_error);
+ goto around_again;
+ case yajl_tok_string_with_escapes:
+ if (hand->callbacks && hand->callbacks->yajl_map_key) {
+ yajl_buf_clear(hand->decodeBuf);
+ yajl_string_decode(hand->decodeBuf, buf, bufLen);
+ buf = yajl_buf_data(hand->decodeBuf);
+ bufLen = yajl_buf_len(hand->decodeBuf);
+ }
+ /* intentional fall-through */
+ case yajl_tok_string:
+ if (hand->callbacks && hand->callbacks->yajl_map_key) {
+ _CC_CHK(hand->callbacks->yajl_map_key(hand->ctx, buf,
+ bufLen));
+ }
+ yajl_bs_set(hand->stateStack, yajl_state_map_sep);
+ goto around_again;
+ case yajl_tok_right_bracket:
+ if (yajl_bs_current(hand->stateStack) ==
+ yajl_state_map_start)
+ {
+ if (hand->callbacks && hand->callbacks->yajl_end_map) {
+ _CC_CHK(hand->callbacks->yajl_end_map(hand->ctx));
+ }
+ yajl_bs_pop(hand->stateStack);
+ goto around_again;
+ }
+ default:
+ yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+ hand->parseError =
+ "invalid object key (must be a string)";
+ goto around_again;
+ }
+ }
+ case yajl_state_map_sep: {
+ tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen,
+ offset, &buf, &bufLen);
+ switch (tok) {
+ case yajl_tok_colon:
+ yajl_bs_set(hand->stateStack, yajl_state_map_need_val);
+ goto around_again;
+ case yajl_tok_eof:
+ return yajl_status_ok;
+ case yajl_tok_error:
+ yajl_bs_set(hand->stateStack, yajl_state_lexical_error);
+ goto around_again;
+ default:
+ yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+ hand->parseError = "object key and value must "
+ "be separated by a colon (':')";
+ goto around_again;
+ }
+ }
+ case yajl_state_map_got_val: {
+ tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen,
+ offset, &buf, &bufLen);
+ switch (tok) {
+ case yajl_tok_right_bracket:
+ if (hand->callbacks && hand->callbacks->yajl_end_map) {
+ _CC_CHK(hand->callbacks->yajl_end_map(hand->ctx));
+ }
+ yajl_bs_pop(hand->stateStack);
+ goto around_again;
+ case yajl_tok_comma:
+ yajl_bs_set(hand->stateStack, yajl_state_map_need_key);
+ goto around_again;
+ case yajl_tok_eof:
+ return yajl_status_ok;
+ case yajl_tok_error:
+ yajl_bs_set(hand->stateStack, yajl_state_lexical_error);
+ goto around_again;
+ default:
+ yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+ hand->parseError = "after key and value, inside map, "
+ "I expect ',' or '}'";
+ /* try to restore error offset */
+ if (*offset >= bufLen) *offset -= bufLen;
+ else *offset = 0;
+ goto around_again;
+ }
+ }
+ case yajl_state_array_got_val: {
+ tok = yajl_lex_lex(hand->lexer, jsonText, jsonTextLen,
+ offset, &buf, &bufLen);
+ switch (tok) {
+ case yajl_tok_right_brace:
+ if (hand->callbacks && hand->callbacks->yajl_end_array) {
+ _CC_CHK(hand->callbacks->yajl_end_array(hand->ctx));
+ }
+ yajl_bs_pop(hand->stateStack);
+ goto around_again;
+ case yajl_tok_comma:
+ yajl_bs_set(hand->stateStack, yajl_state_array_need_val);
+ goto around_again;
+ case yajl_tok_eof:
+ return yajl_status_ok;
+ case yajl_tok_error:
+ yajl_bs_set(hand->stateStack, yajl_state_lexical_error);
+ goto around_again;
+ default:
+ yajl_bs_set(hand->stateStack, yajl_state_parse_error);
+ hand->parseError =
+ "after array element, I expect ',' or ']'";
+ goto around_again;
+ }
+ }
+ }
+
+ abort();
+ return yajl_status_error;
+}
+
diff --git a/xlators/cluster/nsr-server/src/yajl_parser.h b/xlators/cluster/nsr-server/src/yajl_parser.h
new file mode 100644
index 000000000..53409731a
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_parser.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2007-2011, Lloyd Hilaiel <lloyd@hilaiel.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __YAJL_PARSER_H__
+#define __YAJL_PARSER_H__
+
+#include "yajl/yajl_parse.h"
+#include "yajl_bytestack.h"
+#include "yajl_buf.h"
+#include "yajl_lex.h"
+
+
+typedef enum {
+ yajl_state_start = 0,
+ yajl_state_parse_complete,
+ yajl_state_parse_error,
+ yajl_state_lexical_error,
+ yajl_state_map_start,
+ yajl_state_map_sep,
+ yajl_state_map_need_val,
+ yajl_state_map_got_val,
+ yajl_state_map_need_key,
+ yajl_state_array_start,
+ yajl_state_array_got_val,
+ yajl_state_array_need_val,
+ yajl_state_got_value,
+} yajl_state;
+
+struct yajl_handle_t {
+ const yajl_callbacks * callbacks;
+ void * ctx;
+ yajl_lexer lexer;
+ const char * parseError;
+ /* the number of bytes consumed from the last client buffer,
+ * in the case of an error this will be an error offset, in the
+ * case of an error this can be used as the error offset */
+ size_t bytesConsumed;
+ /* temporary storage for decoded strings */
+ yajl_buf decodeBuf;
+ /* a stack of states. access with yajl_state_XXX routines */
+ yajl_bytestack stateStack;
+ /* memory allocation routines */
+ yajl_alloc_funcs alloc;
+ /* bitfield */
+ unsigned int flags;
+};
+
+yajl_status
+yajl_do_parse(yajl_handle handle, const unsigned char * jsonText,
+ size_t jsonTextLen);
+
+yajl_status
+yajl_do_finish(yajl_handle handle);
+
+unsigned char *
+yajl_render_error_string(yajl_handle hand, const unsigned char * jsonText,
+ size_t jsonTextLen, int verbose);
+
+/* A little built in integer parsing routine with the same semantics as strtol
+ * that's unaffected by LOCALE. */
+long long
+yajl_parse_integer(const unsigned char *number, unsigned int length);
+
+
+#endif
diff --git a/xlators/cluster/nsr-server/src/yajl_tree.c b/xlators/cluster/nsr-server/src/yajl_tree.c
new file mode 100644
index 000000000..1a69134e7
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_tree.c
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 2010-2011 Florian Forster <ff at octo.it>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "yajl/yajl_tree.h"
+#include "yajl/yajl_parse.h"
+
+#include "yajl_parser.h"
+
+#ifdef WIN32
+#define snprintf sprintf_s
+#endif
+
+#define STATUS_CONTINUE 1
+#define STATUS_ABORT 0
+
+struct stack_elem_s;
+typedef struct stack_elem_s stack_elem_t;
+struct stack_elem_s
+{
+ char * key;
+ yajl_val value;
+ stack_elem_t *next;
+};
+
+struct context_s
+{
+ stack_elem_t *stack;
+ yajl_val root;
+ char *errbuf;
+ size_t errbuf_size;
+};
+typedef struct context_s context_t;
+
+#define RETURN_ERROR(ctx,retval,...) { \
+ if ((ctx)->errbuf != NULL) \
+ snprintf ((ctx)->errbuf, (ctx)->errbuf_size, __VA_ARGS__); \
+ return (retval); \
+ }
+
+static yajl_val value_alloc (yajl_type type)
+{
+ yajl_val v;
+
+ v = malloc (sizeof (*v));
+ if (v == NULL) return (NULL);
+ memset (v, 0, sizeof (*v));
+ v->type = type;
+
+ return (v);
+}
+
+static void yajl_object_free (yajl_val v)
+{
+ size_t i;
+
+ if (!YAJL_IS_OBJECT(v)) return;
+
+ for (i = 0; i < v->u.object.len; i++)
+ {
+ free((char *) v->u.object.keys[i]);
+ v->u.object.keys[i] = NULL;
+ yajl_tree_free (v->u.object.values[i]);
+ v->u.object.values[i] = NULL;
+ }
+
+ free((void*) v->u.object.keys);
+ free(v->u.object.values);
+ free(v);
+}
+
+static void yajl_array_free (yajl_val v)
+{
+ size_t i;
+
+ if (!YAJL_IS_ARRAY(v)) return;
+
+ for (i = 0; i < v->u.array.len; i++)
+ {
+ yajl_tree_free (v->u.array.values[i]);
+ v->u.array.values[i] = NULL;
+ }
+
+ free(v->u.array.values);
+ free(v);
+}
+
+/*
+ * Parsing nested objects and arrays is implemented using a stack. When a new
+ * object or array starts (a curly or a square opening bracket is read), an
+ * appropriate value is pushed on the stack. When the end of the object is
+ * reached (an appropriate closing bracket has been read), the value is popped
+ * off the stack and added to the enclosing object using "context_add_value".
+ */
+static int context_push(context_t *ctx, yajl_val v)
+{
+ stack_elem_t *stack;
+
+ stack = malloc (sizeof (*stack));
+ if (stack == NULL)
+ RETURN_ERROR (ctx, ENOMEM, "Out of memory");
+ memset (stack, 0, sizeof (*stack));
+
+ assert ((ctx->stack == NULL)
+ || YAJL_IS_OBJECT (v)
+ || YAJL_IS_ARRAY (v));
+
+ stack->value = v;
+ stack->next = ctx->stack;
+ ctx->stack = stack;
+
+ return (0);
+}
+
+static yajl_val context_pop(context_t *ctx)
+{
+ stack_elem_t *stack;
+ yajl_val v;
+
+ if (ctx->stack == NULL)
+ RETURN_ERROR (ctx, NULL, "context_pop: "
+ "Bottom of stack reached prematurely");
+
+ stack = ctx->stack;
+ ctx->stack = stack->next;
+
+ v = stack->value;
+
+ free (stack);
+
+ return (v);
+}
+
+static int object_add_keyval(context_t *ctx,
+ yajl_val obj, char *key, yajl_val value)
+{
+ const char **tmpk;
+ yajl_val *tmpv;
+
+ /* We're checking for NULL in "context_add_value" or its callers. */
+ assert (ctx != NULL);
+ assert (obj != NULL);
+ assert (key != NULL);
+ assert (value != NULL);
+
+ /* We're assuring that "obj" is an object in "context_add_value". */
+ assert(YAJL_IS_OBJECT(obj));
+
+ tmpk = realloc((void *) obj->u.object.keys, sizeof(*(obj->u.object.keys)) * (obj->u.object.len + 1));
+ if (tmpk == NULL)
+ RETURN_ERROR(ctx, ENOMEM, "Out of memory");
+ obj->u.object.keys = tmpk;
+
+ tmpv = realloc(obj->u.object.values, sizeof (*obj->u.object.values) * (obj->u.object.len + 1));
+ if (tmpv == NULL)
+ RETURN_ERROR(ctx, ENOMEM, "Out of memory");
+ obj->u.object.values = tmpv;
+
+ obj->u.object.keys[obj->u.object.len] = key;
+ obj->u.object.values[obj->u.object.len] = value;
+ obj->u.object.len++;
+
+ return (0);
+}
+
+static int array_add_value (context_t *ctx,
+ yajl_val array, yajl_val value)
+{
+ yajl_val *tmp;
+
+ /* We're checking for NULL pointers in "context_add_value" or its
+ * callers. */
+ assert (ctx != NULL);
+ assert (array != NULL);
+ assert (value != NULL);
+
+ /* "context_add_value" will only call us with array values. */
+ assert(YAJL_IS_ARRAY(array));
+
+ tmp = realloc(array->u.array.values,
+ sizeof(*(array->u.array.values)) * (array->u.array.len + 1));
+ if (tmp == NULL)
+ RETURN_ERROR(ctx, ENOMEM, "Out of memory");
+ array->u.array.values = tmp;
+ array->u.array.values[array->u.array.len] = value;
+ array->u.array.len++;
+
+ return 0;
+}
+
+/*
+ * Add a value to the value on top of the stack or the "root" member in the
+ * context if the end of the parsing process is reached.
+ */
+static int context_add_value (context_t *ctx, yajl_val v)
+{
+ /* We're checking for NULL values in all the calling functions. */
+ assert (ctx != NULL);
+ assert (v != NULL);
+
+ /*
+ * There are three valid states in which this function may be called:
+ * - There is no value on the stack => This is the only value. This is the
+ * last step done when parsing a document. We assign the value to the
+ * "root" member and return.
+ * - The value on the stack is an object. In this case store the key on the
+ * stack or, if the key has already been read, add key and value to the
+ * object.
+ * - The value on the stack is an array. In this case simply add the value
+ * and return.
+ */
+ if (ctx->stack == NULL)
+ {
+ assert (ctx->root == NULL);
+ ctx->root = v;
+ return (0);
+ }
+ else if (YAJL_IS_OBJECT (ctx->stack->value))
+ {
+ if (ctx->stack->key == NULL)
+ {
+ if (!YAJL_IS_STRING (v))
+ RETURN_ERROR (ctx, EINVAL, "context_add_value: "
+ "Object key is not a string (%#04x)",
+ v->type);
+
+ ctx->stack->key = v->u.string;
+ v->u.string = NULL;
+ free(v);
+ return (0);
+ }
+ else /* if (ctx->key != NULL) */
+ {
+ char * key;
+
+ key = ctx->stack->key;
+ ctx->stack->key = NULL;
+ return (object_add_keyval (ctx, ctx->stack->value, key, v));
+ }
+ }
+ else if (YAJL_IS_ARRAY (ctx->stack->value))
+ {
+ return (array_add_value (ctx, ctx->stack->value, v));
+ }
+ else
+ {
+ RETURN_ERROR (ctx, EINVAL, "context_add_value: Cannot add value to "
+ "a value of type %#04x (not a composite type)",
+ ctx->stack->value->type);
+ }
+}
+
+static int handle_string (void *ctx,
+ const unsigned char *string, size_t string_length)
+{
+ yajl_val v;
+
+ v = value_alloc (yajl_t_string);
+ if (v == NULL)
+ RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory");
+
+ v->u.string = malloc (string_length + 1);
+ if (v->u.string == NULL)
+ {
+ free (v);
+ RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory");
+ }
+ memcpy(v->u.string, string, string_length);
+ v->u.string[string_length] = 0;
+
+ return ((context_add_value (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT);
+}
+
+static int handle_number (void *ctx, const char *string, size_t string_length)
+{
+ yajl_val v;
+ char *endptr;
+
+ v = value_alloc(yajl_t_number);
+ if (v == NULL)
+ RETURN_ERROR((context_t *) ctx, STATUS_ABORT, "Out of memory");
+
+ v->u.number.r = malloc(string_length + 1);
+ if (v->u.number.r == NULL)
+ {
+ free(v);
+ RETURN_ERROR((context_t *) ctx, STATUS_ABORT, "Out of memory");
+ }
+ memcpy(v->u.number.r, string, string_length);
+ v->u.number.r[string_length] = 0;
+
+ v->u.number.flags = 0;
+
+ endptr = NULL;
+ errno = 0;
+ v->u.number.i = yajl_parse_integer((const unsigned char *) v->u.number.r,
+ strlen(v->u.number.r));
+ if ((errno == 0) && (endptr != NULL) && (*endptr == 0))
+ v->u.number.flags |= YAJL_NUMBER_INT_VALID;
+
+ endptr = NULL;
+ errno = 0;
+ v->u.number.d = strtod(v->u.number.r, &endptr);
+ if ((errno == 0) && (endptr != NULL) && (*endptr == 0))
+ v->u.number.flags |= YAJL_NUMBER_DOUBLE_VALID;
+
+ return ((context_add_value(ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT);
+}
+
+static int handle_start_map (void *ctx)
+{
+ yajl_val v;
+
+ v = value_alloc(yajl_t_object);
+ if (v == NULL)
+ RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory");
+
+ v->u.object.keys = NULL;
+ v->u.object.values = NULL;
+ v->u.object.len = 0;
+
+ return ((context_push (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT);
+}
+
+static int handle_end_map (void *ctx)
+{
+ yajl_val v;
+
+ v = context_pop (ctx);
+ if (v == NULL)
+ return (STATUS_ABORT);
+
+ return ((context_add_value (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT);
+}
+
+static int handle_start_array (void *ctx)
+{
+ yajl_val v;
+
+ v = value_alloc(yajl_t_array);
+ if (v == NULL)
+ RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory");
+
+ v->u.array.values = NULL;
+ v->u.array.len = 0;
+
+ return ((context_push (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT);
+}
+
+static int handle_end_array (void *ctx)
+{
+ yajl_val v;
+
+ v = context_pop (ctx);
+ if (v == NULL)
+ return (STATUS_ABORT);
+
+ return ((context_add_value (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT);
+}
+
+static int handle_boolean (void *ctx, int boolean_value)
+{
+ yajl_val v;
+
+ v = value_alloc (boolean_value ? yajl_t_true : yajl_t_false);
+ if (v == NULL)
+ RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory");
+
+ return ((context_add_value (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT);
+}
+
+static int handle_null (void *ctx)
+{
+ yajl_val v;
+
+ v = value_alloc (yajl_t_null);
+ if (v == NULL)
+ RETURN_ERROR ((context_t *) ctx, STATUS_ABORT, "Out of memory");
+
+ return ((context_add_value (ctx, v) == 0) ? STATUS_CONTINUE : STATUS_ABORT);
+}
+
+/*
+ * Public functions
+ */
+yajl_val yajl_tree_parse (const char *input,
+ char *error_buffer, size_t error_buffer_size)
+{
+ static const yajl_callbacks callbacks =
+ {
+ /* null = */ handle_null,
+ /* boolean = */ handle_boolean,
+ /* integer = */ NULL,
+ /* double = */ NULL,
+ /* number = */ handle_number,
+ /* string = */ handle_string,
+ /* start map = */ handle_start_map,
+ /* map key = */ handle_string,
+ /* end map = */ handle_end_map,
+ /* start array = */ handle_start_array,
+ /* end array = */ handle_end_array
+ };
+
+ yajl_handle handle;
+ yajl_status status;
+ context_t ctx = { NULL, NULL, NULL, 0 };
+
+ ctx.errbuf = error_buffer;
+ ctx.errbuf_size = error_buffer_size;
+
+ if (error_buffer != NULL)
+ memset (error_buffer, 0, error_buffer_size);
+
+ handle = yajl_alloc (&callbacks, NULL, &ctx);
+ yajl_config(handle, yajl_allow_comments, 1);
+
+ status = yajl_parse(handle,
+ (unsigned char *) input,
+ strlen (input));
+ status = yajl_complete_parse (handle);
+ if (status != yajl_status_ok) {
+ if (error_buffer != NULL && error_buffer_size > 0) {
+ snprintf(
+ error_buffer, error_buffer_size, "%s",
+ (char *) yajl_get_error(handle, 1,
+ (const unsigned char *) input,
+ strlen(input)));
+ }
+ yajl_free (handle);
+ return NULL;
+ }
+
+ yajl_free (handle);
+ return (ctx.root);
+}
+
+yajl_val yajl_tree_get(yajl_val n, const char ** path, yajl_type type)
+{
+ if (!path) return NULL;
+ while (n && *path) {
+ unsigned int i;
+
+ if (n->type != yajl_t_object) return NULL;
+ for (i = 0; i < n->u.object.len; i++) {
+ if (!strcmp(*path, n->u.object.keys[i])) {
+ n = n->u.object.values[i];
+ break;
+ }
+ }
+ if (i == n->u.object.len) return NULL;
+ path++;
+ }
+ if (n && type != yajl_t_any && type != n->type) n = NULL;
+ return n;
+}
+
+void yajl_tree_free (yajl_val v)
+{
+ if (v == NULL) return;
+
+ if (YAJL_IS_STRING(v))
+ {
+ free(v->u.string);
+ free(v);
+ }
+ else if (YAJL_IS_NUMBER(v))
+ {
+ free(v->u.number.r);
+ free(v);
+ }
+ else if (YAJL_GET_OBJECT(v))
+ {
+ yajl_object_free(v);
+ }
+ else if (YAJL_GET_ARRAY(v))
+ {
+ yajl_array_free(v);
+ }
+ else /* if (yajl_t_true or yajl_t_false or yajl_t_null) */
+ {
+ free(v);
+ }
+}
diff --git a/xlators/cluster/nsr-server/src/yajl_version.c b/xlators/cluster/nsr-server/src/yajl_version.c
new file mode 100644
index 000000000..0671da722
--- /dev/null
+++ b/xlators/cluster/nsr-server/src/yajl_version.c
@@ -0,0 +1,7 @@
+#include <yajl/yajl_version.h>
+
+int yajl_version(void)
+{
+ return YAJL_VERSION;
+}
+
diff --git a/xlators/cluster/stripe/src/Makefile.am b/xlators/cluster/stripe/src/Makefile.am
index 42846df83..4268d6f03 100644
--- a/xlators/cluster/stripe/src/Makefile.am
+++ b/xlators/cluster/stripe/src/Makefile.am
@@ -1,8 +1,7 @@
-
xlator_LTLIBRARIES = stripe.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
-stripe_la_LDFLAGS = -module -avoidversion
+stripe_la_LDFLAGS = -module -avoid-version
stripe_la_SOURCES = stripe.c stripe-helpers.c \
$(top_builddir)/xlators/lib/src/libxlator.c
diff --git a/xlators/cluster/stripe/src/stripe-helpers.c b/xlators/cluster/stripe/src/stripe-helpers.c
index e85661c14..3c12809d6 100644
--- a/xlators/cluster/stripe/src/stripe-helpers.c
+++ b/xlators/cluster/stripe/src/stripe-helpers.c
@@ -12,6 +12,8 @@
#include "stripe.h"
#include "byte-order.h"
+#include "mem-types.h"
+#include "logging.h"
void
stripe_local_wipe (stripe_local_t *local)
@@ -158,6 +160,93 @@ stripe_free_xattr_str (stripe_local_t *local)
int32_t
+stripe_fill_lockinfo_xattr (xlator_t *this, stripe_local_t *local,
+ void **xattr_serz)
+{
+ int32_t ret = -1, i = 0, len = 0;
+ dict_t *tmp1 = NULL, *tmp2 = NULL;
+ char *buf = NULL;
+ stripe_xattr_sort_t *xattr = NULL;
+
+ if (xattr_serz == NULL) {
+ goto out;
+ }
+
+ tmp2 = dict_new ();
+
+ if (tmp2 == NULL) {
+ goto out;
+ }
+
+ for (i = 0; i < local->nallocs; i++) {
+ xattr = local->xattr_list + i;
+ len = xattr->xattr_len;
+
+ if (len && xattr && xattr->xattr_value) {
+ ret = dict_reset (tmp2);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "dict_reset failed (%s)",
+ strerror (-ret));
+ }
+
+ ret = dict_unserialize (xattr->xattr_value,
+ xattr->xattr_len,
+ &tmp2);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dict_unserialize failed (%s)",
+ strerror (-ret));
+ ret = -1;
+ goto out;
+ }
+
+ tmp1 = dict_copy (tmp2, tmp1);
+ if (tmp1 == NULL) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dict_copy failed (%s)",
+ strerror (-ret));
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+
+ len = dict_serialized_length (tmp1);
+ if (len > 0) {
+ buf = GF_CALLOC (1, len, gf_common_mt_dict_t);
+ if (buf == NULL) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_serialize (tmp1, buf);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dict_serialize failed (%s)", strerror (-ret));
+ GF_FREE(buf);
+ ret = -1;
+ goto out;
+ }
+
+ *xattr_serz = buf;
+ }
+
+ ret = 0;
+out:
+ if (tmp1 != NULL) {
+ dict_unref (tmp1);
+ }
+
+ if (tmp2 != NULL) {
+ dict_unref (tmp2);
+ }
+
+ return ret;
+}
+
+
+int32_t
stripe_fill_pathinfo_xattr (xlator_t *this, stripe_local_t *local,
char **xattr_serz)
{
@@ -172,8 +261,8 @@ stripe_fill_pathinfo_xattr (xlator_t *this, stripe_local_t *local,
goto out;
}
- (void) snprintf (stripe_size_str, 20, "%ld",
- (local->fctx) ? local->fctx->stripe_size : 0);
+ (void) snprintf (stripe_size_str, 20, "%"PRId64,
+ (long long) (local->fctx) ? local->fctx->stripe_size : 0);
/* extra bytes for decorations (brackets and <>'s) */
padding = strlen (this->name) + strlen (STRIPE_PATHINFO_HEADER)
@@ -193,6 +282,7 @@ stripe_fill_pathinfo_xattr (xlator_t *this, stripe_local_t *local,
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"Cannot aggregate pathinfo list");
+ GF_FREE(pathinfo_serz);
goto out;
}
@@ -415,7 +505,7 @@ set_default_block_size (stripe_private_t *priv, char *num)
GF_VALIDATE_OR_GOTO (THIS->name, num, out);
- if (gf_string2bytesize (num, &priv->block_size) != 0) {
+ if (gf_string2bytesize_uint64 (num, &priv->block_size) != 0) {
gf_log (THIS->name, GF_LOG_ERROR,
"invalid number format \"%s\"", num);
goto out;
@@ -465,7 +555,7 @@ set_stripe_block_size (xlator_t *this, stripe_private_t *priv, char *data)
if (ret)
goto out;
}
- if (gf_string2bytesize (num, &stripe_opt->block_size) != 0) {
+ if (gf_string2bytesize_uint64 (num, &stripe_opt->block_size) != 0) {
gf_log (this->name, GF_LOG_ERROR,
"invalid number format \"%s\"", num);
goto out;
@@ -585,4 +675,3 @@ uncoalesced_size(off_t size, uint64_t stripe_size, int stripe_count,
return size;
}
-
diff --git a/xlators/cluster/stripe/src/stripe-mem-types.h b/xlators/cluster/stripe/src/stripe-mem-types.h
index e05ba0c29..e9ac9cf46 100644
--- a/xlators/cluster/stripe/src/stripe-mem-types.h
+++ b/xlators/cluster/stripe/src/stripe-mem-types.h
@@ -20,6 +20,7 @@ enum gf_stripe_mem_types_ {
gf_stripe_mt_stripe_fd_ctx_t,
gf_stripe_mt_char,
gf_stripe_mt_int8_t,
+ gf_stripe_mt_int32_t,
gf_stripe_mt_xlator_t,
gf_stripe_mt_stripe_private_t,
gf_stripe_mt_stripe_options,
diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c
index 76d912a09..0ebea8168 100644
--- a/xlators/cluster/stripe/src/stripe.c
+++ b/xlators/cluster/stripe/src/stripe.c
@@ -1053,6 +1053,9 @@ stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
op_errno = ENOMEM;
goto err;
}
+
+ frame->local = local;
+
local->op_ret = -1;
loc_copy (&local->loc, oldloc);
loc_copy (&local->loc2, newloc);
@@ -1066,8 +1069,6 @@ stripe_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
local->fctx = fctx;
}
- frame->local = local;
-
STACK_WIND (frame, stripe_first_rename_cbk, trav->xlator,
trav->xlator->fops->rename, oldloc, newloc, NULL);
@@ -2879,15 +2880,15 @@ stripe_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict
goto err;
}
+ frame->local = local;
+
inode_ctx_get(fd->inode, this, (uint64_t *) &fctx);
if (!fctx) {
op_errno = EINVAL;
goto err;
}
local->fctx = fctx;
-
local->op_ret = -1;
- frame->local = local;
local->call_count = priv->child_count;
while (trav) {
@@ -3774,6 +3775,502 @@ err:
int32_t
+stripe_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t callcnt = 0;
+ stripe_local_t *local = NULL;
+ stripe_local_t *mlocal = NULL;
+ call_frame_t *prev = NULL;
+ call_frame_t *mframe = NULL;
+
+ if (!this || !frame || !frame->local || !cookie) {
+ gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
+ goto out;
+ }
+
+ prev = cookie;
+ local = frame->local;
+ mframe = local->orig_frame;
+ mlocal = mframe->local;
+
+ LOCK(&frame->lock);
+ {
+ callcnt = ++mlocal->call_count;
+
+ if (op_ret == 0) {
+ mlocal->post_buf = *postbuf;
+ mlocal->pre_buf = *prebuf;
+
+ mlocal->prebuf_blocks += prebuf->ia_blocks;
+ mlocal->postbuf_blocks += postbuf->ia_blocks;
+
+ correct_file_size(prebuf, mlocal->fctx, prev);
+ correct_file_size(postbuf, mlocal->fctx, prev);
+
+ if (mlocal->prebuf_size < prebuf->ia_size)
+ mlocal->prebuf_size = prebuf->ia_size;
+ if (mlocal->postbuf_size < postbuf->ia_size)
+ mlocal->postbuf_size = postbuf->ia_size;
+ }
+
+ /* return the first failure */
+ if (mlocal->op_ret == 0) {
+ mlocal->op_ret = op_ret;
+ mlocal->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
+ mlocal->pre_buf.ia_size = mlocal->prebuf_size;
+ mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
+ mlocal->post_buf.ia_size = mlocal->postbuf_size;
+ mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
+
+ STRIPE_STACK_UNWIND (fallocate, mframe, mlocal->op_ret,
+ mlocal->op_errno, &mlocal->pre_buf,
+ &mlocal->post_buf, NULL);
+ }
+out:
+ STRIPE_STACK_DESTROY(frame);
+ return 0;
+}
+
+int32_t
+stripe_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ stripe_local_t *local = NULL;
+ stripe_fd_ctx_t *fctx = NULL;
+ int32_t op_errno = 1;
+ int32_t idx = 0;
+ int32_t offset_offset = 0;
+ int32_t remaining_size = 0;
+ off_t fill_size = 0;
+ uint64_t stripe_size = 0;
+ uint64_t tmp_fctx = 0;
+ off_t dest_offset = 0;
+ call_frame_t *fframe = NULL;
+ stripe_local_t *flocal = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+ VALIDATE_OR_GOTO (fd->inode, err);
+
+ inode_ctx_get (fd->inode, this, &tmp_fctx);
+ if (!tmp_fctx) {
+ op_errno = EINVAL;
+ goto err;
+ }
+ fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+ stripe_size = fctx->stripe_size;
+
+ STRIPE_VALIDATE_FCTX (fctx, err);
+
+ remaining_size = len;
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ frame->local = local;
+ local->stripe_size = stripe_size;
+ local->fctx = fctx;
+
+ if (!stripe_size) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Wrong stripe size for the file");
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ while (1) {
+ fframe = copy_frame(frame);
+ flocal = mem_get0(this->local_pool);
+ if (!flocal) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ flocal->orig_frame = frame;
+ fframe->local = flocal;
+
+ /* send fallocate request to the associated child node */
+ idx = (((offset + offset_offset) /
+ local->stripe_size) % fctx->stripe_count);
+
+ fill_size = (local->stripe_size -
+ ((offset + offset_offset) % local->stripe_size));
+ if (fill_size > remaining_size)
+ fill_size = remaining_size;
+
+ remaining_size -= fill_size;
+
+ local->wind_count++;
+ if (remaining_size == 0)
+ local->unwind = 1;
+
+ dest_offset = offset + offset_offset;
+ if (fctx->stripe_coalesce)
+ dest_offset = coalesced_offset(dest_offset,
+ local->stripe_size, fctx->stripe_count);
+
+ /*
+ * TODO: Create a separate handler for coalesce mode that sends a
+ * single fallocate per-child (since the ranges are linear).
+ */
+ STACK_WIND(fframe, stripe_fallocate_cbk, fctx->xl_array[idx],
+ fctx->xl_array[idx]->fops->fallocate, fd, mode,
+ dest_offset, fill_size, xdata);
+
+ offset_offset += fill_size;
+ if (remaining_size == 0)
+ break;
+ }
+
+ return 0;
+err:
+ if (fframe)
+ STRIPE_STACK_DESTROY(fframe);
+
+ STRIPE_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+
+int32_t
+stripe_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t callcnt = 0;
+ stripe_local_t *local = NULL;
+ stripe_local_t *mlocal = NULL;
+ call_frame_t *prev = NULL;
+ call_frame_t *mframe = NULL;
+
+ if (!this || !frame || !frame->local || !cookie) {
+ gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
+ goto out;
+ }
+
+ prev = cookie;
+ local = frame->local;
+ mframe = local->orig_frame;
+ mlocal = mframe->local;
+
+ LOCK(&frame->lock);
+ {
+ callcnt = ++mlocal->call_count;
+
+ if (op_ret == 0) {
+ mlocal->post_buf = *postbuf;
+ mlocal->pre_buf = *prebuf;
+
+ mlocal->prebuf_blocks += prebuf->ia_blocks;
+ mlocal->postbuf_blocks += postbuf->ia_blocks;
+
+ correct_file_size(prebuf, mlocal->fctx, prev);
+ correct_file_size(postbuf, mlocal->fctx, prev);
+
+ if (mlocal->prebuf_size < prebuf->ia_size)
+ mlocal->prebuf_size = prebuf->ia_size;
+ if (mlocal->postbuf_size < postbuf->ia_size)
+ mlocal->postbuf_size = postbuf->ia_size;
+ }
+
+ /* return the first failure */
+ if (mlocal->op_ret == 0) {
+ mlocal->op_ret = op_ret;
+ mlocal->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
+ mlocal->pre_buf.ia_size = mlocal->prebuf_size;
+ mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
+ mlocal->post_buf.ia_size = mlocal->postbuf_size;
+ mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
+
+ STRIPE_STACK_UNWIND (discard, mframe, mlocal->op_ret,
+ mlocal->op_errno, &mlocal->pre_buf,
+ &mlocal->post_buf, NULL);
+ }
+out:
+ STRIPE_STACK_DESTROY(frame);
+ return 0;
+}
+
+int32_t
+stripe_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ stripe_local_t *local = NULL;
+ stripe_fd_ctx_t *fctx = NULL;
+ int32_t op_errno = 1;
+ int32_t idx = 0;
+ int32_t offset_offset = 0;
+ int32_t remaining_size = 0;
+ off_t fill_size = 0;
+ uint64_t stripe_size = 0;
+ uint64_t tmp_fctx = 0;
+ off_t dest_offset = 0;
+ call_frame_t *fframe = NULL;
+ stripe_local_t *flocal = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+ VALIDATE_OR_GOTO (fd->inode, err);
+
+ inode_ctx_get (fd->inode, this, &tmp_fctx);
+ if (!tmp_fctx) {
+ op_errno = EINVAL;
+ goto err;
+ }
+ fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+ stripe_size = fctx->stripe_size;
+
+ STRIPE_VALIDATE_FCTX (fctx, err);
+
+ remaining_size = len;
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ frame->local = local;
+ local->stripe_size = stripe_size;
+ local->fctx = fctx;
+
+ if (!stripe_size) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Wrong stripe size for the file");
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ while (1) {
+ fframe = copy_frame(frame);
+ flocal = mem_get0(this->local_pool);
+ if (!flocal) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ flocal->orig_frame = frame;
+ fframe->local = flocal;
+
+ /* send discard request to the associated child node */
+ idx = (((offset + offset_offset) /
+ local->stripe_size) % fctx->stripe_count);
+
+ fill_size = (local->stripe_size -
+ ((offset + offset_offset) % local->stripe_size));
+ if (fill_size > remaining_size)
+ fill_size = remaining_size;
+
+ remaining_size -= fill_size;
+
+ local->wind_count++;
+ if (remaining_size == 0)
+ local->unwind = 1;
+
+ dest_offset = offset + offset_offset;
+ if (fctx->stripe_coalesce)
+ dest_offset = coalesced_offset(dest_offset,
+ local->stripe_size, fctx->stripe_count);
+
+ /*
+ * TODO: Create a separate handler for coalesce mode that sends a
+ * single discard per-child (since the ranges are linear).
+ */
+ STACK_WIND(fframe, stripe_discard_cbk, fctx->xl_array[idx],
+ fctx->xl_array[idx]->fops->discard, fd, dest_offset,
+ fill_size, xdata);
+
+ offset_offset += fill_size;
+ if (remaining_size == 0)
+ break;
+ }
+
+ return 0;
+err:
+ if (fframe)
+ STRIPE_STACK_DESTROY(fframe);
+
+ STRIPE_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int32_t
+stripe_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t callcnt = 0;
+ stripe_local_t *local = NULL;
+ stripe_local_t *mlocal = NULL;
+ call_frame_t *prev = NULL;
+ call_frame_t *mframe = NULL;
+
+ GF_ASSERT (frame);
+
+ if (!this || !frame->local || !cookie) {
+ gf_log ("stripe", GF_LOG_DEBUG, "possible NULL deref");
+ goto out;
+ }
+
+ prev = cookie;
+ local = frame->local;
+ mframe = local->orig_frame;
+ mlocal = mframe->local;
+
+ LOCK(&frame->lock);
+ {
+ callcnt = ++mlocal->call_count;
+
+ if (op_ret == 0) {
+ mlocal->post_buf = *postbuf;
+ mlocal->pre_buf = *prebuf;
+
+ mlocal->prebuf_blocks += prebuf->ia_blocks;
+ mlocal->postbuf_blocks += postbuf->ia_blocks;
+
+ correct_file_size(prebuf, mlocal->fctx, prev);
+ correct_file_size(postbuf, mlocal->fctx, prev);
+
+ if (mlocal->prebuf_size < prebuf->ia_size)
+ mlocal->prebuf_size = prebuf->ia_size;
+ if (mlocal->postbuf_size < postbuf->ia_size)
+ mlocal->postbuf_size = postbuf->ia_size;
+ }
+
+ /* return the first failure */
+ if (mlocal->op_ret == 0) {
+ mlocal->op_ret = op_ret;
+ mlocal->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if ((callcnt == mlocal->wind_count) && mlocal->unwind) {
+ mlocal->pre_buf.ia_size = mlocal->prebuf_size;
+ mlocal->pre_buf.ia_blocks = mlocal->prebuf_blocks;
+ mlocal->post_buf.ia_size = mlocal->postbuf_size;
+ mlocal->post_buf.ia_blocks = mlocal->postbuf_blocks;
+
+ STRIPE_STACK_UNWIND (zerofill, mframe, mlocal->op_ret,
+ mlocal->op_errno, &mlocal->pre_buf,
+ &mlocal->post_buf, NULL);
+ }
+out:
+ STRIPE_STACK_DESTROY(frame);
+ return 0;
+}
+
+int32_t
+stripe_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ stripe_local_t *local = NULL;
+ stripe_fd_ctx_t *fctx = NULL;
+ int32_t op_errno = 1;
+ int32_t idx = 0;
+ int32_t offset_offset = 0;
+ int32_t remaining_size = 0;
+ off_t fill_size = 0;
+ uint64_t stripe_size = 0;
+ uint64_t tmp_fctx = 0;
+ off_t dest_offset = 0;
+ call_frame_t *fframe = NULL;
+ stripe_local_t *flocal = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+ VALIDATE_OR_GOTO (fd->inode, err);
+
+ inode_ctx_get (fd->inode, this, &tmp_fctx);
+ if (!tmp_fctx) {
+ op_errno = EINVAL;
+ goto err;
+ }
+ fctx = (stripe_fd_ctx_t *)(long)tmp_fctx;
+ stripe_size = fctx->stripe_size;
+
+ STRIPE_VALIDATE_FCTX (fctx, err);
+
+ remaining_size = len;
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ frame->local = local;
+ local->stripe_size = stripe_size;
+ local->fctx = fctx;
+
+ if (!stripe_size) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Wrong stripe size for the file");
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ while (1) {
+ fframe = copy_frame(frame);
+ flocal = mem_get0(this->local_pool);
+ if (!flocal) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ flocal->orig_frame = frame;
+ fframe->local = flocal;
+
+ idx = (((offset + offset_offset) /
+ local->stripe_size) % fctx->stripe_count);
+
+ fill_size = (local->stripe_size -
+ ((offset + offset_offset) % local->stripe_size));
+ if (fill_size > remaining_size)
+ fill_size = remaining_size;
+
+ remaining_size -= fill_size;
+
+ local->wind_count++;
+ if (remaining_size == 0)
+ local->unwind = 1;
+
+ dest_offset = offset + offset_offset;
+ if (fctx->stripe_coalesce)
+ dest_offset = coalesced_offset(dest_offset,
+ local->stripe_size,
+ fctx->stripe_count);
+
+ STACK_WIND(fframe, stripe_zerofill_cbk, fctx->xl_array[idx],
+ fctx->xl_array[idx]->fops->zerofill, fd,
+ dest_offset, fill_size, xdata);
+ offset_offset += fill_size;
+ if (remaining_size == 0)
+ break;
+ }
+
+ return 0;
+err:
+ if (fframe)
+ STRIPE_STACK_DESTROY(fframe);
+
+ STRIPE_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int32_t
stripe_release (xlator_t *this, fd_t *fd)
{
return 0;
@@ -3809,6 +4306,7 @@ notify (xlator_t *this, int32_t event, void *data, ...)
stripe_private_t *priv = NULL;
int down_client = 0;
int i = 0;
+ gf_boolean_t heard_from_all_children = _gf_false;
if (!this)
return 0;
@@ -3820,30 +4318,34 @@ notify (xlator_t *this, int32_t event, void *data, ...)
switch (event)
{
case GF_EVENT_CHILD_UP:
- case GF_EVENT_CHILD_CONNECTING:
{
/* get an index number to set */
for (i = 0; i < priv->child_count; i++) {
if (data == priv->xl_array[i])
break;
}
- priv->state[i] = 1;
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->state[i])
- down_client++;
+
+ if (priv->child_count == i) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "got GF_EVENT_CHILD_UP bad subvolume %s",
+ data? ((xlator_t *)data)->name: NULL);
+ break;
}
LOCK (&priv->lock);
{
- priv->nodes_down = down_client;
if (data == FIRST_CHILD (this))
priv->first_child_down = 0;
- if (!priv->nodes_down)
- default_notify (this, event, data);
+ priv->last_event[i] = event;
}
UNLOCK (&priv->lock);
}
break;
+ case GF_EVENT_CHILD_CONNECTING:
+ {
+ // 'CONNECTING' doesn't ensure its CHILD_UP, so do nothing
+ goto out;
+ }
case GF_EVENT_CHILD_DOWN:
{
/* get an index number to set */
@@ -3851,20 +4353,19 @@ notify (xlator_t *this, int32_t event, void *data, ...)
if (data == priv->xl_array[i])
break;
}
- priv->state[i] = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->state[i])
- down_client++;
+
+ if (priv->child_count == i) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "got GF_EVENT_CHILD_DOWN bad subvolume %s",
+ data? ((xlator_t *)data)->name: NULL);
+ break;
}
LOCK (&priv->lock);
{
- priv->nodes_down = down_client;
-
if (data == FIRST_CHILD (this))
priv->first_child_down = 1;
- if (priv->nodes_down)
- default_notify (this, event, data);
+ priv->last_event[i] = event;
}
UNLOCK (&priv->lock);
}
@@ -3874,10 +4375,30 @@ notify (xlator_t *this, int32_t event, void *data, ...)
{
/* */
default_notify (this, event, data);
+ goto out;
}
break;
}
+ // Consider child as down if it's last_event is not CHILD_UP
+ for (i = 0, down_client = 0; i < priv->child_count; i++)
+ if (priv->last_event[i] != GF_EVENT_CHILD_UP)
+ down_client++;
+
+ LOCK (&priv->lock);
+ {
+ priv->nodes_down = down_client;
+ }
+ UNLOCK (&priv->lock);
+
+ heard_from_all_children = _gf_true;
+ for (i = 0; i < priv->child_count; i++)
+ if (!priv->last_event[i])
+ heard_from_all_children = _gf_false;
+
+ if (heard_from_all_children)
+ default_notify (this, event, data);
+out:
return 0;
}
@@ -3923,6 +4444,37 @@ stripe_setxattr_cbk (call_frame_t *frame, void *cookie,
return 0;
}
+#ifdef HAVE_BD_XLATOR
+int
+stripe_is_bd (dict_t *this, char *key, data_t *value, void *data)
+{
+ gf_boolean_t *is_bd = data;
+
+ if (data == NULL)
+ return 0;
+
+ if (XATTR_IS_BD (key))
+ *is_bd = _gf_true;
+
+ return 0;
+}
+
+static inline gf_boolean_t
+stripe_setxattr_is_bd (dict_t *dict)
+{
+ gf_boolean_t is_bd = _gf_false;
+
+ if (dict == NULL)
+ goto out;
+
+ dict_foreach (dict, stripe_is_bd, &is_bd);
+out:
+ return is_bd;
+}
+#else
+#define stripe_setxattr_is_bd(dict) _gf_false
+#endif
+
int
stripe_setxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *dict, int flags, dict_t *xdata)
@@ -3932,6 +4484,7 @@ stripe_setxattr (call_frame_t *frame, xlator_t *this,
stripe_private_t *priv = NULL;
stripe_local_t *local = NULL;
int i = 0;
+ gf_boolean_t is_bd = _gf_false;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
@@ -3954,11 +4507,15 @@ stripe_setxattr (call_frame_t *frame, xlator_t *this,
local->wind_count = priv->child_count;
local->op_ret = local->op_errno = 0;
+ is_bd = stripe_setxattr_is_bd (dict);
+
/**
* Set xattrs for directories on all subvolumes. Additionally
- * this power is only given to a special client.
+ * this power is only given to a special client. Bd xlator
+ * also needs xattrs for regular files (ie LVs)
*/
- if ((frame->root->pid == -1) && IA_ISDIR (loc->inode->ia_type)) {
+ if (((frame->root->pid == GF_CLIENT_PID_GSYNCD) &&
+ IA_ISDIR (loc->inode->ia_type)) || is_bd) {
for (i = 0; i < priv->child_count; i++, trav = trav->next) {
STACK_WIND (frame, stripe_setxattr_cbk,
trav->xlator, trav->xlator->fops->setxattr,
@@ -3987,12 +4544,111 @@ stripe_fsetxattr_cbk (call_frame_t *frame, void *cookie,
return 0;
}
+
+int
+stripe_is_special_key (dict_t *this,
+ char *key,
+ data_t *value,
+ void *data)
+{
+ gf_boolean_t *is_special = NULL;
+
+ if (data == NULL) {
+ goto out;
+ }
+
+ is_special = data;
+
+ if (XATTR_IS_LOCKINFO (key) || XATTR_IS_BD (key))
+ *is_special = _gf_true;
+
+out:
+ return 0;
+}
+
+int32_t
+stripe_fsetxattr_everyone_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *xdata)
+{
+ int call_count = 0;
+ stripe_local_t *local = NULL;
+
+ local = frame->local;
+
+ LOCK (&frame->lock);
+ {
+ call_count = --local->wind_count;
+
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&frame->lock);
+
+ if (call_count == 0) {
+ STRIPE_STACK_UNWIND (fsetxattr, frame, local->op_ret,
+ local->op_errno, NULL);
+ }
+ return 0;
+}
+
+int
+stripe_fsetxattr_to_everyone (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ dict_t *dict, int flags, dict_t *xdata)
+{
+ xlator_list_t *trav = NULL;
+ stripe_private_t *priv = NULL;
+ int ret = -1;
+ stripe_local_t *local = NULL;
+
+ priv = this->private;
+
+ local = mem_get0 (this->local_pool);
+ if (local == NULL) {
+ goto out;
+ }
+
+ frame->local = local;
+
+ local->wind_count = priv->child_count;
+
+ trav = this->children;
+
+ while (trav) {
+ STACK_WIND (frame, stripe_fsetxattr_everyone_cbk,
+ trav->xlator, trav->xlator->fops->fsetxattr,
+ fd, dict, flags, xdata);
+ trav = trav->next;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static inline gf_boolean_t
+stripe_fsetxattr_is_special (dict_t *dict)
+{
+ gf_boolean_t is_spl = _gf_false;
+
+ if (dict == NULL) {
+ goto out;
+ }
+
+ dict_foreach (dict, stripe_is_special_key, &is_spl);
+
+out:
+ return is_spl;
+}
+
int
stripe_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
dict_t *dict, int flags, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
+ int32_t op_ret = -1, ret = -1, op_errno = EINVAL;
+ gf_boolean_t is_spl = _gf_false;
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
@@ -4001,12 +4657,25 @@ stripe_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
GF_IF_INTERNAL_XATTR_GOTO ("trusted.*stripe*", dict,
op_errno, err);
+ is_spl = stripe_fsetxattr_is_special (dict);
+ if (is_spl) {
+ ret = stripe_fsetxattr_to_everyone (frame, this, fd, dict,
+ flags, xdata);
+ if (ret < 0) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ goto out;
+ }
+
STACK_WIND (frame, stripe_fsetxattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fsetxattr,
fd, dict, flags, xdata);
+out:
return 0;
- err:
+err:
STRIPE_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL);
return 0;
}
@@ -4217,7 +4886,7 @@ unlock:
if (!local_entry)
break;
- if (!IA_ISREG (local_entry->d_stat.ia_type)) {
+ if (!IA_ISREG (local_entry->d_stat.ia_type) || !local_entry->inode) {
LOCK (&frame->lock);
{
local->wind_count--;
@@ -4411,7 +5080,7 @@ reconfigure (xlator_t *this, dict_t *options)
goto unlock;
}
- if (gf_string2bytesize (opt->default_value, &priv->block_size)){
+ if (gf_string2bytesize_uint64 (opt->default_value, &priv->block_size)){
gf_log (this->name, GF_LOG_ERROR,
"Unable to set default block-size ");
ret = -1;
@@ -4486,9 +5155,9 @@ init (xlator_t *this)
if (!priv->xl_array)
goto out;
- priv->state = GF_CALLOC (count, sizeof (int8_t),
- gf_stripe_mt_int8_t);
- if (!priv->state)
+ priv->last_event = GF_CALLOC (count, sizeof (int),
+ gf_stripe_mt_int32_t);
+ if (!priv->last_event)
goto out;
priv->child_count = count;
@@ -4518,7 +5187,7 @@ init (xlator_t *this)
ret = -1;
goto unlock;
}
- if (gf_string2bytesize (opt->default_value, &priv->block_size)){
+ if (gf_string2bytesize_uint64 (opt->default_value, &priv->block_size)){
gf_log (this->name, GF_LOG_ERROR,
"Unable to set default block-size ");
ret = -1;
@@ -4589,6 +5258,7 @@ fini (xlator_t *this)
trav = trav->next;
GF_FREE (prev);
}
+ GF_FREE (priv->last_event);
LOCK_DESTROY (&priv->lock);
GF_FREE (priv);
}
@@ -4687,8 +5357,8 @@ stripe_vgetxattr_cbk (call_frame_t *frame, void *cookie,
int32_t callcnt = 0;
int32_t ret = -1;
long cky = 0;
- char *xattr_val = NULL;
- char *xattr_serz = NULL;
+ void *xattr_val = NULL;
+ void *xattr_serz = NULL;
stripe_xattr_sort_t *xattr = NULL;
dict_t *stripe_xattr = NULL;
@@ -4719,18 +5389,20 @@ stripe_vgetxattr_cbk (call_frame_t *frame, void *cookie,
gf_stripe_mt_xattr_sort_t);
if (local->xattr_list) {
- ret = dict_get_str (dict, local->xsel, &xattr_val);
- if (ret)
- goto out;
-
xattr = local->xattr_list + (int32_t) cky;
- xattr_val = gf_strdup (xattr_val);
+ ret = dict_get_ptr_and_len (dict, local->xsel,
+ &xattr_val,
+ &xattr->xattr_len);
+ if (xattr->xattr_len == 0)
+ goto out;
+
xattr->pos = cky;
- xattr->xattr_value = xattr_val;
- xattr->xattr_len = strlen (xattr_val);
+ xattr->xattr_value = gf_memdup (xattr_val,
+ xattr->xattr_len);
- local->xattr_total_len += xattr->xattr_len + 1;
+ if (xattr->xattr_value != NULL)
+ local->xattr_total_len += xattr->xattr_len + 1;
}
}
out:
@@ -4747,19 +5419,24 @@ stripe_vgetxattr_cbk (call_frame_t *frame, void *cookie,
/* select filler based on ->xsel */
if (XATTR_IS_PATHINFO (local->xsel))
ret = stripe_fill_pathinfo_xattr (this, local,
+ (char **)&xattr_serz);
+ else if (XATTR_IS_LOCKINFO (local->xsel)) {
+ ret = stripe_fill_lockinfo_xattr (this, local,
&xattr_serz);
- else {
+ } else {
gf_log (this->name, GF_LOG_WARNING,
"Unknown xattr in xattr request");
goto unwind;
}
if (!ret) {
- ret = dict_set_dynstr (stripe_xattr, local->xsel,
- xattr_serz);
+ ret = dict_set_dynptr (stripe_xattr, local->xsel,
+ xattr_serz,
+ local->xattr_total_len);
if (ret)
gf_log (this->name, GF_LOG_ERROR,
- "Can't set %s key in dict", local->xsel);
+ "Can't set %s key in dict",
+ local->xsel);
}
unwind:
@@ -4810,7 +5487,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this,
if (name && (strcmp (GF_XATTR_MARKER_KEY, name) == 0)
- && (-1 == frame->root->pid)) {
+ && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
local->marker.call_count = priv->child_count;
sub_volumes = alloca ( priv->child_count *
@@ -4825,7 +5502,8 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this,
if (cluster_getmarkerattr (frame, this, loc, name,
local, stripe_getxattr_unwind,
sub_volumes, priv->child_count,
- MARKER_UUID_TYPE, priv->vol_uuid)) {
+ MARKER_UUID_TYPE, marker_uuid_default_gauge,
+ priv->vol_uuid)) {
op_errno = EINVAL;
goto err;
}
@@ -4847,9 +5525,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
- if (name &&
- ((strncmp (name, GF_XATTR_PATHINFO_KEY,
- strlen (GF_XATTR_PATHINFO_KEY)) == 0))) {
+ if (name && (XATTR_IS_PATHINFO (name))) {
if (IA_ISREG (loc->inode->ia_type)) {
ret = inode_ctx_get (loc->inode, this,
(uint64_t *) &local->fctx);
@@ -4881,7 +5557,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this,
if (name &&(*priv->vol_uuid)) {
if ((match_uuid_local (name, priv->vol_uuid) == 0)
- && (-1 == frame->root->pid)) {
+ && (GF_CLIENT_PID_GSYNCD == frame->root->pid)) {
if (!IA_FILE_OR_DIR (loc->inode->ia_type))
local->marker.call_count = 1;
@@ -4904,6 +5580,7 @@ stripe_getxattr (call_frame_t *frame, xlator_t *this,
sub_volumes,
local->marker.call_count,
MARKER_XTIME_TYPE,
+ marker_xtime_default_gauge,
priv->vol_uuid)) {
op_errno = EINVAL;
goto err;
@@ -4924,6 +5601,78 @@ err:
return 0;
}
+static inline gf_boolean_t
+stripe_is_special_xattr (const char *name)
+{
+ gf_boolean_t is_spl = _gf_false;
+
+ if (!name) {
+ goto out;
+ }
+
+ if (!strncmp (name, GF_XATTR_LOCKINFO_KEY,
+ strlen (GF_XATTR_LOCKINFO_KEY))
+ || XATTR_IS_PATHINFO (name))
+ is_spl = _gf_true;
+out:
+ return is_spl;
+}
+
+int32_t
+stripe_fgetxattr_from_everyone (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ stripe_local_t *local = NULL;
+ stripe_private_t *priv = NULL;
+ int32_t ret = -1, op_errno = 0;
+ int i = 0;
+ xlator_list_t *trav = NULL;
+
+ priv = this->private;
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->op_ret = -1;
+ frame->local = local;
+
+ strncpy (local->xsel, name, strlen (name));
+ local->nallocs = local->wind_count = priv->child_count;
+
+ for (i = 0, trav = this->children; i < priv->child_count; i++,
+ trav = trav->next) {
+ STACK_WIND_COOKIE (frame, stripe_vgetxattr_cbk,
+ (void *) (long) i, trav->xlator,
+ trav->xlator->fops->fgetxattr,
+ fd, name, xdata);
+ }
+
+ return 0;
+
+err:
+ STACK_UNWIND_STRICT (fgetxattr, frame, -1, op_errno, NULL, NULL);
+ return ret;
+}
+
+int32_t
+stripe_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ if (stripe_is_special_xattr (name)) {
+ stripe_fgetxattr_from_everyone (frame, this, fd, name, xdata);
+ goto out;
+ }
+
+ STACK_WIND (frame, stripe_internal_getxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+
+out:
+ return 0;
+}
+
int32_t
@@ -5000,9 +5749,13 @@ struct xlator_fops fops = {
.setxattr = stripe_setxattr,
.fsetxattr = stripe_fsetxattr,
.getxattr = stripe_getxattr,
+ .fgetxattr = stripe_fgetxattr,
.removexattr = stripe_removexattr,
.fremovexattr = stripe_fremovexattr,
.readdirp = stripe_readdirp,
+ .fallocate = stripe_fallocate,
+ .discard = stripe_discard,
+ .zerofill = stripe_zerofill,
};
struct xlator_cbks cbks = {
@@ -5028,10 +5781,10 @@ struct volume_options options[] = {
},
{ .key = {"coalesce"},
.type = GF_OPTION_TYPE_BOOL,
- .default_value = "false",
- .description = "Enable coalesce mode to flatten striped files as "
- "stored on the server (i.e., eliminate holes caused "
- "by the traditional format)."
+ .default_value = "true",
+ .description = "Enable/Disable coalesce mode to flatten striped "
+ "files as stored on the server (i.e., eliminate holes "
+ "caused by the traditional format)."
},
{ .key = {NULL} },
};
diff --git a/xlators/cluster/stripe/src/stripe.h b/xlators/cluster/stripe/src/stripe.h
index a440f87ba..5673d18f3 100644
--- a/xlators/cluster/stripe/src/stripe.h
+++ b/xlators/cluster/stripe/src/stripe.h
@@ -73,9 +73,9 @@
} while (0)
typedef struct stripe_xattr_sort {
- int32_t pos;
- int32_t xattr_len;
- char *xattr_value;
+ int pos;
+ int xattr_len;
+ char *xattr_value;
} stripe_xattr_sort_t;
/**
@@ -98,8 +98,8 @@ struct stripe_private {
gf_lock_t lock;
uint8_t nodes_down;
int8_t first_child_down;
+ int *last_event;
int8_t child_count;
- int8_t *state; /* Current state of child node */
gf_boolean_t xattr_supported; /* default yes */
gf_boolean_t coalesce;
char vol_uuid[UUID_SIZE + 1];
@@ -262,6 +262,9 @@ int32_t stripe_xattr_aggregate (char *buffer, stripe_local_t *local,
off_t coalesced_offset(off_t offset, uint64_t stripe_size, int stripe_count);
off_t uncoalesced_size(off_t size, uint64_t stripe_size, int stripe_count,
int stripe_index);
+int32_t
+stripe_fill_lockinfo_xattr (xlator_t *this, stripe_local_t *local,
+ void **xattr_serz);
/*
* Adjust the size attribute for files if coalesce is enabled.
diff --git a/xlators/debug/error-gen/src/Makefile.am b/xlators/debug/error-gen/src/Makefile.am
index 2de23725e..5075c59a8 100644
--- a/xlators/debug/error-gen/src/Makefile.am
+++ b/xlators/debug/error-gen/src/Makefile.am
@@ -2,7 +2,7 @@
xlator_LTLIBRARIES = error-gen.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug
-error_gen_la_LDFLAGS = -module -avoidversion
+error_gen_la_LDFLAGS = -module -avoid-version
error_gen_la_SOURCES = error-gen.c
error_gen_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/debug/error-gen/src/error-gen.c b/xlators/debug/error-gen/src/error-gen.c
index 5778a9472..ec0874b35 100644
--- a/xlators/debug/error-gen/src/error-gen.c
+++ b/xlators/debug/error-gen/src/error-gen.c
@@ -14,6 +14,7 @@
#include "xlator.h"
#include "error-gen.h"
+#include "statedump.h"
sys_error_t error_no_list[] = {
[GF_FOP_LOOKUP] = { .error_no_count = 4,
@@ -81,9 +82,10 @@ sys_error_t error_no_list[] = {
[GF_FOP_READ] = { .error_no_count = 5,
.error_no = {EINVAL,EBADF,EFAULT,EISDIR,
ENAMETOOLONG}},
- [GF_FOP_WRITE] = { .error_no_count = 5,
+ [GF_FOP_WRITE] = { .error_no_count = 7,
.error_no = {EINVAL,EBADF,EFAULT,EISDIR,
- ENAMETOOLONG}},
+ ENAMETOOLONG,ENOSPC,
+ GF_ERROR_SHORT_WRITE}},
[GF_FOP_STATFS] = {.error_no_count = 10,
.error_no = {EACCES,EBADF,EFAULT,EINTR,
EIO,ENAMETOOLONG,ENOENT,
@@ -236,6 +238,8 @@ conv_errno_to_int (char **error_no)
return EINTR;
else if (!strcmp ((*error_no), "EFBIG"))
return EFBIG;
+ else if (!strcmp((*error_no), "GF_ERROR_SHORT_WRITE"))
+ return GF_ERROR_SHORT_WRITE;
else
return EAGAIN;
}
@@ -1104,10 +1108,26 @@ error_gen_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
if (enable)
op_errno = error_gen (this, GF_FOP_WRITE);
- if (op_errno) {
+ if (op_errno == GF_ERROR_SHORT_WRITE) {
+ struct iovec *shortvec;
+
+ /*
+ * A short write error returns some value less than what was
+ * requested from a write. To simulate this, replace the vector
+ * with one half the size;
+ */
+ shortvec = iov_dup(vector, 1);
+ shortvec->iov_len /= 2;
+
+ STACK_WIND(frame, error_gen_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, shortvec, count,
+ off, flags, iobref, xdata);
+ GF_FREE(shortvec);
+ return 0;
+ } else if (op_errno) {
GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno));
STACK_UNWIND_STRICT (writev, frame, -1, op_errno, NULL, NULL, xdata);
- return 0;
+ return 0;
}
STACK_WIND (frame, error_gen_writev_cbk,
@@ -1934,6 +1954,85 @@ error_gen_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
return 0;
}
+static void
+error_gen_set_failure (eg_t *pvt, int percent)
+{
+ GF_ASSERT (pvt);
+
+ if (percent)
+ pvt->failure_iter_no = 100/percent;
+ else
+ pvt->failure_iter_no = 100/GF_FAILURE_DEFAULT;
+}
+
+static void
+error_gen_parse_fill_fops (eg_t *pvt, char *enable_fops)
+{
+ char *op_no_str = NULL;
+ int op_no = -1;
+ int i = 0;
+ xlator_t *this = THIS;
+ char *saveptr = NULL;
+
+ GF_ASSERT (pvt);
+ GF_ASSERT (this);
+
+ for (i = 0; i < GF_FOP_MAXVALUE; i++)
+ pvt->enable[i] = 0;
+
+ if (!enable_fops) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "All fops are enabled.");
+ for (i = 0; i < GF_FOP_MAXVALUE; i++)
+ pvt->enable[i] = 1;
+ } else {
+ op_no_str = strtok_r (enable_fops, ",", &saveptr);
+ while (op_no_str) {
+ op_no = get_fop_int (&op_no_str);
+ if (op_no == -1) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Wrong option value %s", op_no_str);
+ } else
+ pvt->enable[op_no] = 1;
+
+ op_no_str = strtok_r (NULL, ",", &saveptr);
+ }
+ }
+}
+
+int32_t
+error_gen_priv_dump (xlator_t *this)
+{
+ char key_prefix[GF_DUMP_MAX_BUF_LEN];
+ int ret = -1;
+ eg_t *conf = NULL;
+
+ if (!this)
+ goto out;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ ret = TRY_LOCK(&conf->lock);
+ if (ret != 0) {
+ return ret;
+ }
+
+ gf_proc_dump_add_section("xlator.debug.error-gen.%s.priv", this->name);
+ gf_proc_dump_build_key(key_prefix,"xlator.debug.error-gen","%s.priv",
+ this->name);
+
+ gf_proc_dump_write("op_count", "%d", conf->op_count);
+ gf_proc_dump_write("failure_iter_no", "%d", conf->failure_iter_no);
+ gf_proc_dump_write("error_no", "%s", conf->error_no);
+ gf_proc_dump_write("random_failure", "%d", conf->random_failure);
+
+ UNLOCK(&conf->lock);
+out:
+ return ret;
+}
+
int32_t
mem_acct_init (xlator_t *this)
{
@@ -1954,18 +2053,43 @@ mem_acct_init (xlator_t *this)
}
int
+reconfigure (xlator_t *this, dict_t *options)
+{
+ eg_t *pvt = NULL;
+ int32_t ret = 0;
+ char *error_enable_fops = NULL;
+ int32_t failure_percent_int = 0;
+
+ if (!this || !this->private)
+ goto out;
+
+ pvt = this->private;
+
+ GF_OPTION_RECONF ("error-no", pvt->error_no, options, str, out);
+
+ GF_OPTION_RECONF ("failure", failure_percent_int, options, int32,
+ out);
+
+ GF_OPTION_RECONF ("enable", error_enable_fops, options, str, out);
+
+ GF_OPTION_RECONF ("random-failure", pvt->random_failure, options,
+ bool, out);
+
+ error_gen_parse_fill_fops (pvt, error_enable_fops);
+ error_gen_set_failure (pvt, failure_percent_int);
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "reconfigure returning %d", ret);
+ return ret;
+}
+
+int
init (xlator_t *this)
{
eg_t *pvt = NULL;
- data_t *error_no = NULL;
- data_t *failure_percent = NULL;
- data_t *enable = NULL;
- gf_boolean_t random_failure = _gf_false;
int32_t ret = 0;
char *error_enable_fops = NULL;
- char *op_no_str = NULL;
- int op_no = -1;
- int i = 0;
int32_t failure_percent_int = 0;
if (!this->children || this->children->next) {
@@ -1980,79 +2104,34 @@ init (xlator_t *this)
"dangling volume. check volfile ");
}
- error_no = dict_get (this->options, "error-no");
- failure_percent = dict_get (this->options, "failure");
- enable = dict_get (this->options, "enable");
-
pvt = GF_CALLOC (1, sizeof (eg_t), gf_error_gen_mt_eg_t);
if (!pvt) {
- gf_log (this->name, GF_LOG_ERROR,
- "out of memory.");
ret = -1;
goto out;
}
LOCK_INIT (&pvt->lock);
- for (i = 0; i < GF_FOP_MAXVALUE; i++)
- pvt->enable[i] = 0;
- if (!error_no) {
- gf_log (this->name, GF_LOG_DEBUG,
- "error-no not specified.");
- } else {
- pvt->error_no = data_to_str (error_no);
- }
+ GF_OPTION_INIT ("error-no", pvt->error_no, str, out);
- if (!failure_percent) {
- gf_log (this->name, GF_LOG_DEBUG,
- "failure percent not specified.");
- pvt->failure_iter_no = 100/GF_FAILURE_DEFAULT;
- } else {
- failure_percent_int = data_to_int32 (failure_percent);
- if (failure_percent_int)
- pvt->failure_iter_no = 100/failure_percent_int;
- else
- pvt->failure_iter_no = 100/GF_FAILURE_DEFAULT;
- }
+ GF_OPTION_INIT ("failure", failure_percent_int, int32, out);
- if (!enable) {
- gf_log (this->name, GF_LOG_WARNING,
- "All fops are enabled.");
- for (i = 0; i < GF_FOP_MAXVALUE; i++)
- pvt->enable[i] = 1;
- } else {
- error_enable_fops = data_to_str (enable);
- op_no_str = error_enable_fops;
- while ((*error_enable_fops) != '\0') {
- error_enable_fops++;
- if (((*error_enable_fops) == ',') ||
- ((*error_enable_fops) == '\0')) {
- if ((*error_enable_fops) != '\0') {
- (*error_enable_fops) = '\0';
- error_enable_fops++;
- }
- op_no = get_fop_int (&op_no_str);
- if (op_no == -1) {
- gf_log (this->name, GF_LOG_WARNING,
- "Wrong option value %s",
- op_no_str);
- } else
- pvt->enable[op_no] = 1;
- op_no_str = error_enable_fops;
- }
- }
- }
+ GF_OPTION_INIT ("enable", error_enable_fops, str, out);
- random_failure = dict_get_str_boolean (this->options, "random-failure",
- _gf_false);
- pvt->random_failure = random_failure;
+ GF_OPTION_INIT ("random-failure", pvt->random_failure, bool, out);
+
+
+ error_gen_parse_fill_fops (pvt, error_enable_fops);
+ error_gen_set_failure (pvt, failure_percent_int);
this->private = pvt;
/* Give some seed value here */
srand (time(NULL));
out:
+ if (ret)
+ GF_FREE (pvt);
return ret;
}
@@ -2074,9 +2153,12 @@ fini (xlator_t *this)
return;
}
-struct xlator_fops cbks = {
+struct xlator_dumpops dumpops = {
+ .priv = error_gen_priv_dump,
};
+struct xlator_fops cbks;
+
struct xlator_fops fops = {
.lookup = error_gen_lookup,
.stat = error_gen_stat,
@@ -2124,17 +2206,27 @@ struct xlator_fops fops = {
struct volume_options options[] = {
{ .key = {"failure"},
- .type = GF_OPTION_TYPE_INT },
+ .type = GF_OPTION_TYPE_INT,
+ .description = "Percentage failure of operations when enabled.",
+ },
+
{ .key = {"error-no"},
.value = {"ENOENT","ENOTDIR","ENAMETOOLONG","EACCES","EBADF",
"EFAULT","ENOMEM","EINVAL","EIO","EEXIST","ENOSPC",
"EPERM","EROFS","EBUSY","EISDIR","ENOTEMPTY","EMLINK"
"ENODEV","EXDEV","EMFILE","ENFILE","ENOSYS","EINTR",
- "EFBIG","EAGAIN"},
- .type = GF_OPTION_TYPE_STR },
+ "EFBIG","EAGAIN","GF_ERROR_SHORT_WRITE"},
+ .type = GF_OPTION_TYPE_STR,
+ },
+
{ .key = {"random-failure"},
- .type = GF_OPTION_TYPE_BOOL},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ },
+
{ .key = {"enable"},
- .type = GF_OPTION_TYPE_STR },
+ .type = GF_OPTION_TYPE_STR,
+ },
+
{ .key = {NULL} }
};
diff --git a/xlators/debug/error-gen/src/error-gen.h b/xlators/debug/error-gen/src/error-gen.h
index 9eaec804f..d92c23062 100644
--- a/xlators/debug/error-gen/src/error-gen.h
+++ b/xlators/debug/error-gen/src/error-gen.h
@@ -19,6 +19,18 @@
#define GF_FAILURE_DEFAULT 10
+/*
+ * Pseudo-errors refer to errors beyond the scope of traditional <-1, op_errno>
+ * returns. This facilitates the ability to return unexpected, but not -1 values
+ * and/or to inject operations that lead to implicit error conditions. The range
+ * for pseudo errors resides at a high value to avoid conflicts with the errno
+ * range.
+ */
+enum GF_PSEUDO_ERRORS {
+ GF_ERROR_SHORT_WRITE = 1000, /* short writev return value */
+ GF_ERROR_MAX
+};
+
typedef struct {
int enable[GF_FOP_MAXVALUE];
int op_count;
diff --git a/xlators/debug/io-stats/src/Makefile.am b/xlators/debug/io-stats/src/Makefile.am
index 08d261125..dff294cd8 100644
--- a/xlators/debug/io-stats/src/Makefile.am
+++ b/xlators/debug/io-stats/src/Makefile.am
@@ -2,14 +2,16 @@
xlator_LTLIBRARIES = io-stats.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug
-io_stats_la_LDFLAGS = -module -avoidversion
+io_stats_la_LDFLAGS = -module -avoid-version
io_stats_la_SOURCES = io-stats.c
io_stats_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
noinst_HEADERS = io-stats-mem-types.h
-AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src
AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c
index 556c72249..5b4c833fb 100644
--- a/xlators/debug/io-stats/src/io-stats.c
+++ b/xlators/debug/io-stats/src/io-stats.c
@@ -36,6 +36,7 @@
#include <stdarg.h>
#include "defaults.h"
#include "logging.h"
+#include "cli1-xdr.h"
#define MAX_LIST_MEMBERS 100
@@ -508,7 +509,7 @@ out:
return 0;
}
-inline int
+static inline int
ios_stats_cleanup (xlator_t *this, inode_t *inode)
{
@@ -916,8 +917,19 @@ ios_dump_args_init (struct ios_dump_args *args, ios_dump_type_t type,
return ret;
}
+static void
+ios_global_stats_clear (struct ios_global_stats *stats, struct timeval *now)
+{
+ GF_ASSERT (stats);
+ GF_ASSERT (now);
+
+ memset (stats, 0, sizeof (*stats));
+ stats->started_at = *now;
+}
+
int
-io_stats_dump (xlator_t *this, struct ios_dump_args *args)
+io_stats_dump (xlator_t *this, struct ios_dump_args *args,
+ gf1_cli_info_op op, gf_boolean_t is_peek)
{
struct ios_conf *conf = NULL;
struct ios_global_stats cumulative = {0, };
@@ -935,18 +947,32 @@ io_stats_dump (xlator_t *this, struct ios_dump_args *args)
gettimeofday (&now, NULL);
LOCK (&conf->lock);
{
- cumulative = conf->cumulative;
- incremental = conf->incremental;
+ if (op == GF_CLI_INFO_ALL ||
+ op == GF_CLI_INFO_CUMULATIVE)
+ cumulative = conf->cumulative;
+
+ if (op == GF_CLI_INFO_ALL ||
+ op == GF_CLI_INFO_INCREMENTAL) {
+ incremental = conf->incremental;
+ increment = conf->increment;
- increment = conf->increment++;
+ if (!is_peek) {
+ increment = conf->increment++;
- memset (&conf->incremental, 0, sizeof (conf->incremental));
- conf->incremental.started_at = now;
+ ios_global_stats_clear (&conf->incremental,
+ &now);
+ }
+ }
}
UNLOCK (&conf->lock);
- io_stats_dump_global (this, &cumulative, &now, -1, args);
- io_stats_dump_global (this, &incremental, &now, increment, args);
+ if (op == GF_CLI_INFO_ALL ||
+ op == GF_CLI_INFO_CUMULATIVE)
+ io_stats_dump_global (this, &cumulative, &now, -1, args);
+
+ if (op == GF_CLI_INFO_ALL ||
+ op == GF_CLI_INFO_INCREMENTAL)
+ io_stats_dump_global (this, &incremental, &now, increment, args);
return 0;
}
@@ -1724,6 +1750,40 @@ io_stats_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int
+io_stats_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, FALLOCATE);
+ STACK_UNWIND_STRICT(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+
+int
+io_stats_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, DISCARD);
+ STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+int
+io_stats_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ UPDATE_PROFILE_STATS(frame, ZEROFILL);
+ STACK_UNWIND_STRICT(zerofill, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+int
io_stats_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
{
@@ -2166,15 +2226,14 @@ conditional_dump (dict_t *dict, char *key, data_t *value, void *data)
return -1;
}
logfp = fopen (filename, "w+");
- GF_ASSERT (logfp);
if (!logfp) {
gf_log (this->name, GF_LOG_ERROR, "failed to open %s "
"for writing", filename);
return -1;
- }
+ }
(void) ios_dump_args_init (&args, IOS_DUMP_TYPE_FILE,
logfp);
- io_stats_dump (this, &args);
+ io_stats_dump (this, &args, GF_CLI_INFO_ALL, _gf_false);
fclose (logfp);
}
return 0;
@@ -2393,6 +2452,45 @@ io_stats_fstat (call_frame_t *frame, xlator_t *this,
int
+io_stats_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
+
+ STACK_WIND(frame, io_stats_fallocate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+ xdata);
+
+ return 0;
+}
+
+
+int
+io_stats_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
+
+ STACK_WIND(frame, io_stats_discard_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+
+ return 0;
+}
+
+int
+io_stats_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ START_FOP_LATENCY(frame);
+
+ STACK_WIND(frame, io_stats_zerofill_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+
+ return 0;
+}
+
+
+int
io_stats_lk (call_frame_t *frame, xlator_t *this,
fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
{
@@ -2539,15 +2637,42 @@ ios_destroy_top_stats (struct ios_conf *conf)
return;
}
+static int
+io_stats_clear (struct ios_conf *conf)
+{
+ struct timeval now;
+ int ret = -1;
+
+ GF_ASSERT (conf);
+
+ if (!gettimeofday (&now, NULL))
+ {
+ LOCK (&conf->lock);
+ {
+ ios_global_stats_clear (&conf->cumulative, &now);
+ ios_global_stats_clear (&conf->incremental, &now);
+ conf->increment = 0;
+ }
+ UNLOCK (&conf->lock);
+ ret = 0;
+ }
+
+ return ret;
+}
+
int
reconfigure (xlator_t *this, dict_t *options)
{
struct ios_conf *conf = NULL;
int ret = -1;
char *sys_log_str = NULL;
+ char *log_format_str = NULL;
+ char *logger_str = NULL;
int sys_log_level = -1;
char *log_str = NULL;
int log_level = -1;
+ int log_format = -1;
+ int logger = -1;
if (!this || !this->private)
goto out;
@@ -2575,6 +2700,18 @@ reconfigure (xlator_t *this, dict_t *options)
gf_log_set_loglevel (log_level);
}
+ GF_OPTION_RECONF ("logger", logger_str, options, str, out);
+ if (logger_str) {
+ logger = gf_check_logger (logger_str);
+ gf_log_set_logger (logger);
+ }
+
+ GF_OPTION_RECONF ("log-format", log_format_str, options, str, out);
+ if (log_format_str) {
+ log_format = gf_check_log_format (log_format_str);
+ gf_log_set_logformat (log_format);
+ }
+
ret = 0;
out:
gf_log (this->name, GF_LOG_DEBUG, "reconfigure returning %d", ret);
@@ -2606,6 +2743,10 @@ init (xlator_t *this)
{
struct ios_conf *conf = NULL;
char *sys_log_str = NULL;
+ char *logger_str = NULL;
+ char *log_format_str = NULL;
+ int logger = -1;
+ int log_format = -1;
int sys_log_level = -1;
char *log_str = NULL;
int log_level = -1;
@@ -2664,6 +2805,19 @@ init (xlator_t *this)
gf_log_set_loglevel (log_level);
}
+ GF_OPTION_INIT ("logger", logger_str, str, out);
+ if (logger_str) {
+ logger = gf_check_logger (logger_str);
+ gf_log_set_logger (logger);
+ }
+
+ GF_OPTION_INIT ("log-format", log_format_str, str, out);
+ if (log_format_str) {
+ log_format = gf_check_log_format (log_format_str);
+ gf_log_set_logformat (log_format);
+ }
+
+
this->private = conf;
ret = 0;
out:
@@ -2701,10 +2855,11 @@ notify (xlator_t *this, int32_t event, void *data, ...)
struct ios_dump_args args = {0};
dict_t *output = NULL;
dict_t *dict = NULL;
- int32_t top_op = 0;
+ int32_t op = 0;
int32_t list_cnt = 0;
double throughput = 0;
double time = 0;
+ gf_boolean_t is_peek = _gf_false;
va_list ap;
dict = data;
@@ -2715,7 +2870,7 @@ notify (xlator_t *this, int32_t event, void *data, ...)
case GF_EVENT_TRANSLATOR_INFO:
ret = dict_get_str_boolean (dict, "clear-stats", _gf_false);
if (ret) {
- ret = dict_set_int32 (output, "top-op", top_op);
+ ret = dict_set_int32 (output, "top-op", op);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"Failed to set top-op in dict");
@@ -2735,15 +2890,15 @@ notify (xlator_t *this, int32_t event, void *data, ...)
goto out;
}
- ret = dict_get_int32 (dict, "top-op", &top_op);
+ ret = dict_get_int32 (dict, "top-op", &op);
if (!ret) {
ret = dict_get_int32 (dict, "list-cnt", &list_cnt);
- if (top_op > IOS_STATS_TYPE_NONE &&
- top_op < IOS_STATS_TYPE_MAX)
+ if (op > IOS_STATS_TYPE_NONE &&
+ op < IOS_STATS_TYPE_MAX)
ret = io_stats_dump_stats_to_dict (this, output,
- top_op, list_cnt);
- if (top_op == IOS_STATS_TYPE_READ_THROUGHPUT ||
- top_op == IOS_STATS_TYPE_WRITE_THROUGHPUT) {
+ op, list_cnt);
+ if (op == IOS_STATS_TYPE_READ_THROUGHPUT ||
+ op == IOS_STATS_TYPE_WRITE_THROUGHPUT) {
ret = dict_get_double (dict, "throughput",
&throughput);
if (!ret) {
@@ -2764,9 +2919,41 @@ notify (xlator_t *this, int32_t event, void *data, ...)
}
} else {
- (void) ios_dump_args_init (&args, IOS_DUMP_TYPE_DICT,
- output);
- ret = io_stats_dump (this, &args);
+ ret = dict_get_int32 (dict, "info-op", &op);
+ if (ret || op < GF_CLI_INFO_ALL ||
+ GF_CLI_INFO_CLEAR < op)
+ op = GF_CLI_INFO_ALL;
+
+ ret = dict_set_int32 (output, "info-op", op);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set info-op in dict");
+ goto out;
+ }
+
+ if (GF_CLI_INFO_CLEAR == op) {
+ ret = io_stats_clear (this->private);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to clear info stats");
+
+ ret = dict_set_int32 (output, "stats-cleared",
+ ret ? 0 : 1);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set stats-cleared"
+ " in dict");
+ }
+ else {
+ ret = dict_get_str_boolean (dict, "peek",
+ _gf_false);
+ if (-1 != ret)
+ is_peek = ret;
+
+ (void) ios_dump_args_init (&args,
+ IOS_DUMP_TYPE_DICT, output);
+ ret = io_stats_dump (this, &args, op, is_peek);
+ }
}
break;
default:
@@ -2818,6 +3005,9 @@ struct xlator_fops fops = {
.fxattrop = io_stats_fxattrop,
.setattr = io_stats_setattr,
.fsetattr = io_stats_fsetattr,
+ .fallocate = io_stats_fallocate,
+ .discard = io_stats_discard,
+ .zerofill = io_stats_zerofill,
};
struct xlator_cbks cbks = {
@@ -2861,7 +3051,7 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_STR,
.default_value = "CRITICAL",
.description = "Gluster's syslog log-level",
- .value = { "WARNING", "ERROR", "CRITICAL"}
+ .value = { "WARNING", "ERROR", "INFO", "CRITICAL"}
},
{ .key = {"brick-log-level"},
.type = GF_OPTION_TYPE_STR,
@@ -2870,6 +3060,40 @@ struct volume_options options[] = {
.value = { "DEBUG", "WARNING", "ERROR", "INFO",
"CRITICAL", "NONE", "TRACE"}
},
+ { .key = {"logger"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = { GF_LOGGER_GLUSTER_LOG, GF_LOGGER_SYSLOG}
+ },
+ { .key = {"client-logger"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = GF_LOGGER_GLUSTER_LOG,
+ .description = "Changes the logging sub-system to log to, for the "
+ "clients",
+ .value = { GF_LOGGER_GLUSTER_LOG, GF_LOGGER_SYSLOG}
+ },
+ { .key = {"brick-logger"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = GF_LOGGER_GLUSTER_LOG,
+ .description = "Changes the logging sub-system to log to, for the "
+ "bricks",
+ .value = { GF_LOGGER_GLUSTER_LOG, GF_LOGGER_SYSLOG}
+ },
+ { .key = {"log-format"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = { GF_LOG_FORMAT_NO_MSG_ID, GF_LOG_FORMAT_WITH_MSG_ID}
+ },
+ { .key = {"client-log-format"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = GF_LOG_FORMAT_WITH_MSG_ID,
+ .description = "Changes log format for the clients",
+ .value = { GF_LOG_FORMAT_NO_MSG_ID, GF_LOG_FORMAT_WITH_MSG_ID}
+ },
+ { .key = {"brick-log-format"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = GF_LOG_FORMAT_WITH_MSG_ID,
+ .description = "Changes the log format for the bricks",
+ .value = { GF_LOG_FORMAT_NO_MSG_ID, GF_LOG_FORMAT_WITH_MSG_ID}
+ },
{ .key = {NULL} },
};
diff --git a/xlators/debug/trace/src/Makefile.am b/xlators/debug/trace/src/Makefile.am
index eb6434a7d..7b2597b4d 100644
--- a/xlators/debug/trace/src/Makefile.am
+++ b/xlators/debug/trace/src/Makefile.am
@@ -2,11 +2,12 @@
xlator_LTLIBRARIES = trace.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug
-trace_la_LDFLAGS = -module -avoidversion
+trace_la_LDFLAGS = -module -avoid-version
trace_la_SOURCES = trace.c
trace_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+noinst_HEADERS = trace.h trace-mem-types.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/features/marker/utils/src/procdiggy.h b/xlators/debug/trace/src/trace-mem-types.h
index 56dfc4eb2..9fa7d97c2 100644
--- a/xlators/features/marker/utils/src/procdiggy.h
+++ b/xlators/debug/trace/src/trace-mem-types.h
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -7,14 +7,15 @@
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
-#ifdef __NetBSD__
-#include <sys/syslimits.h>
-#endif /* __NetBSD__ */
-#define PROC "/proc"
-pid_t pidinfo (pid_t pid, char **name);
+#ifndef __TRACE_MEM_TYPES_H__
+#define __TRACE_MEM_TYPES_H__
-int prociter (int (*proch) (pid_t pid, pid_t ppid, char *name, void *data),
- void *data);
+#include "mem-types.h"
+enum gf_trace_mem_types_ {
+ gf_trace_mt_trace_conf_t = gf_common_mt_end + 1,
+ gf_trace_mt_end
+};
+#endif
diff --git a/xlators/debug/trace/src/trace.c b/xlators/debug/trace/src/trace.c
index f122498c8..1efd50e65 100644
--- a/xlators/debug/trace/src/trace.c
+++ b/xlators/debug/trace/src/trace.c
@@ -1,16 +1,15 @@
/*
- Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
+
+#include "trace.h"
+#include "trace-mem-types.h"
/**
* xlators/debug/trace :
@@ -19,858 +18,1015 @@
* Very helpful translator for debugging.
*/
-#include <time.h>
-#include <errno.h>
-#include "glusterfs.h"
-#include "xlator.h"
-#include "common-utils.h"
+static inline void
+trace_stat_to_str(struct iatt *buf, char *str)
+{
+ char atime_buf[256] = {0,};
+ char mtime_buf[256] = {0,};
+ char ctime_buf[256] = {0,};
+ uint64_t ia_time = 0;
+
+ if (!buf)
+ return;
+
+ ia_time = buf->ia_atime;
+ strftime (atime_buf, 256, "[%b %d %H:%M:%S]",
+ localtime ((time_t *)&ia_time));
+ ia_time = buf->ia_mtime;
+ strftime (mtime_buf, 256, "[%b %d %H:%M:%S]",
+ localtime ((time_t *)&ia_time));
+ ia_time = buf->ia_ctime;
+ strftime (ctime_buf, 256, "[%b %d %H:%M:%S]",
+ localtime ((time_t *)&ia_time));
-struct {
- char *name;
- int enabled;
-} trace_fop_names[GF_FOP_MAXVALUE];
+ snprintf (str, sizeof (str),
+ "gfid=%s ino=%"PRIu64", mode=%o, "
+ "nlink=%"GF_PRI_NLINK", uid=%u, "
+ "gid=%u, size=%"PRIu64", "
+ "blocks=%"PRIu64", atime=%s, "
+ "mtime=%s, ctime=%s",
+ uuid_utoa (buf->ia_gfid),
+ buf->ia_ino,
+ st_mode_from_ia (buf->ia_prot, buf->ia_type),
+ buf->ia_nlink, buf->ia_uid,
+ buf->ia_gid, buf->ia_size,
+ buf->ia_blocks, atime_buf,
+ mtime_buf, ctime_buf);
+}
-int trace_log_level = GF_LOG_INFO;
-static char *
-trace_stat_to_str (struct iatt *buf)
+int
+dump_history_trace (circular_buffer_t *cb, void *data)
{
- char *statstr = NULL;
- char atime_buf[64] = {0,};
- char mtime_buf[64] = {0,};
- char ctime_buf[64] = {0,};
- int asprint_ret_value = 0;
+ char *string = NULL;
+ struct tm *tm = NULL;
+ char timestr[256] = {0,};
- if (!buf) {
- statstr = NULL;
- goto out;
- }
+ string = (char *)cb->data;
+ tm = localtime (&cb->tv.tv_sec);
- gf_time_fmt (atime_buf, sizeof atime_buf, buf->ia_atime, gf_timefmt_bdT);
- gf_time_fmt (mtime_buf, sizeof mtime_buf, buf->ia_mtime, gf_timefmt_bdT);
- gf_time_fmt (ctime_buf, sizeof ctime_buf, buf->ia_ctime, gf_timefmt_bdT);
- asprint_ret_value = gf_asprintf (&statstr,
- "gfid=%s ino=%"PRIu64", mode=%o, "
- "nlink=%"GF_PRI_NLINK", uid=%u, "
- "gid=%u, size=%"PRIu64", "
- "blocks=%"PRIu64", atime=%s, "
- "mtime=%s, ctime=%s",
- uuid_utoa (buf->ia_gfid), buf->ia_ino,
- st_mode_from_ia (buf->ia_prot,
- buf->ia_type),
- buf->ia_nlink, buf->ia_uid,
- buf->ia_gid, buf->ia_size,
- buf->ia_blocks, atime_buf,
- mtime_buf, ctime_buf);
+ /* Since we are continuing with adding entries to the buffer even when
+ gettimeofday () fails, it's safe to check tm and then dump the time
+ at which the entry was added to the buffer */
+ if (tm) {
+ strftime (timestr, 256, "%Y-%m-%d %H:%M:%S", tm);
+ snprintf (timestr + strlen (timestr), 256 - strlen (timestr),
+ ".%"GF_PRI_SUSECONDS, cb->tv.tv_usec);
+ gf_proc_dump_write ("TIME", "%s", timestr);
+ }
- if (asprint_ret_value < 0)
- statstr = NULL;
+ gf_proc_dump_write ("FOP", "%s\n", string);
-out:
- return statstr;
+ return 0;
}
-
int
trace_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd,
inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
{
- char *statstr = NULL;
- char *preparentstr = NULL;
- char *postparentstr = NULL;
+ char statstr[4096] = {0, };
+ char preparentstr[4096] = {0, };
+ char postparentstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_CREATE].enabled) {
+ char string[4096] = {0,};
if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- preparentstr = trace_stat_to_str (preparent);
- postparentstr = trace_stat_to_str (postparent);
-
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s (op_ret=%d, fd=%p"
- "*stbuf {%s}, *preparent {%s}, *postparent = "
- "{%s})", frame->root->unique,
- uuid_utoa (inode->gfid), op_ret, fd,
- statstr, preparentstr, postparentstr);
-
- GF_FREE (statstr);
- GF_FREE (preparentstr);
- GF_FREE (postparentstr);
+ trace_stat_to_str (buf, statstr);
+ trace_stat_to_str (preparent, preparentstr);
+ trace_stat_to_str (postparent, postparentstr);
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s (op_ret=%d, fd=%p"
+ "*stbuf {%s}, *preparent {%s}, "
+ "*postparent = {%s})",
+ frame->root->unique,
+ uuid_utoa (inode->gfid), op_ret, fd,
+ statstr, preparentstr, postparentstr);
/* for 'release' log */
fd_ctx_set (fd, this, 0);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": (op_ret=%d, op_errno=%d)",
+ frame->root->unique, op_ret,
+ op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf,
- preparent, postparent, xdata);
+out:
+ TRACE_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode, buf,
+ preparent, postparent, xdata);
return 0;
}
-
int
trace_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_OPEN].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d, *fd=%p",
- frame->root->unique, uuid_utoa (frame->local), op_ret, op_errno, fd);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d, "
+ "*fd=%p", frame->root->unique,
+ uuid_utoa (frame->local), op_ret, op_errno,
+ fd);
+
+ LOG_ELEMENT (conf, string);
}
+out:
/* for 'release' log */
if (op_ret >= 0)
fd_ctx_set (fd, this, 0);
- frame->local = NULL;
- STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata);
+ TRACE_STACK_UNWIND (open, frame, op_ret, op_errno, fd, xdata);
return 0;
}
-
int
trace_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
{
- char *statstr = NULL;
- if (trace_fop_names[GF_FOP_STAT].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d buf=%s",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, statstr);
+ char statstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
- GF_FREE (statstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_STAT].enabled) {
+ char string[4096] = {0,};
+ if (op_ret == 0) {
+ trace_stat_to_str (buf, statstr);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d buf=%s",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ statstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d)",
- frame->root->unique, uuid_utoa (frame->local), op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d)",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata);
+out:
+ TRACE_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata);
return 0;
}
-
int
trace_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iovec *vector,
- int32_t count, struct iatt *buf, struct iobref *iobref, dict_t *xdata)
+ int32_t count, struct iatt *buf, struct iobref *iobref,
+ dict_t *xdata)
{
- char *statstr = NULL;
+ char statstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_READ].enabled) {
+ char string[4096] = {0,};
if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d buf=%s",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, statstr);
-
- GF_FREE (statstr);
+ trace_stat_to_str (buf, statstr);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d buf=%s",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ statstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d)",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d)",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count,
- buf, iobref, xdata);
+out:
+ TRACE_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count,
+ buf, iobref, xdata);
return 0;
}
-
int
trace_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
{
- char *preopstr = NULL;
- char *postopstr = NULL;
+ char preopstr[4096] = {0, };
+ char postopstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_WRITE].enabled) {
+ char string[4096] = {0,};
if (op_ret >= 0) {
- preopstr = trace_stat_to_str (prebuf);
- postopstr = trace_stat_to_str (postbuf);
-
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": (op_ret=%d, *prebuf = {%s}, "
- "*postbuf = {%s})",
- frame->root->unique, op_ret,
- preopstr, postopstr);
-
- GF_FREE (preopstr);
-
- GF_FREE (postopstr);
+ trace_stat_to_str (prebuf, preopstr);
+ trace_stat_to_str (postbuf, postopstr);
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": (op_ret=%d, "
+ "*prebuf = {%s}, *postbuf = {%s})",
+ frame->root->unique, op_ret,
+ preopstr, postopstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d", frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+out:
+ TRACE_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
return 0;
}
-
-
int
trace_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *buf, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *buf,
+ dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_READDIR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64" : gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local), op_ret, op_errno);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64" : gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa (frame->local),
+ op_ret, op_errno);
- frame->local = NULL;
- STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, buf, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (readdir, frame, op_ret, op_errno, buf, xdata);
return 0;
}
-
int
trace_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *buf, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *buf,
+ dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_READDIRP].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64" : gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local), op_ret, op_errno);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64" : gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa (frame->local),
+ op_ret, op_errno);
- frame->local = NULL;
- STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, buf, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (readdirp, frame, op_ret, op_errno, buf, xdata);
return 0;
}
-
int
trace_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
{
- char *preopstr = NULL;
- char *postopstr = NULL;
-
- if (trace_fop_names[GF_FOP_FSYNC].enabled) {
- if (op_ret >= 0) {
- preopstr = trace_stat_to_str (prebuf);
- postopstr = trace_stat_to_str (postbuf);
+ char preopstr[4096] = {0, };
+ char postopstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": (op_ret=%d, *prebuf = {%s}, "
- "*postbuf = {%s}",
- frame->root->unique, op_ret,
- preopstr, postopstr);
+ conf = this->private;
- GF_FREE (preopstr);
-
- GF_FREE (postopstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FSYNC].enabled) {
+ char string[4096] = {0,};
+ if (op_ret == 0) {
+ trace_stat_to_str (prebuf, preopstr);
+ trace_stat_to_str (postbuf, postopstr);
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": (op_ret=%d, "
+ "*prebuf = {%s}, *postbuf = {%s}",
+ frame->root->unique, op_ret,
+ preopstr, postopstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d", frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ op_errno);
+
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+out:
+ TRACE_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
return 0;
}
-
int
trace_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *statpre, struct iatt *statpost, dict_t *xdata)
{
- char *preopstr = NULL;
- char *postopstr = NULL;
-
- if (trace_fop_names[GF_FOP_SETATTR].enabled) {
- if (op_ret >= 0) {
- preopstr = trace_stat_to_str (statpre);
- postopstr = trace_stat_to_str (statpost);
+ char preopstr[4096] = {0, };
+ char postopstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": (op_ret=%d, *prebuf = {%s}, "
- "*postbuf = {%s})",
- frame->root->unique, op_ret,
- preopstr, postopstr);
+ conf = this->private;
- GF_FREE (preopstr);
-
- GF_FREE (postopstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_SETATTR].enabled) {
+ char string[4096] = {0,};
+ if (op_ret == 0) {
+ trace_stat_to_str (statpre, preopstr);
+ trace_stat_to_str (statpost, postopstr);
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": (op_ret=%d, "
+ "*prebuf = {%s}, *postbuf = {%s})",
+ frame->root->unique, op_ret,
+ preopstr, postopstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d)",
- frame->root->unique, uuid_utoa (frame->local), op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d)", frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, statpre, statpost, xdata);
+out:
+ TRACE_STACK_UNWIND (setattr, frame, op_ret, op_errno, statpre,
+ statpost, xdata);
return 0;
}
-
int
trace_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *statpre, struct iatt *statpost, dict_t *xdata)
{
- char *preopstr = NULL;
- char *postopstr = NULL;
-
- if (trace_fop_names[GF_FOP_FSETATTR].enabled) {
- if (op_ret >= 0) {
- preopstr = trace_stat_to_str (statpre);
- postopstr = trace_stat_to_str (statpost);
+ char preopstr[4096] = {0, };
+ char postopstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": (op_ret=%d, *prebuf = {%s}, "
- "*postbuf = {%s})",
- frame->root->unique, op_ret,
- preopstr, postopstr);
+ conf = this->private;
- GF_FREE (preopstr);
-
- GF_FREE (postopstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FSETATTR].enabled) {
+ char string[4096] = {0,};
+ if (op_ret == 0) {
+ trace_stat_to_str (statpre, preopstr);
+ trace_stat_to_str (statpost, postopstr);
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": (op_ret=%d, "
+ "*prebuf = {%s}, *postbuf = {%s})",
+ frame->root->unique, op_ret,
+ preopstr, postopstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d)",
- frame->root->unique, uuid_utoa (frame->local), op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d)",
+ frame->root->unique, uuid_utoa (frame->local),
+ op_ret, op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno,
- statpre, statpost, xdata);
+out:
+ TRACE_STACK_UNWIND (fsetattr, frame, op_ret, op_errno,
+ statpre, statpost, xdata);
return 0;
}
-
int
trace_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
- struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
{
- char *preparentstr = NULL;
- char *postparentstr = NULL;
-
- if (trace_fop_names[GF_FOP_UNLINK].enabled) {
- if (op_ret >= 0) {
- preparentstr = trace_stat_to_str (preparent);
- postparentstr = trace_stat_to_str (postparent);
-
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, *preparent = {%s}, "
- "*postparent = {%s})",
- frame->root->unique, uuid_utoa (frame->local), op_ret, preparentstr,
- postparentstr);
+ char preparentstr[4096] = {0, };
+ char postparentstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- GF_FREE (preparentstr);
+ conf = this->private;
- GF_FREE (postparentstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_UNLINK].enabled) {
+ char string[4096] = {0,};
+ if (op_ret == 0) {
+ trace_stat_to_str (preparent, preparentstr);
+ trace_stat_to_str (postparent, postparentstr);
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ " *preparent = {%s}, "
+ "*postparent = {%s})",
+ frame->root->unique,
+ uuid_utoa (frame->local),
+ op_ret, preparentstr,
+ postparentstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d)",
- frame->root->unique, uuid_utoa (frame->local), op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d)",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno,
- preparent, postparent, xdata);
+out:
+ TRACE_STACK_UNWIND (unlink, frame, op_ret, op_errno,
+ preparent, postparent, xdata);
return 0;
}
-
int
trace_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *buf,
struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent, dict_t *xdata)
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
{
- char *statstr = NULL;
- char *preoldparentstr = NULL;
- char *postoldparentstr = NULL;
- char *prenewparentstr = NULL;
- char *postnewparentstr = NULL;
-
- if (trace_fop_names[GF_FOP_RENAME].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- preoldparentstr = trace_stat_to_str (preoldparent);
- postoldparentstr = trace_stat_to_str (postoldparent);
-
- prenewparentstr = trace_stat_to_str (prenewparent);
- postnewparentstr = trace_stat_to_str (postnewparent);
-
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": (op_ret=%d, *stbuf = {%s}, "
- "*preoldparent = {%s}, *postoldparent = {%s}"
- " *prenewparent = {%s}, *postnewparent = {%s})",
- frame->root->unique, op_ret, statstr,
- preoldparentstr, postoldparentstr,
- prenewparentstr, postnewparentstr);
+ char statstr[4096] = {0, };
+ char preoldparentstr[4096] = {0, };
+ char postoldparentstr[4096] = {0, };
+ char prenewparentstr[4096] = {0, };
+ char postnewparentstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- GF_FREE (statstr);
+ conf = this->private;
- GF_FREE (preoldparentstr);
-
- GF_FREE (postoldparentstr);
-
- GF_FREE (prenewparentstr);
-
- GF_FREE (postnewparentstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_RENAME].enabled) {
+ char string[4096] = {0,};
+ if (op_ret == 0) {
+ trace_stat_to_str (buf, statstr);
+ trace_stat_to_str (preoldparent, preoldparentstr);
+ trace_stat_to_str (postoldparent, postoldparentstr);
+ trace_stat_to_str (prenewparent, prenewparentstr);
+ trace_stat_to_str (postnewparent, postnewparentstr);
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": (op_ret=%d, "
+ "*stbuf = {%s}, *preoldparent = {%s},"
+ " *postoldparent = {%s}"
+ " *prenewparent = {%s}, "
+ "*postnewparent = {%s})",
+ frame->root->unique, op_ret, statstr,
+ preoldparentstr, postoldparentstr,
+ prenewparentstr, postnewparentstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d", frame->root->unique,
+ uuid_utoa (frame->local),
+ op_ret, op_errno);
+
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf,
- preoldparent, postoldparent,
- prenewparent, postnewparent, xdata);
+out:
+ TRACE_STACK_UNWIND (rename, frame, op_ret, op_errno, buf,
+ preoldparent, postoldparent,
+ prenewparent, postnewparent, xdata);
return 0;
}
-
int
trace_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
const char *buf, struct iatt *stbuf, dict_t *xdata)
{
- char *statstr = NULL;
+ char statstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_READLINK].enabled) {
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_READLINK].enabled) {
+ char string[4096] = {0,};
if (op_ret == 0) {
- statstr = trace_stat_to_str (stbuf);
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": (op_ret=%d, op_errno=%d, buf=%s, "
- "stbuf = { %s })",
- frame->root->unique, op_ret, op_errno, buf,
- statstr);
- } else
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
+ trace_stat_to_str (stbuf, statstr);
+ snprintf (string, sizeof (string),
+ "%"PRId64": (op_ret=%d, op_errno=%d,"
+ "buf=%s, stbuf = { %s })",
+ frame->root->unique, op_ret, op_errno,
+ buf, statstr);
+ } else {
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ op_errno);
+ }
- GF_FREE (statstr);
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, buf, stbuf, xdata);
+out:
+ TRACE_STACK_UNWIND (readlink, frame, op_ret, op_errno, buf, stbuf,
+ xdata);
return 0;
}
-
int
trace_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
inode_t *inode, struct iatt *buf,
dict_t *xdata, struct iatt *postparent)
{
- char *statstr = NULL;
- char *postparentstr = NULL;
-
- if (trace_fop_names[GF_FOP_LOOKUP].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- postparentstr = trace_stat_to_str (postparent);
+ char statstr[4096] = {0, };
+ char postparentstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s (op_ret=%d "
- "*buf {%s}, *postparent {%s}",
- frame->root->unique, uuid_utoa (inode->gfid),
- op_ret, statstr, postparentstr);
+ conf = this->private;
- GF_FREE (statstr);
- GF_FREE (postparentstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_LOOKUP].enabled) {
+ char string[4096] = {0,};
+ if (op_ret == 0) {
+ trace_stat_to_str (buf, statstr);
+ trace_stat_to_str (postparent, postparentstr);
+ /* print buf->ia_gfid instead of inode->gfid,
+ * since if the inode is not yet linked to the
+ * inode table (fresh lookup) then null gfid
+ * will be printed.
+ */
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s (op_ret=%d "
+ "*buf {%s}, *postparent {%s}",
+ frame->root->unique,
+ uuid_utoa (buf->ia_gfid),
+ op_ret, statstr, postparentstr);
/* For 'forget' */
inode_ctx_put (inode, this, 0);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d)",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d)",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf,
- xdata, postparent);
+out:
+ TRACE_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf,
+ xdata, postparent);
return 0;
}
-
int
trace_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
{
- char *statstr = NULL;
- char *preparentstr = NULL;
- char *postparentstr = NULL;
-
- if (trace_fop_names[GF_FOP_SYMLINK].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- preparentstr = trace_stat_to_str (preparent);
- postparentstr = trace_stat_to_str (postparent);
-
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s (op_ret=%d "
- "*stbuf = {%s}, *preparent = {%s}, "
- "*postparent = {%s})",
- frame->root->unique, uuid_utoa (inode->gfid),
- op_ret, statstr, preparentstr, postparentstr);
-
- GF_FREE (statstr);
+ char statstr[4096] = {0, };
+ char preparentstr[4096] = {0, };
+ char postparentstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- GF_FREE (preparentstr);
-
- GF_FREE (postparentstr);
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_SYMLINK].enabled) {
+ char string[4096] = {0,};
+ if (op_ret == 0) {
+ trace_stat_to_str (buf, statstr);
+ trace_stat_to_str (preparent, preparentstr);
+ trace_stat_to_str (postparent, postparentstr);
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s (op_ret=%d "
+ "*stbuf = {%s}, *preparent = {%s}, "
+ "*postparent = {%s})",
+ frame->root->unique,
+ uuid_utoa (inode->gfid),
+ op_ret, statstr, preparentstr,
+ postparentstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": op_ret=%d, op_errno=%d",
- frame->root->unique, op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": op_ret=%d, op_errno=%d",
+ frame->root->unique, op_ret,
+ op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf,
- preparent, postparent, xdata);
+out:
+ TRACE_STACK_UNWIND (symlink, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
return 0;
}
-
int
trace_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
inode_t *inode, struct iatt *buf,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- char *statstr = NULL;
- char *preparentstr = NULL;
- char *postparentstr = NULL;
-
- if (trace_fop_names[GF_FOP_MKNOD].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- preparentstr = trace_stat_to_str (preparent);
- postparentstr = trace_stat_to_str (postparent);
-
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s (op_ret=%d "
- "*stbuf = {%s}, *preparent = {%s}, "
- "*postparent = {%s})",
- frame->root->unique, uuid_utoa (inode->gfid),
- op_ret, statstr, preparentstr, postparentstr);
+ char statstr[4096] = {0, };
+ char preparentstr[4096] = {0, };
+ char postparentstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- GF_FREE (statstr);
+ conf = this->private;
- GF_FREE (preparentstr);
-
- GF_FREE (postparentstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ char string[4096] = {0,};
+ if (trace_fop_names[GF_FOP_MKNOD].enabled) {
+ if (op_ret == 0) {
+ trace_stat_to_str (buf, statstr);
+ trace_stat_to_str (preparent, preparentstr);
+ trace_stat_to_str (postparent, postparentstr);
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s (op_ret=%d "
+ "*stbuf = {%s}, *preparent = {%s}, "
+ "*postparent = {%s})",
+ frame->root->unique,
+ uuid_utoa (inode->gfid),
+ op_ret, statstr, preparentstr,
+ postparentstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": (op_ret=%d, op_errno=%d)",
+ frame->root->unique, op_ret,
+ op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf,
- preparent, postparent, xdata);
+out:
+ TRACE_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
return 0;
}
-
int
trace_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
inode_t *inode, struct iatt *buf,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- char *statstr = NULL;
- char *preparentstr = NULL;
- char *postparentstr = NULL;
-
- if (trace_fop_names[GF_FOP_MKDIR].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- preparentstr = trace_stat_to_str (preparent);
- postparentstr = trace_stat_to_str (postparent);
-
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s (op_ret=%d "
- ", *stbuf = {%s}, *prebuf = {%s}, "
- "*postbuf = {%s} )",
- frame->root->unique, uuid_utoa (inode->gfid),
- op_ret, statstr, preparentstr, postparentstr);
+ char statstr[4096] = {0, };
+ char preparentstr[4096] = {0, };
+ char postparentstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- GF_FREE (statstr);
+ conf = this->private;
- GF_FREE (preparentstr);
-
- GF_FREE (postparentstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_MKDIR].enabled) {
+ char string[4096] = {0,};
+ if (op_ret == 0) {
+ trace_stat_to_str (buf, statstr);
+ trace_stat_to_str (preparent, preparentstr);
+ trace_stat_to_str (postparent, postparentstr);
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s (op_ret=%d "
+ ", *stbuf = {%s}, *prebuf = {%s}, "
+ "*postbuf = {%s} )",
+ frame->root->unique,
+ uuid_utoa (inode->gfid),
+ op_ret, statstr, preparentstr,
+ postparentstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": (op_ret=%d, op_errno=%d)",
+ frame->root->unique, op_ret,
+ op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf,
- preparent, postparent, xdata);
+out:
+ TRACE_STACK_UNWIND (mkdir, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
return 0;
}
-
int
trace_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
inode_t *inode, struct iatt *buf,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- char *statstr = NULL;
- char *preparentstr = NULL;
- char *postparentstr = NULL;
-
- if (trace_fop_names[GF_FOP_LINK].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- preparentstr = trace_stat_to_str (preparent);
- postparentstr = trace_stat_to_str (postparent);
+ char statstr[4096] = {0, };
+ char preparentstr[4096] = {0, };
+ char postparentstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": (op_ret=%d, *stbuf = {%s}, "
- " *prebuf = {%s}, *postbuf = {%s})",
- frame->root->unique, op_ret,
- statstr, preparentstr, postparentstr);
+ conf = this->private;
- GF_FREE (statstr);
-
- GF_FREE (preparentstr);
-
- GF_FREE (postparentstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ char string[4096] = {0,};
+ if (trace_fop_names[GF_FOP_LINK].enabled) {
+ if (op_ret == 0) {
+ trace_stat_to_str (buf, statstr);
+ trace_stat_to_str (preparent, preparentstr);
+ trace_stat_to_str (postparent, postparentstr);
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": (op_ret=%d, "
+ "*stbuf = {%s}, *prebuf = {%s},"
+ " *postbuf = {%s})",
+ frame->root->unique, op_ret,
+ statstr, preparentstr, postparentstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d",
+ frame->root->unique,
+ uuid_utoa (frame->local),
+ op_ret, op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf,
- preparent, postparent, xdata);
+out:
+ TRACE_STACK_UNWIND (link, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
return 0;
}
-
int
trace_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ char string[4096] = {0,};
if (trace_fop_names[GF_FOP_FLUSH].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
- }
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique, uuid_utoa (frame->local),
+ op_ret, op_errno);
- frame->local = NULL;
- STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata);
return 0;
}
-
int
trace_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ char string[4096] = {0,};
if (trace_fop_names[GF_FOP_OPENDIR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d, fd=%p",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno, fd);
- }
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d,"
+ " fd=%p",
+ frame->root->unique, uuid_utoa (frame->local),
+ op_ret, op_errno, fd);
+ LOG_ELEMENT (conf, string);
+ }
+out:
/* for 'releasedir' log */
if (op_ret >= 0)
fd_ctx_set (fd, this, 0);
- frame->local = NULL;
- STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, xdata);
+ TRACE_STACK_UNWIND (opendir, frame, op_ret, op_errno, fd, xdata);
return 0;
}
-
int
trace_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- char *preparentstr = NULL;
- char *postparentstr = NULL;
+ char preparentstr[4096] = {0, };
+ char postparentstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_RMDIR].enabled) {
- if (op_ret >= 0) {
- preparentstr = trace_stat_to_str (preparent);
- postparentstr = trace_stat_to_str (postparent);
-
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, *prebuf = {%s}, "
- "*postbuf = {%s}",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, preparentstr, postparentstr);
-
- GF_FREE (preparentstr);
+ conf = this->private;
- GF_FREE (postparentstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_RMDIR].enabled) {
+ char string[4096] = {0,};
+ if (op_ret == 0) {
+ trace_stat_to_str (preparent, preparentstr);
+ trace_stat_to_str (postparent, postparentstr);
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "*prebuf={%s}, *postbuf={%s}",
+ frame->root->unique,
+ uuid_utoa (frame->local),
+ op_ret, preparentstr, postparentstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d", frame->root->unique,
+ uuid_utoa (frame->local),
+ op_ret, op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno,
- preparent, postparent, xdata);
+out:
+ TRACE_STACK_UNWIND (rmdir, frame, op_ret, op_errno,
+ preparent, postparent, xdata);
return 0;
}
-
int
trace_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
{
- char *preopstr = NULL;
- char *postopstr = NULL;
+ char preopstr[4096] = {0, };
+ char postopstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_TRUNCATE].enabled) {
- if (op_ret >= 0) {
- preopstr = trace_stat_to_str (prebuf);
- postopstr = trace_stat_to_str (postbuf);
+ conf = this->private;
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": (op_ret=%d, *prebuf = {%s}, "
- "*postbuf = {%s} )",
- frame->root->unique, op_ret, preopstr,
- postopstr);
-
- GF_FREE (preopstr);
-
- GF_FREE (postopstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_TRUNCATE].enabled) {
+ char string[4096] = {0,};
+ if (op_ret == 0) {
+ trace_stat_to_str (prebuf, preopstr);
+ trace_stat_to_str (postbuf, postopstr);
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": (op_ret=%d, "
+ "*prebuf = {%s}, *postbuf = {%s} )",
+ frame->root->unique, op_ret,
+ preopstr, postopstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d", frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+out:
+ TRACE_STACK_UNWIND (truncate, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
return 0;
}
-
int
trace_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct statvfs *buf, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+ dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_STATFS].enabled) {
- if (op_ret >= 0) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": ({f_bsize=%lu, f_frsize=%lu, f_blocks=%"GF_PRI_FSBLK
- ", f_bfree=%"GF_PRI_FSBLK", f_bavail=%"GF_PRI_FSBLK", "
- "f_files=%"GF_PRI_FSBLK", f_ffree=%"GF_PRI_FSBLK", f_favail=%"
- GF_PRI_FSBLK", f_fsid=%lu, f_flag=%lu, f_namemax=%lu}) => ret=%d",
- frame->root->unique, buf->f_bsize, buf->f_frsize, buf->f_blocks,
- buf->f_bfree, buf->f_bavail, buf->f_files, buf->f_ffree,
- buf->f_favail, buf->f_fsid, buf->f_flag, buf->f_namemax, op_ret);
+ char string[4096] = {0,};
+ if (op_ret == 0) {
+ snprintf (string, sizeof (string),
+ "%"PRId64": ({f_bsize=%lu, "
+ "f_frsize=%lu, "
+ "f_blocks=%"GF_PRI_FSBLK
+ ", f_bfree=%"GF_PRI_FSBLK", "
+ "f_bavail=%"GF_PRI_FSBLK", "
+ "f_files=%"GF_PRI_FSBLK", "
+ "f_ffree=%"GF_PRI_FSBLK", "
+ "f_favail=%"GF_PRI_FSBLK", "
+ "f_fsid=%lu, f_flag=%lu, "
+ "f_namemax=%lu}) => ret=%d",
+ frame->root->unique, buf->f_bsize,
+ buf->f_frsize, buf->f_blocks,
+ buf->f_bfree, buf->f_bavail,
+ buf->f_files, buf->f_ffree,
+ buf->f_favail, buf->f_fsid,
+ buf->f_flag, buf->f_namemax, op_ret);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": (op_ret=%d, op_errno=%d)",
- frame->root->unique, op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": (op_ret=%d, "
+ "op_errno=%d)",
+ frame->root->unique, op_ret,
+ op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata);
+out:
+ TRACE_STACK_UNWIND (statfs, frame, op_ret, op_errno, buf, xdata);
return 0;
}
-
int
trace_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_SETXATTR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ op_errno);
- frame->local = NULL;
- STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
return 0;
}
-
int
trace_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_GETXATTR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d, dict=%p",
- frame->root->unique, uuid_utoa (frame->local), op_ret,
- op_errno, dict);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d,"
+ " dict=%p", frame->root->unique,
+ uuid_utoa (frame->local), op_ret, op_errno,
+ dict);
- frame->local = NULL;
- STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata);
return 0;
}
@@ -879,32 +1035,49 @@ int
trace_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FSETXATTR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local), op_ret,
- op_errno);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret, op_errno);
- frame->local = NULL;
- STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata);
return 0;
}
-
int
trace_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FGETXATTR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d, dict=%p",
- frame->root->unique, uuid_utoa (frame->local), op_ret,
- op_errno, dict);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d,"
+ " dict=%p", frame->root->unique,
+ uuid_utoa (frame->local), op_ret, op_errno,
+ dict);
- frame->local = NULL;
- STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata);
return 0;
}
@@ -913,160 +1086,208 @@ int
trace_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_REMOVEXATTR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret, op_errno);
- frame->local = NULL;
- STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata);
return 0;
}
-
int
trace_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FSYNCDIR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret, op_errno);
- frame->local = NULL;
- STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (fsyncdir, frame, op_ret, op_errno, xdata);
return 0;
}
-
int
trace_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_ACCESS].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d)",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d)", frame->root->unique,
+ uuid_utoa (frame->local), op_ret, op_errno);
- frame->local = NULL;
- STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (access, frame, op_ret, op_errno, xdata);
return 0;
}
-
int
trace_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
{
- char *prebufstr = NULL;
- char *postbufstr = NULL;
-
- if (trace_fop_names[GF_FOP_FTRUNCATE].enabled) {
- if (op_ret >= 0) {
- prebufstr = trace_stat_to_str (prebuf);
- postbufstr = trace_stat_to_str (postbuf);
+ char prebufstr[4096] = {0, };
+ char postbufstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": op_ret=%d, *prebuf = {%s}, "
- "*postbuf = {%s} )",
- frame->root->unique, op_ret,
- prebufstr, postbufstr);
-
- GF_FREE (prebufstr);
-
- GF_FREE (postbufstr);
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FTRUNCATE].enabled) {
+ char string[4096] = {0,};
+ if (op_ret == 0) {
+ trace_stat_to_str (prebuf, prebufstr);
+ trace_stat_to_str (postbuf, postbufstr);
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": op_ret=%d, "
+ "*prebuf = {%s}, *postbuf = {%s} )",
+ frame->root->unique, op_ret,
+ prebufstr, postbufstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d", frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+out:
+ TRACE_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
return 0;
}
-
int
trace_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
{
- char *statstr = NULL;
+ char statstr[4096] = {0, };
+ trace_conf_t *conf = NULL;
- if (trace_fop_names[GF_FOP_FSTAT].enabled) {
- if (op_ret >= 0) {
- statstr = trace_stat_to_str (buf);
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d buf=%s",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, statstr);
+ conf = this->private;
- GF_FREE (statstr);
+ if (!conf->log_file && !conf->log_history)
+ goto out;
+ if (trace_fop_names[GF_FOP_FSTAT].enabled) {
+ char string[4096] = {0.};
+ if (op_ret == 0) {
+ trace_stat_to_str (buf, statstr);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d "
+ "buf=%s", frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ statstr);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d", frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ op_errno);
}
+ LOG_ELEMENT (conf, string);
}
-
- frame->local = NULL;
- STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf, xdata);
+out:
+ TRACE_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata);
return 0;
}
-
int
trace_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_LK].enabled) {
- if (op_ret >= 0) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, {l_type=%d, l_whence=%d, "
- "l_start=%"PRId64", l_len=%"PRId64", l_pid=%u})",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, lock->l_type, lock->l_whence,
- lock->l_start, lock->l_len, lock->l_pid);
+ char string[4096] = {0,};
+ if (op_ret == 0) {
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "{l_type=%d, l_whence=%d, "
+ "l_start=%"PRId64", "
+ "l_len=%"PRId64", l_pid=%u})",
+ frame->root->unique,
+ uuid_utoa (frame->local),
+ op_ret, lock->l_type, lock->l_whence,
+ lock->l_start, lock->l_len,
+ lock->l_pid);
} else {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d)",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, "
+ "op_errno=%d)", frame->root->unique,
+ uuid_utoa (frame->local), op_ret,
+ op_errno);
}
- }
- frame->local = NULL;
- STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, lock, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (lk, frame, op_ret, op_errno, lock, xdata);
return 0;
}
-
-
int
trace_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_ENTRYLK].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret, op_errno);
- frame->local = NULL;
- STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (entrylk, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -1074,66 +1295,97 @@ int
trace_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FENTRYLK].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret, op_errno);
- frame->local = NULL;
- STACK_UNWIND_STRICT (fentrylk, frame, op_ret, op_errno, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (fentrylk, frame, op_ret, op_errno, xdata);
return 0;
}
-
int
trace_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_XATTROP].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret, op_errno);
- frame->local = NULL;
- STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, dict, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (xattrop, frame, op_ret, op_errno, dict, xdata);
return 0;
}
-
int
trace_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FXATTROP].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret, op_errno);
- frame->local = NULL;
- STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, dict, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, dict, xdata);
return 0;
}
-
int
trace_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_INODELK].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local),
- op_ret, op_errno);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique,
+ uuid_utoa (frame->local),op_ret, op_errno);
- frame->local = NULL;
- STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (inodelk, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -1141,32 +1393,51 @@ int
trace_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FINODELK].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local), op_ret, op_errno);
- }
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d, op_errno=%d",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret, op_errno);
- frame->local = NULL;
- STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno, xdata);
+ LOG_ELEMENT (conf, string);
+ }
+out:
+ TRACE_STACK_UNWIND (finodelk, frame, op_ret, op_errno, xdata);
return 0;
}
-
int
trace_rchecksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
- uint32_t weak_checksum, uint8_t *strong_checksum, dict_t *xdata)
+ uint32_t weak_checksum, uint8_t *strong_checksum,
+ dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_RCHECKSUM].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s op_ret=%d op_errno=%d",
- frame->root->unique, uuid_utoa (frame->local), op_ret, op_errno);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s op_ret=%d op_errno=%d",
+ frame->root->unique,
+ uuid_utoa (frame->local), op_ret, op_errno);
+
+ LOG_ELEMENT (conf, string);
}
- frame->local = NULL;
- STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno, weak_checksum,
- strong_checksum, xdata);
+out:
+ TRACE_STACK_UNWIND (rchecksum, frame, op_ret, op_errno, weak_checksum,
+ strong_checksum, xdata);
return 0;
}
@@ -1178,17 +1449,31 @@ trace_entrylk (call_frame_t *frame, xlator_t *this,
const char *volume, loc_t *loc, const char *basename,
entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_ENTRYLK].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s volume=%s, (path=%s basename=%s, "
- "cmd=%s, type=%s)",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- volume, loc->path, basename,
- ((cmd == ENTRYLK_LOCK) ? "ENTRYLK_LOCK" : "ENTRYLK_UNLOCK"),
- ((type == ENTRYLK_RDLCK) ? "ENTRYLK_RDLCK" : "ENTRYLK_WRLCK"));
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s volume=%s, (path=%s "
+ "basename=%s, cmd=%s, type=%s)",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid),
+ volume, loc->path, basename,
+ ((cmd == ENTRYLK_LOCK) ? "ENTRYLK_LOCK" :
+ "ENTRYLK_UNLOCK"),
+ ((type == ENTRYLK_RDLCK) ? "ENTRYLK_RDLCK" :
+ "ENTRYLK_WRLCK"));
+
frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_entrylk_cbk,
FIRST_CHILD (this),
FIRST_CHILD (this)->fops->entrylk,
@@ -1196,15 +1481,20 @@ trace_entrylk (call_frame_t *frame, xlator_t *this,
return 0;
}
-
int
trace_inodelk (call_frame_t *frame, xlator_t *this, const char *volume,
loc_t *loc, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- char *cmd_str = NULL;
- char *type_str = NULL;
+ char *cmd_str = NULL;
+ char *type_str = NULL;
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_INODELK].enabled) {
+ char string[4096] = {0,};
switch (cmd) {
#if F_GETLK != F_GETLK64
case F_GETLK64:
@@ -1247,17 +1537,22 @@ trace_inodelk (call_frame_t *frame, xlator_t *this, const char *volume,
break;
}
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s volume=%s, (path=%s "
- "cmd=%s, type=%s, start=%llu, len=%llu, pid=%llu)",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- volume, loc->path,
- cmd_str, type_str, (unsigned long long) flock->l_start,
- (unsigned long long) flock->l_len,
- (unsigned long long) flock->l_pid);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s volume=%s, (path=%s "
+ "cmd=%s, type=%s, start=%llu, len=%llu, "
+ "pid=%llu)", frame->root->unique,
+ uuid_utoa (loc->inode->gfid), volume,
+ loc->path, cmd_str, type_str,
+ (unsigned long long)flock->l_start,
+ (unsigned long long) flock->l_len,
+ (unsigned long long) flock->l_pid);
+
frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_inodelk_cbk,
FIRST_CHILD (this),
FIRST_CHILD (this)->fops->inodelk,
@@ -1265,14 +1560,20 @@ trace_inodelk (call_frame_t *frame, xlator_t *this, const char *volume,
return 0;
}
-
int
trace_finodelk (call_frame_t *frame, xlator_t *this, const char *volume,
fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- char *cmd_str = NULL, *type_str = NULL;
+ char *cmd_str = NULL;
+ char *type_str = NULL;
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FINODELK].enabled) {
+ char string[4096] = {0,};
switch (cmd) {
#if F_GETLK != F_GETLK64
case F_GETLK64:
@@ -1315,16 +1616,21 @@ trace_finodelk (call_frame_t *frame, xlator_t *this, const char *volume,
break;
}
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s volume=%s, (fd =%p "
- "cmd=%s, type=%s, start=%llu, len=%llu, pid=%llu)",
- frame->root->unique, uuid_utoa (fd->inode->gfid), volume, fd,
- cmd_str, type_str, (unsigned long long) flock->l_start,
- (unsigned long long) flock->l_len,
- (unsigned long long) flock->l_pid);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s volume=%s, (fd =%p "
+ "cmd=%s, type=%s, start=%llu, len=%llu, "
+ "pid=%llu)", frame->root->unique,
+ uuid_utoa (fd->inode->gfid), volume, fd,
+ cmd_str, type_str,
+ (unsigned long long) flock->l_start,
+ (unsigned long long) flock->l_len,
+ (unsigned long long) flock->l_pid);
+
frame->local = fd->inode->gfid;
- }
+ LOG_ELEMENT (conf, string);
+ }
+out:
STACK_WIND (frame, trace_finodelk_cbk,
FIRST_CHILD (this),
FIRST_CHILD (this)->fops->finodelk,
@@ -1332,19 +1638,30 @@ trace_finodelk (call_frame_t *frame, xlator_t *this, const char *volume,
return 0;
}
-
int
trace_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_XATTROP].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s (path=%s flags=%d)",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, flags);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s (path=%s flags=%d)",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path,
+ flags);
+
frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_xattrop_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->xattrop,
@@ -1353,19 +1670,29 @@ trace_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
return 0;
}
-
int
trace_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FXATTROP].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s fd=%p, flags=%d",
- frame->root->unique, uuid_utoa (fd->inode->gfid),
- fd, flags);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s fd=%p, flags=%d",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), fd, flags);
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_fxattrop_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fxattrop,
@@ -1374,20 +1701,30 @@ trace_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
return 0;
}
-
int
trace_lookup (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_LOOKUP].enabled) {
+ char string[4096] = {0,};
/* TODO: print all the keys mentioned in xattr_req */
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path);
+
frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_lookup_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->lookup,
@@ -1396,18 +1733,28 @@ trace_lookup (call_frame_t *frame, xlator_t *this,
return 0;
}
-
int
trace_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_STAT].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path);
+
frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_stat_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->stat,
@@ -1416,18 +1763,30 @@ trace_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
return 0;
}
-
int
-trace_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, dict_t *xdata)
+trace_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_READLINK].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s, size=%"GF_PRI_SIZET")",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, size);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s, "
+ "size=%"GF_PRI_SIZET")", frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path,
+ size);
+
frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_readlink_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->readlink,
@@ -1436,19 +1795,29 @@ trace_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, di
return 0;
}
-
int
trace_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc,
mode_t mode, dev_t dev, mode_t umask, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_MKNOD].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s mode=0%o umask=0%o "
- "dev=%"GF_PRI_DEV")",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, mode, umask, dev);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s mode=%d "
+ "umask=0%o, dev=%"GF_PRI_DEV")",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path,
+ mode, umask, dev);
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_mknod_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->mknod,
@@ -1457,18 +1826,28 @@ trace_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc,
return 0;
}
-
int
trace_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
mode_t umask, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_MKDIR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s mode=0%o umask=0%o",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, mode, umask);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s mode=%d"
+ " umask=0%o", frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path,
+ mode, umask);
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_mkdir_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->mkdir,
@@ -1476,19 +1855,29 @@ trace_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
return 0;
}
-
int
trace_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_UNLINK].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s flag=%d",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, xflag);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s flag=%d",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path,
+ xflag);
+
frame->local = loc->inode->gfid;
- }
+ LOG_ELEMENT (conf, string);
+ }
+out:
STACK_WIND (frame, trace_unlink_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->unlink,
@@ -1496,19 +1885,30 @@ trace_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
return 0;
}
-
int
trace_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_RMDIR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s flags=%d",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, flags);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s flags=%d",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path,
+ flags);
+
frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_rmdir_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->rmdir,
@@ -1517,18 +1917,28 @@ trace_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
return 0;
}
-
int
trace_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
loc_t *loc, mode_t umask, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_SYMLINK].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s linkpath=%s, path=%s umask=0%o",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- linkpath, loc->path, umask);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s linkpath=%s, path=%s"
+ " umask=0%o", frame->root->unique,
+ uuid_utoa (loc->inode->gfid), linkpath,
+ loc->path, umask);
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_symlink_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->symlink,
@@ -1537,15 +1947,20 @@ trace_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
return 0;
}
-
int
trace_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
dict_t *xdata)
{
- char oldgfid[50] = {0,};
- char newgfid[50] = {0,};
+ char oldgfid[50] = {0,};
+ char newgfid[50] = {0,};
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_RENAME].enabled) {
+ char string[4096] = {0,};
if (newloc->inode)
uuid_utoa_r (newloc->inode->gfid, newgfid);
else
@@ -1553,13 +1968,18 @@ trace_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
uuid_utoa_r (oldloc->inode->gfid, oldgfid);
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": oldgfid=%s oldpath=%s --> newgfid=%s newpath=%s",
- frame->root->unique, oldgfid, oldloc->path, newgfid, newloc->path);
+ snprintf (string, sizeof (string),
+ "%"PRId64": oldgfid=%s oldpath=%s --> "
+ "newgfid=%s newpath=%s",
+ frame->root->unique, oldgfid,
+ oldloc->path, newgfid, newloc->path);
frame->local = oldloc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_rename_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->rename,
@@ -1568,15 +1988,20 @@ trace_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
return 0;
}
-
int
trace_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
dict_t *xdata)
{
- char oldgfid[50] = {0,};
- char newgfid[50] = {0,};
+ char oldgfid[50] = {0,};
+ char newgfid[50] = {0,};
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_LINK].enabled) {
+ char string[4096] = {0,};
if (newloc->inode)
uuid_utoa_r (newloc->inode->gfid, newgfid);
else
@@ -1584,13 +2009,18 @@ trace_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
uuid_utoa_r (oldloc->inode->gfid, oldgfid);
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": oldgfid=%s oldpath=%s --> newgfid=%s newpath=%s",
- frame->root->unique, oldgfid, oldloc->path,
- newgfid, newloc->path);
+ snprintf (string, sizeof (string),
+ "%"PRId64": oldgfid=%s oldpath=%s --> "
+ "newgfid=%s newpath=%s", frame->root->unique,
+ oldgfid, oldloc->path, newgfid,
+ newloc->path);
+
frame->local = oldloc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_link_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->link,
@@ -1598,43 +2028,69 @@ trace_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
return 0;
}
-
int
trace_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
- char actime_str[64] = {0,};
- char modtime_str[64] = {0,};
+ uint64_t ia_time = 0;
+ char actime_str[256] = {0,};
+ char modtime_str[256] = {0,};
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_SETATTR].enabled) {
+ char string[4096] = {0,};
if (valid & GF_SET_ATTR_MODE) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s mode=%o)",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type));
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s mode=%o)",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid),
+ loc->path,
+ st_mode_from_ia (stbuf->ia_prot,
+ stbuf->ia_type));
+
+ LOG_ELEMENT (conf, string);
+ memset (string, 0 , sizeof (string));
}
if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s uid=%o, gid=%o",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, stbuf->ia_uid, stbuf->ia_gid);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s uid=%o,"
+ " gid=%o", frame->root->unique,
+ uuid_utoa (loc->inode->gfid),
+ loc->path, stbuf->ia_uid,
+ stbuf->ia_gid);
+
+ LOG_ELEMENT (conf, string);
+ memset (string, 0 , sizeof (string));
}
if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
- gf_time_fmt (actime_str, sizeof actime_str,
- stbuf->ia_atime, gf_timefmt_bdT);
- gf_time_fmt (modtime_str, sizeof modtime_str,
- stbuf->ia_mtime, gf_timefmt_bdT);
-
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s ia_atime=%s, ia_mtime=%s",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, actime_str, modtime_str);
+ ia_time = stbuf->ia_atime;
+ strftime (actime_str, 256, "[%b %d %H:%M:%S]",
+ localtime ((time_t *)&ia_time));
+
+ ia_time = stbuf->ia_mtime;
+ strftime (modtime_str, 256, "[%b %d %H:%M:%S]",
+ localtime ((time_t *)&ia_time));
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s "
+ "ia_atime=%s, ia_mtime=%s",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid),
+ loc->path, actime_str, modtime_str);
+
+ LOG_ELEMENT (conf, string);
+ memset (string, 0 , sizeof (string));
}
frame->local = loc->inode->gfid;
}
+out:
STACK_WIND (frame, trace_setattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->setattr,
@@ -1643,43 +2099,67 @@ trace_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
return 0;
}
-
int
trace_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
- char actime_str[64] = {0,};
- char modtime_str[64] = {0,};
+ uint64_t ia_time = 0;
+ char actime_str[256] = {0,};
+ char modtime_str[256] = {0,};
+ trace_conf_t *conf = NULL;
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FSETATTR].enabled) {
+ char string[4096] = {0,};
if (valid & GF_SET_ATTR_MODE) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s fd=%p, mode=%o",
- frame->root->unique, uuid_utoa (fd->inode->gfid), fd,
- st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type));
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s fd=%p, mode=%o",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), fd,
+ st_mode_from_ia (stbuf->ia_prot,
+ stbuf->ia_type));
+
+ LOG_ELEMENT (conf, string);
+ memset (string, 0, sizeof (string));
}
if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s fd=%p, uid=%o, gid=%o",
- frame->root->unique, uuid_utoa (fd->inode->gfid),
- fd, stbuf->ia_uid, stbuf->ia_gid);
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s fd=%p, uid=%o, "
+ "gid=%o", frame->root->unique,
+ uuid_utoa (fd->inode->gfid),
+ fd, stbuf->ia_uid, stbuf->ia_gid);
+
+ LOG_ELEMENT (conf, string);
+ memset (string, 0, sizeof (string));
}
if (valid & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
- gf_time_fmt (actime_str, sizeof actime_str,
- stbuf->ia_atime, gf_timefmt_bdT);
- gf_time_fmt (modtime_str, sizeof modtime_str,
- stbuf->ia_mtime, gf_timefmt_bdT);
-
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s fd=%p ia_atime=%s, ia_mtime=%s",
- frame->root->unique, uuid_utoa (fd->inode->gfid),
- fd, actime_str, modtime_str);
+ ia_time = stbuf->ia_atime;
+ strftime (actime_str, 256, "[%b %d %H:%M:%S]",
+ localtime ((time_t *)&ia_time));
+
+ ia_time = stbuf->ia_mtime;
+ strftime (modtime_str, 256, "[%b %d %H:%M:%S]",
+ localtime ((time_t *)&ia_time));
+
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s fd=%p "
+ "ia_atime=%s, ia_mtime=%s",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid),
+ fd, actime_str, modtime_str);
+
+ LOG_ELEMENT (conf, string);
+ memset (string, 0, sizeof (string));
}
frame->local = fd->inode->gfid;
}
+out:
STACK_WIND (frame, trace_fsetattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fsetattr,
@@ -1688,19 +2168,30 @@ trace_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
return 0;
}
-
int
trace_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
off_t offset, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_TRUNCATE].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s, offset=%"PRId64"",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, offset);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s, "
+ "offset=%"PRId64"", frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path,
+ offset);
+
frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_truncate_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->truncate,
@@ -1709,19 +2200,30 @@ trace_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
return 0;
}
-
int
trace_open (call_frame_t *frame, xlator_t *this, loc_t *loc,
int32_t flags, fd_t *fd, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_OPEN].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s flags=%d fd=%p",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, flags, fd);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s flags=%d fd=%p",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path,
+ flags, fd);
+
frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_open_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->open,
@@ -1729,42 +2231,62 @@ trace_open (call_frame_t *frame, xlator_t *this, loc_t *loc,
return 0;
}
-
int
trace_create (call_frame_t *frame, xlator_t *this, loc_t *loc,
int32_t flags, mode_t mode, mode_t umask, fd_t *fd,
dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_CREATE].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s fd=%p flags=0%o mode=0%o "
- "umask=0%o",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, fd, flags, mode, umask);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s, fd=%p, "
+ "flags=0%o mode=0%o umask=0%o",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path,
+ fd, flags, mode, umask);
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_create_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->create,
loc, flags, mode, umask, fd, xdata);
-
return 0;
}
-
int
trace_readv (call_frame_t *frame, xlator_t *this, fd_t *fd,
size_t size, off_t offset, uint32_t flags, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_READ].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s fd=%p, size=%"GF_PRI_SIZET", "
- "offset=%"PRId64" flags=0%x)",
- frame->root->unique, uuid_utoa (fd->inode->gfid),
- fd, size, offset, flags);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s fd=%p, size=%"
+ GF_PRI_SIZET"offset=%"PRId64" flags=0%x)",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), fd, size,
+ offset, flags);
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_readv_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->readv,
@@ -1772,21 +2294,32 @@ trace_readv (call_frame_t *frame, xlator_t *this, fd_t *fd,
return 0;
}
-
int
trace_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count,
off_t offset, uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_WRITE].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s fd=%p, count=%d, offset=%"PRId64
- " flag=0%x)",
- frame->root->unique, uuid_utoa (fd->inode->gfid),
- fd, count, offset, flags);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s fd=%p, count=%d, "
+ " offset=%"PRId64" flags=0%x)",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), fd, count,
+ offset, flags);
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_writev_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->writev,
@@ -1794,17 +2327,26 @@ trace_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
return 0;
}
-
int
trace_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_STATFS].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s",
- frame->root->unique, (loc->inode)?
- uuid_utoa (loc->inode->gfid):"0", loc->path);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s",
+ frame->root->unique, (loc->inode)?
+ uuid_utoa (loc->inode->gfid):"0", loc->path);
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_statfs_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->statfs,
@@ -1812,17 +2354,28 @@ trace_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
return 0;
}
-
int
trace_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FLUSH].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s fd=%p",
- frame->root->unique, uuid_utoa (fd->inode->gfid), fd);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s fd=%p",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), fd);
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_flush_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->flush,
@@ -1830,17 +2383,29 @@ trace_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
return 0;
}
-
int
-trace_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict_t *xdata)
+trace_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FSYNC].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s flags=%d fd=%p",
- frame->root->unique, uuid_utoa (fd->inode->gfid), flags, fd);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s flags=%d fd=%p",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), flags, fd);
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_fsync_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fsync,
@@ -1848,19 +2413,30 @@ trace_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, dict_
return 0;
}
-
int
trace_setxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_SETXATTR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s flags=%d",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, flags);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s flags=%d",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path,
+ flags);
+
frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_setxattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->setxattr,
@@ -1868,19 +2444,30 @@ trace_setxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
-
int
trace_getxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_GETXATTR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s name=%s",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, name);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s name=%s",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path,
+ name);
+
frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_getxattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->getxattr,
@@ -1888,19 +2475,30 @@ trace_getxattr (call_frame_t *frame, xlator_t *this,
return 0;
}
-
int
trace_removexattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_REMOVEXATTR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s name=%s",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, name);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s name=%s",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path,
+ name);
+
frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_removexattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->removexattr,
@@ -1909,18 +2507,29 @@ trace_removexattr (call_frame_t *frame, xlator_t *this,
return 0;
}
-
int
-trace_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, dict_t *xdata)
+trace_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_OPENDIR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s fd=%p",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, fd);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s fd=%p",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid), loc->path, fd);
+
frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_opendir_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->opendir,
@@ -1932,15 +2541,27 @@ int
trace_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, dict_t *dict)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_READDIRP].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s fd=%p, size=%"GF_PRI_SIZET", "
- "offset=%"PRId64" dict=%p",
- frame->root->unique, uuid_utoa (fd->inode->gfid),
- fd, size, offset, dict);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s fd=%p, size=%"GF_PRI_SIZET
+ ", offset=%"PRId64" dict=%p",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), fd, size,
+ offset, dict);
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_readdirp_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->readdirp,
@@ -1949,19 +2570,31 @@ trace_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
return 0;
}
-
int
trace_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
size_t size, off_t offset, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_READDIR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s fd=%p, size=%"GF_PRI_SIZET", offset=%"PRId64,
- frame->root->unique, uuid_utoa (fd->inode->gfid),
- fd, size, offset);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s fd=%p, size=%"GF_PRI_SIZET
+ ", offset=%"PRId64,
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), fd, size,
+ offset);
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_readdir_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->readdir,
@@ -1970,19 +2603,29 @@ trace_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
return 0;
}
-
int
trace_fsyncdir (call_frame_t *frame, xlator_t *this,
fd_t *fd, int32_t datasync, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FSYNCDIR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s datasync=%d fd=%p",
- frame->root->unique, uuid_utoa (fd->inode->gfid),
- datasync, fd);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s datasync=%d fd=%p",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), datasync, fd);
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_fsyncdir_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fsyncdir,
@@ -1990,18 +2633,30 @@ trace_fsyncdir (call_frame_t *frame, xlator_t *this,
return 0;
}
-
int
-trace_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, dict_t *xdata)
+trace_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_ACCESS].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s path=%s mask=0%o",
- frame->root->unique, uuid_utoa (loc->inode->gfid),
- loc->path, mask);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s path=%s mask=0%o",
+ frame->root->unique,
+ uuid_utoa (loc->inode->gfid),
+ loc->path, mask);
+
frame->local = loc->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_access_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->access,
@@ -2009,19 +2664,30 @@ trace_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask, dic
return 0;
}
-
int32_t
trace_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
int32_t len, dict_t *xdata)
{
+
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_RCHECKSUM].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s offset=%"PRId64" len=%u fd=%p",
- frame->root->unique, uuid_utoa (fd->inode->gfid),
- offset, len, fd);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s offset=%"PRId64
+ "len=%u fd=%p", frame->root->unique,
+ uuid_utoa (fd->inode->gfid), offset, len, fd);
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_rchecksum_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->rchecksum,
@@ -2036,17 +2702,31 @@ trace_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume,
fd_t *fd, const char *basename, entrylk_cmd cmd,
entrylk_type type, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FENTRYLK].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s volume=%s, (fd=%p basename=%s, "
- "cmd=%s, type=%s)",
- frame->root->unique, uuid_utoa (fd->inode->gfid),
- volume, fd, basename,
- ((cmd == ENTRYLK_LOCK) ? "ENTRYLK_LOCK" : "ENTRYLK_UNLOCK"),
- ((type == ENTRYLK_RDLCK) ? "ENTRYLK_RDLCK" : "ENTRYLK_WRLCK"));
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s volume=%s, (fd=%p "
+ "basename=%s, cmd=%s, type=%s)",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), volume, fd,
+ basename,
+ ((cmd == ENTRYLK_LOCK) ? "ENTRYLK_LOCK" :
+ "ENTRYLK_UNLOCK"),
+ ((type == ENTRYLK_RDLCK) ? "ENTRYLK_RDLCK" :
+ "ENTRYLK_WRLCK"));
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_fentrylk_cbk,
FIRST_CHILD (this),
FIRST_CHILD (this)->fops->fentrylk,
@@ -2059,14 +2739,25 @@ int32_t
trace_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
const char *name, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FGETXATTR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s fd=%p name=%s",
- frame->root->unique, uuid_utoa (fd->inode->gfid),
- fd, name);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s fd=%p name=%s",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), fd, name);
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_fgetxattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fgetxattr,
@@ -2078,14 +2769,25 @@ int32_t
trace_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
dict_t *dict, int32_t flags, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FSETXATTR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s fd=%p flags=%d",
- frame->root->unique, uuid_utoa (fd->inode->gfid),
- fd, flags);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s fd=%p flags=%d",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), fd, flags);
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_fsetxattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fsetxattr,
@@ -2097,14 +2799,25 @@ int
trace_ftruncate (call_frame_t *frame, xlator_t *this,
fd_t *fd, off_t offset, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FTRUNCATE].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s offset=%"PRId64" fd=%p",
- frame->root->unique, uuid_utoa (fd->inode->gfid),
- offset, fd);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s offset=%"PRId64" fd=%p",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), offset, fd);
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_ftruncate_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->ftruncate,
@@ -2113,17 +2826,28 @@ trace_ftruncate (call_frame_t *frame, xlator_t *this,
return 0;
}
-
int
trace_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_FSTAT].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s fd=%p",
- frame->root->unique, uuid_utoa (fd->inode->gfid), fd);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s fd=%p",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), fd);
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_fstat_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fstat,
@@ -2131,21 +2855,34 @@ trace_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
return 0;
}
-
int
trace_lk (call_frame_t *frame, xlator_t *this, fd_t *fd,
int32_t cmd, struct gf_flock *lock, dict_t *xdata)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_LK].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": gfid=%s fd=%p, cmd=%d, lock {l_type=%d, l_whence=%d, "
- "l_start=%"PRId64", l_len=%"PRId64", l_pid=%u})",
- frame->root->unique, uuid_utoa (fd->inode->gfid), fd,
- cmd, lock->l_type, lock->l_whence,
- lock->l_start, lock->l_len, lock->l_pid);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "%"PRId64": gfid=%s fd=%p, cmd=%d, "
+ "lock {l_type=%d, "
+ "l_whence=%d, l_start=%"PRId64", "
+ "l_len=%"PRId64", l_pid=%u})",
+ frame->root->unique,
+ uuid_utoa (fd->inode->gfid), fd, cmd,
+ lock->l_type, lock->l_whence,
+ lock->l_start, lock->l_len, lock->l_pid);
+
frame->local = fd->inode->gfid;
+
+ LOG_ELEMENT (conf, string);
}
+out:
STACK_WIND (frame, trace_lk_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->lk,
@@ -2156,40 +2893,71 @@ trace_lk (call_frame_t *frame, xlator_t *this, fd_t *fd,
int32_t
trace_forget (xlator_t *this, inode_t *inode)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
/* If user want to understand when a lookup happens,
he should know about 'forget' too */
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_LOOKUP].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "gfid=%s", uuid_utoa (inode->gfid));
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "gfid=%s", uuid_utoa (inode->gfid));
+
+ LOG_ELEMENT (conf, string);
}
+
+out:
return 0;
}
-
int32_t
trace_releasedir (xlator_t *this, fd_t *fd)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_OPENDIR].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "gfid=%s fd=%p", uuid_utoa (fd->inode->gfid), fd);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "gfid=%s fd=%p",
+ uuid_utoa (fd->inode->gfid), fd);
+
+ LOG_ELEMENT (conf, string);
}
+out:
return 0;
}
int32_t
trace_release (xlator_t *this, fd_t *fd)
{
+ trace_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->log_file && !conf->log_history)
+ goto out;
if (trace_fop_names[GF_FOP_OPEN].enabled ||
trace_fop_names[GF_FOP_CREATE].enabled) {
- gf_log (this->name, GF_LOG_INFO,
- "gfid=%s fd=%p", uuid_utoa (fd->inode->gfid), fd);
+ char string[4096] = {0,};
+ snprintf (string, sizeof (string),
+ "gfid=%s fd=%p",
+ uuid_utoa (fd->inode->gfid), fd);
+
+ LOG_ELEMENT (conf, string);
}
+
+out:
return 0;
}
-
void
enable_all_calls (int enabled)
{
@@ -2199,7 +2967,6 @@ enable_all_calls (int enabled)
trace_fop_names[i].enabled = enabled;
}
-
void
enable_call (const char *name, int enabled)
{
@@ -2227,6 +2994,105 @@ process_call_list (const char *list, int include)
}
}
+int32_t
+trace_dump_history (xlator_t *this)
+{
+ int ret = -1;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0,};
+ trace_conf_t *conf = NULL;
+
+ GF_VALIDATE_OR_GOTO ("trace", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, this->history, out);
+
+ conf = this->private;
+ // Is it ok to return silently if log-history option his off?
+ if (conf && conf->log_history == _gf_true) {
+ gf_proc_dump_build_key (key_prefix, "xlator.debug.trace",
+ "history");
+ gf_proc_dump_add_section (key_prefix);
+ eh_dump (this->history, NULL, dump_history_trace);
+ }
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init (this, gf_trace_mt_end + 1);
+
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
+ " failed");
+ return ret;
+ }
+
+ return ret;
+}
+
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+ int32_t ret = -1;
+ trace_conf_t *conf = NULL;
+ char *includes = NULL, *excludes = NULL;
+
+ GF_VALIDATE_OR_GOTO ("quick-read", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, this->private, out);
+ GF_VALIDATE_OR_GOTO (this->name, options, out);
+
+ conf = this->private;
+
+ includes = data_to_str (dict_get (options, "include-ops"));
+ excludes = data_to_str (dict_get (options, "exclude-ops"));
+
+ {
+ int i;
+ for (i = 0; i < GF_FOP_MAXVALUE; i++) {
+ if (gf_fop_list[i])
+ strncpy (trace_fop_names[i].name,
+ gf_fop_list[i],
+ strlen (gf_fop_list[i]));
+ else
+ strncpy (trace_fop_names[i].name, ":O",
+ strlen (":O"));
+ trace_fop_names[i].enabled = 1;
+ }
+ }
+
+ if (includes && excludes) {
+ gf_log (this->name,
+ GF_LOG_ERROR,
+ "must specify only one of 'include-ops' and "
+ "'exclude-ops'");
+ goto out;
+ }
+
+ if (includes)
+ process_call_list (includes, 1);
+ if (excludes)
+ process_call_list (excludes, 0);
+
+ /* Should resizing of the event-history be allowed in reconfigure?
+ * for which a new event_history might have to be allocated and the
+ * older history has to be freed.
+ */
+ GF_OPTION_RECONF ("log-file", conf->log_file, options, bool, out);
+
+ GF_OPTION_RECONF ("log-history", conf->log_history, options, bool, out);
+
+ ret = 0;
+
+out:
+ return ret;
+}
int32_t
init (xlator_t *this)
@@ -2234,6 +3100,10 @@ init (xlator_t *this)
dict_t *options = NULL;
char *includes = NULL, *excludes = NULL;
char *forced_loglevel = NULL;
+ eh_t *history = NULL;
+ int ret = -1;
+ size_t history_size = TRACE_DEFAULT_HISTORY_SIZE;
+ trace_conf_t *conf = NULL;
if (!this)
return -1;
@@ -2248,6 +3118,12 @@ init (xlator_t *this)
"dangling volume. check volfile ");
}
+ conf = GF_CALLOC (1, sizeof (trace_conf_t), gf_trace_mt_trace_conf_t);
+ if (!conf) {
+ gf_log (this->name, GF_LOG_ERROR, "cannot allocate "
+ "xl->private");
+ return -1;
+ }
options = this->options;
includes = data_to_str (dict_get (options, "include-ops"));
@@ -2256,9 +3132,13 @@ init (xlator_t *this)
{
int i;
for (i = 0; i < GF_FOP_MAXVALUE; i++) {
- trace_fop_names[i].name = (gf_fop_list[i] ?
- (char *)gf_fop_list[i] :
- ":O");
+ if (gf_fop_list[i])
+ strncpy (trace_fop_names[i].name,
+ gf_fop_list[i],
+ strlen (gf_fop_list[i]));
+ else
+ strncpy (trace_fop_names[i].name, ":O",
+ strlen (":O"));
trace_fop_names[i].enabled = 1;
}
}
@@ -2266,14 +3146,43 @@ init (xlator_t *this)
if (includes && excludes) {
gf_log (this->name,
GF_LOG_ERROR,
- "must specify only one of 'include-ops' and 'exclude-ops'");
+ "must specify only one of 'include-ops' and "
+ "'exclude-ops'");
return -1;
}
+
if (includes)
process_call_list (includes, 1);
if (excludes)
process_call_list (excludes, 0);
+
+ GF_OPTION_INIT ("history-size", conf->history_size, size, out);
+
+ gf_log (this->name, GF_LOG_INFO, "history size %"GF_PRI_SIZET,
+ history_size);
+
+ GF_OPTION_INIT ("log-file", conf->log_file, bool, out);
+
+ gf_log (this->name, GF_LOG_INFO, "logging to file %s",
+ (conf->log_file == _gf_true)?"enabled":"disabled");
+
+ GF_OPTION_INIT ("log-history", conf->log_history, bool, out);
+
+ gf_log (this->name, GF_LOG_DEBUG, "logging to history %s",
+ (conf->log_history == _gf_true)?"enabled":"disabled");
+
+ history = eh_new (history_size, _gf_false, NULL);
+ if (!history) {
+ gf_log (this->name, GF_LOG_ERROR, "event history cannot be "
+ "initialized");
+ return -1;
+ }
+
+ this->history = history;
+
+ conf->trace_log_level = GF_LOG_INFO;
+
if (dict_get (options, "force-log-level")) {
forced_loglevel = data_to_str (dict_get (options,
"force-log-level"));
@@ -2281,25 +3190,34 @@ init (xlator_t *this)
goto setloglevel;
if (strcmp (forced_loglevel, "INFO") == 0)
- trace_log_level = GF_LOG_INFO;
+ conf->trace_log_level = GF_LOG_INFO;
else if (strcmp (forced_loglevel, "TRACE") == 0)
- trace_log_level = GF_LOG_TRACE;
+ conf->trace_log_level = GF_LOG_TRACE;
else if (strcmp (forced_loglevel, "ERROR") == 0)
- trace_log_level = GF_LOG_ERROR;
+ conf->trace_log_level = GF_LOG_ERROR;
else if (strcmp (forced_loglevel, "DEBUG") == 0)
- trace_log_level = GF_LOG_DEBUG;
+ conf->trace_log_level = GF_LOG_DEBUG;
else if (strcmp (forced_loglevel, "WARNING") == 0)
- trace_log_level = GF_LOG_WARNING;
+ conf->trace_log_level = GF_LOG_WARNING;
else if (strcmp (forced_loglevel, "CRITICAL") == 0)
- trace_log_level = GF_LOG_CRITICAL;
+ conf->trace_log_level = GF_LOG_CRITICAL;
else if (strcmp (forced_loglevel, "NONE") == 0)
- trace_log_level = GF_LOG_NONE;
+ conf->trace_log_level = GF_LOG_NONE;
}
setloglevel:
- gf_log_set_loglevel (trace_log_level);
+ gf_log_set_loglevel (conf->trace_log_level);
+ this->private = conf;
+ ret = 0;
+out:
+ if (ret == -1) {
+ if (history)
+ GF_FREE (history);
+ if (conf)
+ GF_FREE (conf);
+ }
- return 0;
+ return ret;
}
void
@@ -2308,6 +3226,9 @@ fini (xlator_t *this)
if (!this)
return;
+ if (this->history)
+ eh_destroy (this->history);
+
gf_log (this->name, GF_LOG_INFO,
"trace translator unloaded");
return;
@@ -2356,7 +3277,6 @@ struct xlator_fops fops = {
.fsetattr = trace_fsetattr,
};
-
struct xlator_cbks cbks = {
.release = trace_release,
.releasedir = trace_releasedir,
@@ -2372,5 +3292,21 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_STR
/*.value = { ""} */
},
+ { .key = {"history-size"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .default_value = "1024",
+ },
+ { .key = {"log-file"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ },
+ { .key = {"log-history"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ },
{ .key = {NULL} },
};
+
+struct xlator_dumpops dumpops = {
+ .history = trace_dump_history
+};
diff --git a/xlators/debug/trace/src/trace.h b/xlators/debug/trace/src/trace.h
new file mode 100644
index 000000000..62d1bc9c9
--- /dev/null
+++ b/xlators/debug/trace/src/trace.h
@@ -0,0 +1,61 @@
+/*
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <time.h>
+#include <errno.h>
+#include "glusterfs.h"
+#include "xlator.h"
+#include "common-utils.h"
+#include "event-history.h"
+#include "logging.h"
+#include "circ-buff.h"
+#include "statedump.h"
+#include "options.h"
+
+#define TRACE_DEFAULT_HISTORY_SIZE 1024
+
+typedef struct {
+ /* Since the longest fop name is fremovexattr i.e 12 characters, array size
+ * is kept 24, i.e double of the maximum.
+ */
+ char name[24];
+ int enabled;
+} trace_fop_name_t;
+
+trace_fop_name_t trace_fop_names[GF_FOP_MAXVALUE];
+
+typedef struct {
+ gf_boolean_t log_file;
+ gf_boolean_t log_history;
+ size_t history_size;
+ int trace_log_level;
+} trace_conf_t;
+
+#define TRACE_STACK_UNWIND(op, frame, params ...) \
+ do { \
+ frame->local = NULL; \
+ STACK_UNWIND_STRICT (op, frame, params); \
+ } while (0);
+
+#define LOG_ELEMENT(_conf, _string) \
+ do { \
+ if (_conf) { \
+ if ((_conf->log_history) == _gf_true) \
+ gf_log_eh ("%s", _string); \
+ if ((_conf->log_file) == _gf_true) \
+ gf_log (THIS->name, _conf->trace_log_level, \
+ "%s", _string); \
+ } \
+ } while (0);
diff --git a/xlators/encryption/Makefile.am b/xlators/encryption/Makefile.am
index 2cbde680f..36efc6698 100644
--- a/xlators/encryption/Makefile.am
+++ b/xlators/encryption/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = rot-13
+SUBDIRS = rot-13 crypt
CLEANFILES =
diff --git a/xlators/encryption/crypt/Makefile.am b/xlators/encryption/crypt/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/encryption/crypt/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/encryption/crypt/src/Makefile.am b/xlators/encryption/crypt/src/Makefile.am
new file mode 100644
index 000000000..b13f65043
--- /dev/null
+++ b/xlators/encryption/crypt/src/Makefile.am
@@ -0,0 +1,24 @@
+if ENABLE_CRYPT_XLATOR
+
+xlator_LTLIBRARIES = crypt.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/encryption
+
+crypt_la_LDFLAGS = -module -avoid-version -lssl -lcrypto
+
+crypt_la_SOURCES = keys.c data.c metadata.c atom.c crypt.c
+crypt_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = crypt-common.h crypt-mem-types.h crypt.h metadata.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
+
+else
+
+noinst_DIST = keys.c data.c metadata.c atom.c crypt.c
+noinst_HEADERS = crypt-common.h crypt-mem-types.h crypt.h metadata.h
+
+endif \ No newline at end of file
diff --git a/xlators/encryption/crypt/src/atom.c b/xlators/encryption/crypt/src/atom.c
new file mode 100644
index 000000000..1ec41495c
--- /dev/null
+++ b/xlators/encryption/crypt/src/atom.c
@@ -0,0 +1,962 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "defaults.h"
+#include "crypt-common.h"
+#include "crypt.h"
+
+/*
+ * Glossary
+ *
+ *
+ * cblock (or cipher block). A logical unit in a file.
+ * cblock size is defined as the number of bits
+ * in an input (or output) block of the block
+ * cipher (*). Cipher block size is a property of
+ * cipher algorithm. E.g. cblock size is 64 bits
+ * for DES, 128 bits for AES, etc.
+ *
+ * atomic cipher A cipher algorithm, which requires some chunks of
+ * algorithm text to be padded at left and(or) right sides before
+ * cipher transaform.
+ *
+ *
+ * block (atom) Minimal chunk of file's data, which doesn't require
+ * padding. We'll consider logical units in a file of
+ * block size (atom size).
+ *
+ * cipher algorithm Atomic cipher algorithm, which requires the last
+ * with EOF issue incomplete cblock in a file to be padded with some
+ * data (usually zeros).
+ *
+ *
+ * operation, which reading/writing from offset, which is not aligned to
+ * forms a gap at to atom size
+ * the beginning
+ *
+ *
+ * operation, which reading/writing count bytes starting from offset off,
+ * forms a gap at so that off+count is not aligned to atom_size
+ * the end
+ *
+ * head block the first atom affected by an operation, which forms
+ * a gap at the beginning, or(and) at the end.
+ * Сomment. Head block has at least one gap (either at
+ * the beginning, or at the end)
+ *
+ *
+ * tail block the last atom different from head, affected by an
+ * operation, which forms a gap at the end.
+ * Сomment: Tail block has exactly one gap (at the end).
+ *
+ *
+ * partial block head or tail block
+ *
+ *
+ * full block block without gaps.
+ *
+ *
+ * (*) Recommendation for Block Cipher Modes of Operation
+ * Methods and Techniques
+ * NIST Special Publication 800-38A Edition 2001
+ */
+
+/*
+ * atom->offset_at()
+ */
+static off_t offset_at_head(struct avec_config *conf)
+{
+ return conf->aligned_offset;
+}
+
+static off_t offset_at_hole_head(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return offset_at_head(get_hole_conf(frame));
+}
+
+static off_t offset_at_data_head(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return offset_at_head(get_data_conf(frame));
+}
+
+
+static off_t offset_at_tail(struct avec_config *conf,
+ struct object_cipher_info *object)
+{
+ return conf->aligned_offset +
+ (conf->off_in_head ? get_atom_size(object) : 0) +
+ (conf->nr_full_blocks << get_atom_bits(object));
+}
+
+static off_t offset_at_hole_tail(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return offset_at_tail(get_hole_conf(frame), object);
+}
+
+
+static off_t offset_at_data_tail(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return offset_at_tail(get_data_conf(frame), object);
+}
+
+static off_t offset_at_full(struct avec_config *conf,
+ struct object_cipher_info *object)
+{
+ return conf->aligned_offset +
+ (conf->off_in_head ? get_atom_size(object) : 0);
+}
+
+static off_t offset_at_data_full(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return offset_at_full(get_data_conf(frame), object);
+}
+
+static off_t offset_at_hole_full(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return offset_at_full(get_hole_conf(frame), object);
+}
+
+/*
+ * atom->io_size_nopad()
+ */
+
+static uint32_t io_size_nopad_head(struct avec_config *conf,
+ struct object_cipher_info *object)
+{
+ uint32_t gap_at_beg;
+ uint32_t gap_at_end;
+
+ check_head_block(conf);
+
+ gap_at_beg = conf->off_in_head;
+
+ if (has_tail_block(conf) || has_full_blocks(conf) || conf->off_in_tail == 0 )
+ gap_at_end = 0;
+ else
+ gap_at_end = get_atom_size(object) - conf->off_in_tail;
+
+ return get_atom_size(object) - (gap_at_beg + gap_at_end);
+}
+
+static uint32_t io_size_nopad_tail(struct avec_config *conf,
+ struct object_cipher_info *object)
+{
+ check_tail_block(conf);
+ return conf->off_in_tail;
+}
+
+static uint32_t io_size_nopad_full(struct avec_config *conf,
+ struct object_cipher_info *object)
+{
+ check_full_block(conf);
+ return get_atom_size(object);
+}
+
+static uint32_t io_size_nopad_data_head(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return io_size_nopad_head(get_data_conf(frame), object);
+}
+
+static uint32_t io_size_nopad_hole_head(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return io_size_nopad_head(get_hole_conf(frame), object);
+}
+
+static uint32_t io_size_nopad_data_tail(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return io_size_nopad_tail(get_data_conf(frame), object);
+}
+
+static uint32_t io_size_nopad_hole_tail(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return io_size_nopad_tail(get_hole_conf(frame), object);
+}
+
+static uint32_t io_size_nopad_data_full(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return io_size_nopad_full(get_data_conf(frame), object);
+}
+
+static uint32_t io_size_nopad_hole_full(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return io_size_nopad_full(get_hole_conf(frame), object);
+}
+
+static uint32_t offset_in_head(struct avec_config *conf)
+{
+ check_cursor_head(conf);
+
+ return conf->off_in_head;
+}
+
+static uint32_t offset_in_tail(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return 0;
+}
+
+static uint32_t offset_in_full(struct avec_config *conf,
+ struct object_cipher_info *object)
+{
+ check_cursor_full(conf);
+
+ if (has_head_block(conf))
+ return (conf->cursor - 1) << get_atom_bits(object);
+ else
+ return conf->cursor << get_atom_bits(object);
+}
+
+static uint32_t offset_in_data_head(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return offset_in_head(get_data_conf(frame));
+}
+
+static uint32_t offset_in_hole_head(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return offset_in_head(get_hole_conf(frame));
+}
+
+static uint32_t offset_in_data_full(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return offset_in_full(get_data_conf(frame), object);
+}
+
+static uint32_t offset_in_hole_full(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return offset_in_full(get_hole_conf(frame), object);
+}
+
+/*
+ * atom->rmw()
+ */
+/*
+ * Pre-conditions:
+ * @vec contains plain text of the latest
+ * version.
+ *
+ * Uptodate gaps of the @partial block with
+ * this plain text, encrypt the whole block
+ * and write the result to disk.
+ */
+static int32_t rmw_partial_block(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iovec *vec,
+ int32_t count,
+ struct iatt *stbuf,
+ struct iobref *iobref,
+ struct rmw_atom *atom)
+{
+ size_t was_read = 0;
+ uint64_t file_size;
+ crypt_local_t *local = frame->local;
+ struct object_cipher_info *object = &local->info->cinfo;
+
+ struct iovec *partial = atom->get_iovec(frame, 0);
+ struct avec_config *conf = atom->get_config(frame);
+ end_writeback_handler_t end_writeback_partial_block;
+#if DEBUG_CRYPT
+ gf_boolean_t check_last_cblock = _gf_false;
+#endif
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (op_ret < 0)
+ goto exit;
+
+ file_size = local->cur_file_size;
+ was_read = op_ret;
+
+ if (atom->locality == HEAD_ATOM && conf->off_in_head) {
+ /*
+ * head atom with a non-uptodate gap
+ * at the beginning
+ *
+ * fill the gap with plain text of the
+ * latest version. Convert a part of hole
+ * (if any) to zeros.
+ */
+ int32_t i;
+ int32_t copied = 0;
+ int32_t to_gap; /* amount of data needed to uptodate
+ the gap at the beginning */
+#if 0
+ int32_t hole = 0; /* The part of the hole which
+ * got in the head block */
+#endif /* 0 */
+ to_gap = conf->off_in_head;
+
+ if (was_read < to_gap) {
+ if (file_size >
+ offset_at_head(conf) + was_read) {
+ /*
+ * It is impossible to uptodate
+ * head block: too few bytes have
+ * been read from disk, so that
+ * partial write is impossible.
+ *
+ * It could happen because of many
+ * reasons: IO errors, (meta)data
+ * corruption in the local file system,
+ * etc.
+ */
+ gf_log(this->name, GF_LOG_WARNING,
+ "Can not uptodate a gap at the beginning");
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto exit;
+ }
+#if 0
+ hole = to_gap - was_read;
+#endif /* 0 */
+ to_gap = was_read;
+ }
+ /*
+ * uptodate the gap at the beginning
+ */
+ for (i = 0; i < count && copied < to_gap; i++) {
+ int32_t to_copy;
+
+ to_copy = vec[i].iov_len;
+ if (to_copy > to_gap - copied)
+ to_copy = to_gap - copied;
+
+ memcpy(partial->iov_base, vec[i].iov_base, to_copy);
+ copied += to_copy;
+ }
+#if 0
+ /*
+ * If possible, convert part of the
+ * hole, which got in the head block
+ */
+ ret = TRY_LOCK(&local->hole_lock);
+ if (!ret) {
+ if (local->hole_handled)
+ /*
+ * already converted by
+ * crypt_writev_cbk()
+ */
+ UNLOCK(&local->hole_lock);
+ else {
+ /*
+ * convert the part of the hole
+ * which got in the head block
+ * to zeros.
+ *
+ * Update the orig_offset to make
+ * sure writev_cbk() won't care
+ * about this part of the hole.
+ *
+ */
+ memset(partial->iov_base + to_gap, 0, hole);
+
+ conf->orig_offset -= hole;
+ conf->orig_size += hole;
+ UNLOCK(&local->hole_lock);
+ }
+ }
+ else /*
+ * conversion is being performed
+ * by crypt_writev_cbk()
+ */
+ ;
+#endif /* 0 */
+ }
+ if (atom->locality == TAIL_ATOM ||
+ (!has_tail_block(conf) && conf->off_in_tail)) {
+ /*
+ * tail atom, or head atom with a non-uptodate
+ * gap at the end.
+ *
+ * fill the gap at the end of the block
+ * with plain text of the latest version.
+ * Pad the result, (if needed)
+ */
+ int32_t i;
+ int32_t to_gap;
+ int copied;
+ off_t off_in_tail;
+ int32_t to_copy;
+
+ off_in_tail = conf->off_in_tail;
+ to_gap = conf->gap_in_tail;
+
+ if (to_gap && was_read < off_in_tail + to_gap) {
+ /*
+ * It is impossible to uptodate
+ * the gap at the end: too few bytes
+ * have been read from disk, so that
+ * partial write is impossible.
+ *
+ * It could happen because of many
+ * reasons: IO errors, (meta)data
+ * corruption in the local file system,
+ * etc.
+ */
+ gf_log(this->name, GF_LOG_WARNING,
+ "Can not uptodate a gap at the end");
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto exit;
+ }
+ /*
+ * uptodate the gap at the end
+ */
+ copied = 0;
+ to_copy = to_gap;
+ for(i = count - 1; i >= 0 && to_copy > 0; i--) {
+ uint32_t from_vec, off_in_vec;
+
+ off_in_vec = 0;
+ from_vec = vec[i].iov_len;
+ if (from_vec > to_copy) {
+ off_in_vec = from_vec - to_copy;
+ from_vec = to_copy;
+ }
+ memcpy(partial->iov_base +
+ off_in_tail + to_gap - copied - from_vec,
+ vec[i].iov_base + off_in_vec,
+ from_vec);
+
+ gf_log(this->name, GF_LOG_DEBUG,
+ "uptodate %d bytes at tail. Offset at target(source): %d(%d)",
+ (int)from_vec,
+ (int)off_in_tail + to_gap - copied - from_vec,
+ (int)off_in_vec);
+
+ copied += from_vec;
+ to_copy -= from_vec;
+ }
+ partial->iov_len = off_in_tail + to_gap;
+
+ if (object_alg_should_pad(object)) {
+ int32_t resid = 0;
+ resid = partial->iov_len & (object_alg_blksize(object) - 1);
+ if (resid) {
+ /*
+ * append a new EOF padding
+ */
+ local->eof_padding_size =
+ object_alg_blksize(object) - resid;
+
+ gf_log(this->name, GF_LOG_DEBUG,
+ "set padding size %d",
+ local->eof_padding_size);
+
+ memset(partial->iov_base + partial->iov_len,
+ 1,
+ local->eof_padding_size);
+ partial->iov_len += local->eof_padding_size;
+#if DEBUG_CRYPT
+ gf_log(this->name, GF_LOG_DEBUG,
+ "pad cblock with %d zeros:",
+ local->eof_padding_size);
+ dump_cblock(this,
+ (unsigned char *)partial->iov_base +
+ partial->iov_len - object_alg_blksize(object));
+ check_last_cblock = _gf_true;
+#endif
+ }
+ }
+ }
+ /*
+ * encrypt the whole block
+ */
+ encrypt_aligned_iov(object,
+ partial,
+ 1,
+ atom->offset_at(frame, object));
+#if DEBUG_CRYPT
+ if (check_last_cblock == _gf_true) {
+ gf_log(this->name, GF_LOG_DEBUG,
+ "encrypt last cblock with offset %llu",
+ (unsigned long long)atom->offset_at(frame, object));
+ dump_cblock(this, (unsigned char *)partial->iov_base +
+ partial->iov_len - object_alg_blksize(object));
+ }
+#endif
+ set_local_io_params_writev(frame, object, atom,
+ atom->offset_at(frame, object),
+ iovec_get_size(partial, 1));
+ /*
+ * write the whole block to disk
+ */
+ end_writeback_partial_block = dispatch_end_writeback(local->fop);
+ conf->cursor ++;
+ STACK_WIND(frame,
+ end_writeback_partial_block,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev,
+ local->fd,
+ partial,
+ 1,
+ atom->offset_at(frame, object),
+ local->flags,
+ local->iobref_data,
+ local->xdata);
+
+ gf_log("crypt", GF_LOG_DEBUG,
+ "submit partial block: %d bytes from %d offset",
+ (int)iovec_get_size(partial, 1),
+ (int)atom->offset_at(frame, object));
+ exit:
+ return 0;
+}
+
+/*
+ * Perform a (read-)modify-write sequence.
+ * This should be performed only after approval
+ * of upper server-side manager, i.e. the caller
+ * needs to make sure this is his turn to rmw.
+ */
+void submit_partial(call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ atom_locality_type ltype)
+{
+ int32_t ret;
+ dict_t *dict;
+ struct rmw_atom *atom;
+ crypt_local_t *local = frame->local;
+ struct object_cipher_info *object = &local->info->cinfo;
+
+ atom = atom_by_types(local->active_setup, ltype);
+ /*
+ * To perform the "read" component of the read-modify-write
+ * sequence the crypt translator does stack_wind to itself.
+ *
+ * Pass current file size to crypt_readv()
+ */
+ dict = dict_new();
+ if (!dict) {
+ /*
+ * FIXME: Handle the error
+ */
+ gf_log("crypt", GF_LOG_WARNING, "Can not alloc dict");
+ return;
+ }
+ ret = dict_set(dict,
+ FSIZE_XATTR_PREFIX,
+ data_from_uint64(local->cur_file_size));
+ if (ret) {
+ /*
+ * FIXME: Handle the error
+ */
+ dict_unref(dict);
+ gf_log("crypt", GF_LOG_WARNING, "Can not set dict");
+ goto exit;
+ }
+ STACK_WIND(frame,
+ atom->rmw,
+ this,
+ this->fops->readv, /* crypt_readv */
+ fd,
+ atom->count_to_uptodate(frame, object), /* count */
+ atom->offset_at(frame, object), /* offset to read from */
+ 0,
+ dict);
+ exit:
+ dict_unref(dict);
+}
+
+/*
+ * submit blocks of FULL_ATOM type
+ */
+void submit_full(call_frame_t *frame, xlator_t *this)
+{
+ crypt_local_t *local = frame->local;
+ struct object_cipher_info *object = &local->info->cinfo;
+ struct rmw_atom *atom = atom_by_types(local->active_setup, FULL_ATOM);
+ uint32_t count; /* total number of full blocks to submit */
+ uint32_t granularity; /* number of blocks to submit in one iteration */
+
+ uint64_t off_in_file; /* start offset in the file, bytes */
+ uint32_t off_in_atom; /* start offset in the atom, blocks */
+ uint32_t blocks_written = 0; /* blocks written for this submit */
+
+ struct avec_config *conf = atom->get_config(frame);
+ end_writeback_handler_t end_writeback_full_block;
+ /*
+ * Write full blocks by groups of granularity size.
+ */
+ end_writeback_full_block = dispatch_end_writeback(local->fop);
+
+ if (is_ordered_mode(frame)) {
+ uint32_t skip = has_head_block(conf) ? 1 : 0;
+ count = 1;
+ granularity = 1;
+ /*
+ * calculate start offset using cursor value;
+ * here we should take into accout head block,
+ * which corresponds to cursor value 0.
+ */
+ off_in_file = atom->offset_at(frame, object) +
+ ((conf->cursor - skip) << get_atom_bits(object));
+ off_in_atom = conf->cursor - skip;
+ }
+ else {
+ /*
+ * in parallel mode
+ */
+ count = conf->nr_full_blocks;
+ granularity = MAX_IOVEC;
+ off_in_file = atom->offset_at(frame, object);
+ off_in_atom = 0;
+ }
+ while (count) {
+ uint32_t blocks_to_write = count;
+
+ if (blocks_to_write > granularity)
+ blocks_to_write = granularity;
+ if (conf->type == HOLE_ATOM)
+ /*
+ * reset iovec before encryption
+ */
+ memset(atom->get_iovec(frame, 0)->iov_base,
+ 0,
+ get_atom_size(object));
+ /*
+ * encrypt the group
+ */
+ encrypt_aligned_iov(object,
+ atom->get_iovec(frame,
+ off_in_atom +
+ blocks_written),
+ blocks_to_write,
+ off_in_file + (blocks_written <<
+ get_atom_bits(object)));
+
+ set_local_io_params_writev(frame, object, atom,
+ off_in_file + (blocks_written << get_atom_bits(object)),
+ blocks_to_write << get_atom_bits(object));
+
+ conf->cursor += blocks_to_write;
+
+ STACK_WIND(frame,
+ end_writeback_full_block,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev,
+ local->fd,
+ atom->get_iovec(frame, off_in_atom + blocks_written),
+ blocks_to_write,
+ off_in_file + (blocks_written << get_atom_bits(object)),
+ local->flags,
+ local->iobref_data ? local->iobref_data : local->iobref,
+ local->xdata);
+
+ gf_log("crypt", GF_LOG_DEBUG, "submit %d full blocks from %d offset",
+ blocks_to_write,
+ (int)(off_in_file + (blocks_written << get_atom_bits(object))));
+
+ count -= blocks_to_write;
+ blocks_written += blocks_to_write;
+ }
+ return;
+}
+
+static int32_t rmw_data_head(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iovec *vec,
+ int32_t count,
+ struct iatt *stbuf,
+ struct iobref *iobref,
+ dict_t *xdata)
+{
+ return rmw_partial_block(frame,
+ cookie,
+ this,
+ op_ret,
+ op_errno,
+ vec,
+ count,
+ stbuf,
+ iobref,
+ atom_by_types(DATA_ATOM, HEAD_ATOM));
+}
+
+static int32_t rmw_data_tail(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iovec *vec,
+ int32_t count,
+ struct iatt *stbuf,
+ struct iobref *iobref,
+ dict_t *xdata)
+{
+ return rmw_partial_block(frame,
+ cookie,
+ this,
+ op_ret,
+ op_errno,
+ vec,
+ count,
+ stbuf,
+ iobref,
+ atom_by_types(DATA_ATOM, TAIL_ATOM));
+}
+
+static int32_t rmw_hole_head(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iovec *vec,
+ int32_t count,
+ struct iatt *stbuf,
+ struct iobref *iobref,
+ dict_t *xdata)
+{
+ return rmw_partial_block(frame,
+ cookie,
+ this,
+ op_ret,
+ op_errno,
+ vec,
+ count,
+ stbuf,
+ iobref,
+ atom_by_types(HOLE_ATOM, HEAD_ATOM));
+}
+
+static int32_t rmw_hole_tail(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iovec *vec,
+ int32_t count,
+ struct iatt *stbuf,
+ struct iobref *iobref,
+ dict_t *xdata)
+{
+ return rmw_partial_block(frame,
+ cookie,
+ this,
+ op_ret,
+ op_errno,
+ vec,
+ count,
+ stbuf,
+ iobref,
+ atom_by_types(HOLE_ATOM, TAIL_ATOM));
+}
+
+/*
+ * atom->count_to_uptodate()
+ */
+static uint32_t count_to_uptodate_head(struct avec_config *conf,
+ struct object_cipher_info *object)
+{
+ if (conf->acount == 1 && conf->off_in_tail)
+ return get_atom_size(object);
+ else
+ /* there is no need to read the whole head block */
+ return conf->off_in_head;
+}
+
+static uint32_t count_to_uptodate_tail(struct avec_config *conf,
+ struct object_cipher_info *object)
+{
+ /* we need to read the whole tail block */
+ return get_atom_size(object);
+}
+
+static uint32_t count_to_uptodate_data_head(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return count_to_uptodate_head(get_data_conf(frame), object);
+}
+
+static uint32_t count_to_uptodate_data_tail(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return count_to_uptodate_tail(get_data_conf(frame), object);
+}
+
+static uint32_t count_to_uptodate_hole_head(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return count_to_uptodate_head(get_hole_conf(frame), object);
+}
+
+static uint32_t count_to_uptodate_hole_tail(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ return count_to_uptodate_tail(get_hole_conf(frame), object);
+}
+
+/* atom->get_config() */
+
+static struct avec_config *get_config_data(call_frame_t *frame)
+{
+ return &((crypt_local_t *)frame->local)->data_conf;
+}
+
+static struct avec_config *get_config_hole(call_frame_t *frame)
+{
+ return &((crypt_local_t *)frame->local)->hole_conf;
+}
+
+/*
+ * atom->get_iovec()
+ */
+static struct iovec *get_iovec_hole_head(call_frame_t *frame,
+ uint32_t count)
+{
+ struct avec_config *conf = get_hole_conf(frame);
+
+ return conf->avec;
+}
+
+static struct iovec *get_iovec_hole_full(call_frame_t *frame,
+ uint32_t count)
+{
+ struct avec_config *conf = get_hole_conf(frame);
+
+ return conf->avec + (conf->off_in_head ? 1 : 0);
+}
+
+static inline struct iovec *get_iovec_hole_tail(call_frame_t *frame,
+ uint32_t count)
+{
+ struct avec_config *conf = get_hole_conf(frame);
+
+ return conf->avec + (conf->blocks_in_pool - 1);
+}
+
+static inline struct iovec *get_iovec_data_head(call_frame_t *frame,
+ uint32_t count)
+{
+ struct avec_config *conf = get_data_conf(frame);
+
+ return conf->avec;
+}
+
+static inline struct iovec *get_iovec_data_full(call_frame_t *frame,
+ uint32_t count)
+{
+ struct avec_config *conf = get_data_conf(frame);
+
+ return conf->avec + (conf->off_in_head ? 1 : 0) + count;
+}
+
+static inline struct iovec *get_iovec_data_tail(call_frame_t *frame,
+ uint32_t count)
+{
+ struct avec_config *conf = get_data_conf(frame);
+
+ return conf->avec +
+ (conf->off_in_head ? 1 : 0) +
+ conf->nr_full_blocks;
+}
+
+static struct rmw_atom atoms[LAST_DATA_TYPE][LAST_LOCALITY_TYPE] = {
+ [DATA_ATOM][HEAD_ATOM] =
+ { .locality = HEAD_ATOM,
+ .rmw = rmw_data_head,
+ .offset_at = offset_at_data_head,
+ .offset_in = offset_in_data_head,
+ .get_iovec = get_iovec_data_head,
+ .io_size_nopad = io_size_nopad_data_head,
+ .count_to_uptodate = count_to_uptodate_data_head,
+ .get_config = get_config_data
+ },
+ [DATA_ATOM][TAIL_ATOM] =
+ { .locality = TAIL_ATOM,
+ .rmw = rmw_data_tail,
+ .offset_at = offset_at_data_tail,
+ .offset_in = offset_in_tail,
+ .get_iovec = get_iovec_data_tail,
+ .io_size_nopad = io_size_nopad_data_tail,
+ .count_to_uptodate = count_to_uptodate_data_tail,
+ .get_config = get_config_data
+ },
+ [DATA_ATOM][FULL_ATOM] =
+ { .locality = FULL_ATOM,
+ .offset_at = offset_at_data_full,
+ .offset_in = offset_in_data_full,
+ .get_iovec = get_iovec_data_full,
+ .io_size_nopad = io_size_nopad_data_full,
+ .get_config = get_config_data
+ },
+ [HOLE_ATOM][HEAD_ATOM] =
+ { .locality = HEAD_ATOM,
+ .rmw = rmw_hole_head,
+ .offset_at = offset_at_hole_head,
+ .offset_in = offset_in_hole_head,
+ .get_iovec = get_iovec_hole_head,
+ .io_size_nopad = io_size_nopad_hole_head,
+ .count_to_uptodate = count_to_uptodate_hole_head,
+ .get_config = get_config_hole
+ },
+ [HOLE_ATOM][TAIL_ATOM] =
+ { .locality = TAIL_ATOM,
+ .rmw = rmw_hole_tail,
+ .offset_at = offset_at_hole_tail,
+ .offset_in = offset_in_tail,
+ .get_iovec = get_iovec_hole_tail,
+ .io_size_nopad = io_size_nopad_hole_tail,
+ .count_to_uptodate = count_to_uptodate_hole_tail,
+ .get_config = get_config_hole
+ },
+ [HOLE_ATOM][FULL_ATOM] =
+ { .locality = FULL_ATOM,
+ .offset_at = offset_at_hole_full,
+ .offset_in = offset_in_hole_full,
+ .get_iovec = get_iovec_hole_full,
+ .io_size_nopad = io_size_nopad_hole_full,
+ .get_config = get_config_hole
+ }
+};
+
+struct rmw_atom *atom_by_types(atom_data_type data,
+ atom_locality_type locality)
+{
+ return &atoms[data][locality];
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff --git a/xlators/encryption/crypt/src/crypt-common.h b/xlators/encryption/crypt/src/crypt-common.h
new file mode 100644
index 000000000..7c212ad5d
--- /dev/null
+++ b/xlators/encryption/crypt/src/crypt-common.h
@@ -0,0 +1,141 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CRYPT_COMMON_H__
+#define __CRYPT_COMMON_H__
+
+#define INVAL_SUBVERSION_NUMBER (0xff)
+#define CRYPT_INVAL_OP (GF_FOP_NULL)
+
+#define CRYPTO_FORMAT_PREFIX "trusted.glusterfs.crypt.att.cfmt"
+#define FSIZE_XATTR_PREFIX "trusted.glusterfs.crypt.att.size"
+#define SUBREQ_PREFIX "trusted.glusterfs.crypt.msg.sreq"
+#define FSIZE_MSG_PREFIX "trusted.glusterfs.crypt.msg.size"
+#define DE_MSG_PREFIX "trusted.glusterfs.crypt.msg.dent"
+#define REQUEST_ID_PREFIX "trusted.glusterfs.crypt.msg.rqid"
+#define MSGFLAGS_PREFIX "trusted.glusterfs.crypt.msg.xfgs"
+
+
+/* messages for crypt_open() */
+#define MSGFLAGS_REQUEST_MTD_RLOCK 1 /* take read lock and don't unlock */
+#define MSGFLAGS_REQUEST_MTD_WLOCK 2 /* take write lock and don't unlock */
+
+#define AES_BLOCK_BITS (4) /* AES_BLOCK_SIZE == 1 << AES_BLOCK_BITS */
+
+#define noop do {; } while (0)
+#define cassert(cond) ({ switch (-1) { case (cond): case 0: break; } })
+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
+
+/*
+ * Format of file's metadata
+ */
+struct crypt_format {
+ uint8_t loader_id; /* version of metadata loader */
+ uint8_t versioned[0]; /* file's metadata of specific version */
+} __attribute__((packed));
+
+typedef enum {
+ AES_CIPHER_ALG,
+ LAST_CIPHER_ALG
+} cipher_alg_t;
+
+typedef enum {
+ XTS_CIPHER_MODE,
+ LAST_CIPHER_MODE
+} cipher_mode_t;
+
+typedef enum {
+ MTD_LOADER_V1,
+ LAST_MTD_LOADER
+} mtd_loader_id;
+
+static inline void msgflags_set_mtd_rlock(uint32_t *flags)
+{
+ *flags |= MSGFLAGS_REQUEST_MTD_RLOCK;
+}
+
+static inline void msgflags_set_mtd_wlock(uint32_t *flags)
+{
+ *flags |= MSGFLAGS_REQUEST_MTD_WLOCK;
+}
+
+static inline gf_boolean_t msgflags_check_mtd_rlock(uint32_t *flags)
+{
+ return *flags & MSGFLAGS_REQUEST_MTD_RLOCK;
+}
+
+static inline gf_boolean_t msgflags_check_mtd_wlock(uint32_t *flags)
+{
+ return *flags & MSGFLAGS_REQUEST_MTD_WLOCK;
+}
+
+static inline gf_boolean_t msgflags_check_mtd_lock(uint32_t *flags)
+{
+ return msgflags_check_mtd_rlock(flags) ||
+ msgflags_check_mtd_wlock(flags);
+}
+
+/*
+ * returns number of logical blocks occupied
+ * (maybe partially) by @count bytes
+ * at offset @start.
+ */
+static inline off_t logical_blocks_occupied(uint64_t start, off_t count,
+ int blkbits)
+{
+ return ((start + count - 1) >> blkbits) - (start >> blkbits) + 1;
+}
+
+/*
+ * are two bytes (represented by offsets @off1
+ * and @off2 respectively) in the same logical
+ * block.
+ */
+static inline int in_same_lblock(uint64_t off1, uint64_t off2,
+ int blkbits)
+{
+ return off1 >> blkbits == off2 >> blkbits;
+}
+
+static inline void dump_cblock(xlator_t *this, unsigned char *buf)
+{
+ gf_log(this->name, GF_LOG_DEBUG,
+ "dump cblock: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x",
+ (buf)[0],
+ (buf)[1],
+ (buf)[2],
+ (buf)[3],
+ (buf)[4],
+ (buf)[5],
+ (buf)[6],
+ (buf)[7],
+ (buf)[8],
+ (buf)[9],
+ (buf)[10],
+ (buf)[11],
+ (buf)[12],
+ (buf)[13],
+ (buf)[14],
+ (buf)[15]);
+}
+
+#endif /* __CRYPT_COMMON_H__ */
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff --git a/xlators/encryption/crypt/src/crypt-mem-types.h b/xlators/encryption/crypt/src/crypt-mem-types.h
new file mode 100644
index 000000000..2eab921fc
--- /dev/null
+++ b/xlators/encryption/crypt/src/crypt-mem-types.h
@@ -0,0 +1,44 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef __CRYPT_MEM_TYPES_H__
+#define __CRYPT_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+enum gf_crypt_mem_types_ {
+ gf_crypt_mt_priv = gf_common_mt_end + 1,
+ gf_crypt_mt_inode,
+ gf_crypt_mt_data,
+ gf_crypt_mt_mtd,
+ gf_crypt_mt_loc,
+ gf_crypt_mt_iatt,
+ gf_crypt_mt_key,
+ gf_crypt_mt_iovec,
+ gf_crypt_mt_char,
+ gf_crypt_mt_end,
+};
+
+#endif /* __CRYPT_MEM_TYPES_H__ */
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
+
+
+
diff --git a/xlators/encryption/crypt/src/crypt.c b/xlators/encryption/crypt/src/crypt.c
new file mode 100644
index 000000000..13b1bd962
--- /dev/null
+++ b/xlators/encryption/crypt/src/crypt.c
@@ -0,0 +1,4522 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include <ctype.h>
+#include <sys/uio.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "logging.h"
+#include "defaults.h"
+
+#include "crypt-common.h"
+#include "crypt.h"
+
+static void init_inode_info_head(struct crypt_inode_info *info, fd_t *fd);
+static int32_t init_inode_info_tail(struct crypt_inode_info *info,
+ struct master_cipher_info *master);
+static int32_t prepare_for_submit_hole(call_frame_t *frame, xlator_t *this,
+ uint64_t from, off_t size);
+static int32_t load_file_size(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *dict, dict_t *xdata);
+static void do_ordered_submit(call_frame_t *frame, xlator_t *this,
+ atom_data_type dtype);
+static void do_parallel_submit(call_frame_t *frame, xlator_t *this,
+ atom_data_type dtype);
+static void put_one_call_open(call_frame_t *frame);
+static void put_one_call_readv(call_frame_t *frame, xlator_t *this);
+static void put_one_call_writev(call_frame_t *frame, xlator_t *this);
+static void put_one_call_ftruncate(call_frame_t *frame, xlator_t *this);
+static void free_avec(struct iovec *avec, char **pool, int blocks_in_pool);
+static void free_avec_data(crypt_local_t *local);
+static void free_avec_hole(crypt_local_t *local);
+
+static crypt_local_t *crypt_alloc_local(call_frame_t *frame, xlator_t *this,
+ glusterfs_fop_t fop)
+{
+ crypt_local_t *local = NULL;
+
+ local = mem_get0(this->local_pool);
+ if (!local) {
+ gf_log(this->name, GF_LOG_ERROR, "out of memory");
+ return NULL;
+ }
+ local->fop = fop;
+ LOCK_INIT(&local->hole_lock);
+ LOCK_INIT(&local->call_lock);
+ LOCK_INIT(&local->rw_count_lock);
+
+ frame->local = local;
+ return local;
+}
+
+struct crypt_inode_info *get_crypt_inode_info(inode_t *inode, xlator_t *this)
+{
+ int ret;
+ uint64_t value = 0;
+ struct crypt_inode_info *info;
+
+ ret = inode_ctx_get(inode, this, &value);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Can not get inode info");
+ return NULL;
+ }
+ info = (struct crypt_inode_info *)(long)value;
+ if (info == NULL) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Can not obtain inode info");
+ return NULL;
+ }
+ return info;
+}
+
+static struct crypt_inode_info *local_get_inode_info(crypt_local_t *local,
+ xlator_t *this)
+{
+ if (local->info)
+ return local->info;
+ local->info = get_crypt_inode_info(local->fd->inode, this);
+ return local->info;
+}
+
+static struct crypt_inode_info *alloc_inode_info(crypt_local_t *local,
+ loc_t *loc)
+{
+ struct crypt_inode_info *info;
+
+ info = GF_CALLOC(1, sizeof(*info), gf_crypt_mt_inode);
+ if (!info) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ gf_log ("crypt", GF_LOG_WARNING,
+ "Can not allocate inode info");
+ return NULL;
+ }
+ memset(info, 0, sizeof(*info));
+#if DEBUG_CRYPT
+ info->loc = GF_CALLOC(1, sizeof(*loc), gf_crypt_mt_loc);
+ if (!info->loc) {
+ gf_log("crypt", GF_LOG_WARNING, "Can not allocate loc");
+ GF_FREE(info);
+ return NULL;
+ }
+ if (loc_copy(info->loc, loc)){
+ GF_FREE(info->loc);
+ GF_FREE(info);
+ return NULL;
+ }
+#endif /* DEBUG_CRYPT */
+
+ local->info = info;
+ return info;
+}
+
+static void free_inode_info(struct crypt_inode_info *info)
+{
+#if DEBUG_CRYPT
+ loc_wipe(info->loc);
+ GF_FREE(info->loc);
+#endif
+ memset(info, 0, sizeof(*info));
+ GF_FREE(info);
+}
+
+int crypt_forget (xlator_t *this, inode_t *inode)
+{
+ uint64_t ctx_addr = 0;
+ if (!inode_ctx_del (inode, this, &ctx_addr))
+ free_inode_info((struct crypt_inode_info *)(long)ctx_addr);
+ return 0;
+}
+
+#if DEBUG_CRYPT
+static void check_read(call_frame_t *frame, xlator_t *this, int32_t read,
+ struct iovec *vec, int32_t count, struct iatt *stbuf)
+{
+ crypt_local_t *local = frame->local;
+ struct object_cipher_info *object = get_object_cinfo(local->info);
+ struct avec_config *conf = &local->data_conf;
+ uint32_t resid = stbuf->ia_size & (object_alg_blksize(object) - 1);
+
+ if (read <= 0)
+ return;
+ if (read != iovec_get_size(vec, count))
+ gf_log ("crypt", GF_LOG_DEBUG,
+ "op_ret differs from amount of read bytes");
+
+ if (object_alg_should_pad(object) && (read & (object_alg_blksize(object) - 1)))
+ gf_log ("crypt", GF_LOG_DEBUG,
+ "bad amount of read bytes (!= 0 mod(cblock size))");
+
+ if (conf->aligned_offset + read >
+ stbuf->ia_size + (resid ? object_alg_blksize(object) - resid : 0))
+ gf_log ("crypt", GF_LOG_DEBUG,
+ "bad amount of read bytes (too large))");
+
+}
+
+#define PT_BYTES_TO_DUMP (32)
+static void dump_plain_text(crypt_local_t *local, struct iovec *avec)
+{
+ int32_t to_dump;
+ char str[PT_BYTES_TO_DUMP + 1];
+
+ if (!avec)
+ return;
+ to_dump = avec->iov_len;
+ if (to_dump > PT_BYTES_TO_DUMP)
+ to_dump = PT_BYTES_TO_DUMP;
+ memcpy(str, avec->iov_base, to_dump);
+ memset(str + to_dump, '0', 1);
+ gf_log("crypt", GF_LOG_DEBUG, "Read file: %s", str);
+}
+
+static int32_t data_conf_invariant(struct avec_config *conf)
+{
+ return conf->acount ==
+ !!has_head_block(conf) +
+ !!has_tail_block(conf)+
+ conf->nr_full_blocks;
+}
+
+static int32_t hole_conf_invariant(struct avec_config *conf)
+{
+ return conf->blocks_in_pool ==
+ !!has_head_block(conf) +
+ !!has_tail_block(conf)+
+ !!has_full_blocks(conf);
+}
+
+static void crypt_check_conf(struct avec_config *conf)
+{
+ int32_t ret = 0;
+ const char *msg;
+
+ switch (conf->type) {
+ case DATA_ATOM:
+ msg = "data";
+ ret = data_conf_invariant(conf);
+ break;
+ case HOLE_ATOM:
+ msg = "hole";
+ ret = hole_conf_invariant(conf);
+ break;
+ default:
+ msg = "unknown";
+ }
+ if (!ret)
+ gf_log("crypt", GF_LOG_DEBUG, "bad %s conf", msg);
+}
+
+static void check_buf(call_frame_t *frame, xlator_t *this, struct iatt *buf)
+{
+ crypt_local_t *local = frame->local;
+ struct object_cipher_info *object = &local->info->cinfo;
+ uint64_t local_file_size;
+
+ switch(local->fop) {
+ case GF_FOP_FTRUNCATE:
+ return;
+ case GF_FOP_WRITE:
+ local_file_size = local->new_file_size;
+ break;
+ case GF_FOP_READ:
+ if (parent_is_crypt_xlator(frame, this))
+ return;
+ local_file_size = local->cur_file_size;
+ break;
+ default:
+ gf_log("crypt", GF_LOG_DEBUG, "bad file operation");
+ return;
+ }
+ if (buf->ia_size != round_up(local_file_size,
+ object_alg_blksize(object)))
+ gf_log("crypt", GF_LOG_DEBUG,
+ "bad ia_size in buf (%llu), should be %llu",
+ (unsigned long long)buf->ia_size,
+ (unsigned long long)round_up(local_file_size,
+ object_alg_blksize(object)));
+}
+
+#else
+#define check_read(frame, this, op_ret, vec, count, stbuf) noop
+#define dump_plain_text(local, avec) noop
+#define crypt_check_conf(conf) noop
+#define check_buf(frame, this, buf) noop
+#endif /* DEBUG_CRYPT */
+
+/*
+ * Pre-conditions:
+ * @vec represents a ciphertext of expanded size and
+ * aligned offset.
+ *
+ * Compound a temporal vector @avec with block-aligned
+ * components, decrypt and fix it up to represent a chunk
+ * of data corresponding to the original size and offset.
+ * Pass the result to the next translator.
+ */
+int32_t crypt_readv_cbk(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iovec *vec,
+ int32_t count,
+ struct iatt *stbuf,
+ struct iobref *iobref,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+ struct avec_config *conf = &local->data_conf;
+ struct object_cipher_info *object = &local->info->cinfo;
+
+ struct iovec *avec;
+ uint32_t i;
+ uint32_t to_vec;
+ uint32_t to_user;
+
+ check_buf(frame, this, stbuf);
+ check_read(frame, this, op_ret, vec, count, stbuf);
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ local->iobref = iobref_ref(iobref);
+
+ local->buf = *stbuf;
+ local->buf.ia_size = local->cur_file_size;
+
+ if (op_ret <= 0 || count == 0 || vec[0].iov_len == 0)
+ goto put_one_call;
+
+ if (conf->orig_offset >= local->cur_file_size) {
+ local->op_ret = 0;
+ goto put_one_call;
+ }
+ /*
+ * correct config params with real file size
+ * and actual amount of bytes read
+ */
+ set_config_offsets(frame, this,
+ conf->orig_offset, op_ret, DATA_ATOM, 0);
+
+ if (conf->orig_offset + conf->orig_size > local->cur_file_size)
+ conf->orig_size = local->cur_file_size - conf->orig_offset;
+ /*
+ * calculate amount of data to be returned
+ * to user.
+ */
+ to_user = op_ret;
+ if (conf->aligned_offset + to_user <= conf->orig_offset) {
+ gf_log(this->name, GF_LOG_WARNING, "Incomplete read");
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto put_one_call;
+ }
+ to_user -= (conf->aligned_offset - conf->orig_offset);
+
+ if (to_user > conf->orig_size)
+ to_user = conf->orig_size;
+ local->rw_count = to_user;
+
+ op_errno = set_config_avec_data(this, local,
+ conf, object, vec, count);
+ if (op_errno) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto put_one_call;
+ }
+ avec = conf->avec;
+#if DEBUG_CRYPT
+ if (conf->off_in_tail != 0 &&
+ conf->off_in_tail < object_alg_blksize(object) &&
+ object_alg_should_pad(object))
+ gf_log(this->name, GF_LOG_DEBUG, "Bad offset in tail %d",
+ conf->off_in_tail);
+ if (iovec_get_size(vec, count) != 0 &&
+ in_same_lblock(conf->orig_offset + iovec_get_size(vec, count) - 1,
+ local->cur_file_size - 1,
+ object_alg_blkbits(object))) {
+ gf_log(this->name, GF_LOG_DEBUG, "Compound last cblock");
+ dump_cblock(this,
+ (unsigned char *)(avec[conf->acount - 1].iov_base) +
+ avec[conf->acount - 1].iov_len - object_alg_blksize(object));
+ dump_cblock(this,
+ (unsigned char *)(vec[count - 1].iov_base) +
+ vec[count - 1].iov_len - object_alg_blksize(object));
+ }
+#endif
+ decrypt_aligned_iov(object, avec,
+ conf->acount, conf->aligned_offset);
+ /*
+ * pass proper plain data to user
+ */
+ avec[0].iov_base += (conf->aligned_offset - conf->orig_offset);
+ avec[0].iov_len -= (conf->aligned_offset - conf->orig_offset);
+
+ to_vec = to_user;
+ for (i = 0; i < conf->acount; i++) {
+ if (avec[i].iov_len > to_vec)
+ avec[i].iov_len = to_vec;
+ to_vec -= avec[i].iov_len;
+ }
+ put_one_call:
+ put_one_call_readv(frame, this);
+ return 0;
+}
+
+static int32_t do_readv(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *dict,
+ dict_t *xdata)
+{
+ data_t *data;
+ crypt_local_t *local = frame->local;
+
+ if (op_ret < 0)
+ goto error;
+ /*
+ * extract regular file size
+ */
+ data = dict_get(dict, FSIZE_XATTR_PREFIX);
+ if (!data) {
+ gf_log("crypt", GF_LOG_WARNING, "Regular file size not found");
+ op_errno = EIO;
+ goto error;
+ }
+ local->cur_file_size = data_to_uint64(data);
+
+ get_one_call(frame);
+ STACK_WIND(frame,
+ crypt_readv_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->readv,
+ local->fd,
+ /*
+ * FIXME: read amount can be reduced
+ */
+ local->data_conf.expanded_size,
+ local->data_conf.aligned_offset,
+ local->flags,
+ local->xdata);
+ return 0;
+ error:
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+
+ get_one_call(frame);
+ put_one_call_readv(frame, this);
+ return 0;
+}
+
+static int32_t crypt_readv_finodelk_cbk(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ if (op_ret < 0)
+ goto error;
+ /*
+ * An access has been granted,
+ * retrieve file size
+ */
+ STACK_WIND(frame,
+ do_readv,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr,
+ local->fd,
+ FSIZE_XATTR_PREFIX,
+ NULL);
+ return 0;
+ error:
+ fd_unref(local->fd);
+ if (local->xdata)
+ dict_unref(local->xdata);
+ STACK_UNWIND_STRICT(readv,
+ frame,
+ -1,
+ op_errno,
+ NULL,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+ return 0;
+}
+
+static int32_t readv_trivial_completion(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *buf,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (op_ret < 0) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "stat failed (%d)", op_errno);
+ goto error;
+ }
+ local->buf = *buf;
+ STACK_WIND(frame,
+ load_file_size,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr,
+ local->loc,
+ FSIZE_XATTR_PREFIX,
+ NULL);
+ return 0;
+ error:
+ STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno,
+ NULL, 0, NULL, NULL, NULL);
+ return 0;
+}
+
+int32_t crypt_readv(call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ size_t size,
+ off_t offset,
+ uint32_t flags, dict_t *xdata)
+{
+ int32_t ret;
+ crypt_local_t *local;
+ struct crypt_inode_info *info;
+ struct gf_flock lock = {0, };
+
+#if DEBUG_CRYPT
+ gf_log("crypt", GF_LOG_DEBUG, "reading %d bytes from offset %llu",
+ (int)size, (long long)offset);
+ if (parent_is_crypt_xlator(frame, this))
+ gf_log("crypt", GF_LOG_DEBUG, "parent is crypt");
+#endif
+ local = crypt_alloc_local(frame, this, GF_FOP_READ);
+ if (!local) {
+ ret = ENOMEM;
+ goto error;
+ }
+ if (size == 0)
+ goto trivial;
+
+ local->fd = fd_ref(fd);
+ local->flags = flags;
+
+ info = local_get_inode_info(local, this);
+ if (info == NULL) {
+ ret = EINVAL;
+ fd_unref(fd);
+ goto error;
+ }
+ if (!object_alg_atomic(&info->cinfo)) {
+ ret = EINVAL;
+ fd_unref(fd);
+ goto error;
+ }
+ set_config_offsets(frame, this, offset, size,
+ DATA_ATOM, 0);
+ if (parent_is_crypt_xlator(frame, this)) {
+ data_t *data;
+ /*
+ * We are called by crypt_writev (or cypt_ftruncate)
+ * to perform the "read" component of the read-modify-write
+ * (or read-prune-write) sequence for some atom;
+ *
+ * don't ask for access:
+ * it has already been acquired
+ *
+ * Retrieve current file size
+ */
+ if (!xdata) {
+ gf_log("crypt", GF_LOG_WARNING,
+ "Regular file size hasn't been passed");
+ ret = EIO;
+ goto error;
+ }
+ data = dict_get(xdata, FSIZE_XATTR_PREFIX);
+ if (!data) {
+ gf_log("crypt", GF_LOG_WARNING,
+ "Regular file size not found");
+ ret = EIO;
+ goto error;
+ }
+ local->old_file_size =
+ local->cur_file_size = data_to_uint64(data);
+
+ get_one_call(frame);
+ STACK_WIND(frame,
+ crypt_readv_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readv,
+ local->fd,
+ /*
+ * FIXME: read amount can be reduced
+ */
+ local->data_conf.expanded_size,
+ local->data_conf.aligned_offset,
+ flags,
+ NULL);
+ return 0;
+ }
+ if (xdata)
+ local->xdata = dict_ref(xdata);
+
+ lock.l_len = 0;
+ lock.l_start = 0;
+ lock.l_type = F_RDLCK;
+ lock.l_whence = SEEK_SET;
+
+ STACK_WIND(frame,
+ crypt_readv_finodelk_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->finodelk,
+ this->name,
+ fd,
+ F_SETLKW,
+ &lock,
+ NULL);
+ return 0;
+ trivial:
+ STACK_WIND(frame,
+ readv_trivial_completion,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat,
+ fd,
+ NULL);
+ return 0;
+ error:
+ STACK_UNWIND_STRICT(readv,
+ frame,
+ -1,
+ ret,
+ NULL,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+ return 0;
+}
+
+void set_local_io_params_writev(call_frame_t *frame,
+ struct object_cipher_info *object,
+ struct rmw_atom *atom,
+ off_t io_offset,
+ uint32_t io_size)
+{
+ crypt_local_t *local = frame->local;
+
+ local->io_offset = io_offset;
+ local->io_size = io_size;
+
+ local->io_offset_nopad =
+ atom->offset_at(frame, object) + atom->offset_in(frame, object);
+
+ gf_log("crypt", GF_LOG_DEBUG,
+ "set nopad offset to %llu",
+ (unsigned long long)local->io_offset_nopad);
+
+ local->io_size_nopad = atom->io_size_nopad(frame, object);
+
+ gf_log("crypt", GF_LOG_DEBUG,
+ "set nopad size to %llu",
+ (unsigned long long)local->io_size_nopad);
+
+ local->update_disk_file_size = 0;
+ /*
+ * NOTE: eof_padding_size is 0 for all full atoms;
+ * For head and tail atoms it will be set up at rmw_partial block()
+ */
+ local->new_file_size = local->cur_file_size;
+
+ if (local->io_offset_nopad + local->io_size_nopad > local->cur_file_size) {
+
+ local->new_file_size = local->io_offset_nopad + local->io_size_nopad;
+
+ gf_log("crypt", GF_LOG_DEBUG,
+ "set new file size to %llu",
+ (unsigned long long)local->new_file_size);
+
+ local->update_disk_file_size = 1;
+ }
+}
+
+void set_local_io_params_ftruncate(call_frame_t *frame,
+ struct object_cipher_info *object)
+{
+ uint32_t resid;
+ crypt_local_t *local = frame->local;
+ struct avec_config *conf = &local->data_conf;
+
+ resid = conf->orig_offset & (object_alg_blksize(object) - 1);
+ if (resid) {
+ local->eof_padding_size =
+ object_alg_blksize(object) - resid;
+ local->new_file_size = conf->aligned_offset;
+ local->update_disk_file_size = 0;
+ /*
+ * file size will be updated
+ * in the ->writev() stack,
+ * when submitting file tail
+ */
+ }
+ else {
+ local->eof_padding_size = 0;
+ local->new_file_size = conf->orig_offset;
+ local->update_disk_file_size = 1;
+ /*
+ * file size will be updated
+ * in this ->ftruncate stack
+ */
+ }
+}
+
+static inline void submit_head(call_frame_t *frame, xlator_t *this)
+{
+ crypt_local_t *local = frame->local;
+ submit_partial(frame, this, local->fd, HEAD_ATOM);
+}
+
+static inline void submit_tail(call_frame_t *frame, xlator_t *this)
+{
+ crypt_local_t *local = frame->local;
+ submit_partial(frame, this, local->fd, TAIL_ATOM);
+}
+
+static void submit_hole(call_frame_t *frame, xlator_t *this)
+{
+ /*
+ * hole conversion always means
+ * appended write and goes in ordered fashion
+ */
+ do_ordered_submit(frame, this, HOLE_ATOM);
+}
+
+static void submit_data(call_frame_t *frame, xlator_t *this)
+{
+ if (is_ordered_mode(frame)) {
+ do_ordered_submit(frame, this, DATA_ATOM);
+ return;
+ }
+ gf_log("crypt", GF_LOG_WARNING, "Bad submit mode");
+ get_nr_calls(frame, nr_calls_data(frame));
+ do_parallel_submit(frame, this, DATA_ATOM);
+ return;
+}
+
+/*
+ * heplers called by writev_cbk, fruncate_cbk in ordered mode
+ */
+
+static inline int32_t should_submit_hole(crypt_local_t *local)
+{
+ struct avec_config *conf = &local->hole_conf;
+
+ return conf->avec != NULL;
+}
+
+static inline int32_t should_resume_submit_hole(crypt_local_t *local)
+{
+ struct avec_config *conf = &local->hole_conf;
+
+ if (local->fop == GF_FOP_WRITE && has_tail_block(conf))
+ /*
+ * Don't submit a part of hole, which
+ * fits into a data block:
+ * this part of hole will be converted
+ * as a gap filled by zeros in data head
+ * block.
+ */
+ return conf->cursor < conf->acount - 1;
+ else
+ return conf->cursor < conf->acount;
+}
+
+static inline int32_t should_resume_submit_data(call_frame_t *frame)
+{
+ crypt_local_t *local = frame->local;
+ struct avec_config *conf = &local->data_conf;
+
+ if (is_ordered_mode(frame))
+ return conf->cursor < conf->acount;
+ /*
+ * parallel writes
+ */
+ return 0;
+}
+
+static inline int32_t should_submit_data_after_hole(crypt_local_t *local)
+{
+ return local->data_conf.avec != NULL;
+}
+
+static void update_local_file_params(call_frame_t *frame,
+ xlator_t *this,
+ struct iatt *prebuf,
+ struct iatt *postbuf)
+{
+ crypt_local_t *local = frame->local;
+
+ check_buf(frame, this, postbuf);
+
+ local->prebuf = *prebuf;
+ local->postbuf = *postbuf;
+
+ local->prebuf.ia_size = local->cur_file_size;
+ local->postbuf.ia_size = local->new_file_size;
+
+ local->cur_file_size = local->new_file_size;
+}
+
+static int32_t end_writeback_writev(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *prebuf,
+ struct iatt *postbuf,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (op_ret <= 0) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "writev iteration failed");
+ goto put_one_call;
+ }
+ /*
+ * op_ret includes paddings (atom's head, atom's tail and EOF)
+ */
+ if (op_ret < local->io_size) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "Incomplete writev iteration");
+ goto put_one_call;
+ }
+ op_ret -= local->eof_padding_size;
+ local->op_ret = op_ret;
+
+ update_local_file_params(frame, this, prebuf, postbuf);
+
+ if (data_write_in_progress(local)) {
+
+ LOCK(&local->rw_count_lock);
+ local->rw_count += op_ret;
+ UNLOCK(&local->rw_count_lock);
+
+ if (should_resume_submit_data(frame))
+ submit_data(frame, this);
+ }
+ else {
+ /*
+ * hole conversion is going on;
+ * don't take into account written zeros
+ */
+ if (should_resume_submit_hole(local))
+ submit_hole(frame, this);
+
+ else if (should_submit_data_after_hole(local))
+ submit_data(frame, this);
+ }
+ put_one_call:
+ put_one_call_writev(frame, this);
+ return 0;
+}
+
+#define crypt_writev_cbk end_writeback_writev
+
+#define HOLE_WRITE_CHUNK_BITS 12
+#define HOLE_WRITE_CHUNK_SIZE (1 << HOLE_WRITE_CHUNK_BITS)
+
+/*
+ * Convert hole of size @size at offset @off to
+ * zeros and prepare respective iovecs for submit.
+ * The hole lock should be held.
+ *
+ * Pre-conditions:
+ * @local->file_size is set and valid.
+ */
+int32_t prepare_for_submit_hole(call_frame_t *frame, xlator_t *this,
+ uint64_t off, off_t size)
+{
+ int32_t ret;
+ crypt_local_t *local = frame->local;
+ struct object_cipher_info *object = &local->info->cinfo;
+
+ set_config_offsets(frame, this, off, size, HOLE_ATOM, 1);
+
+ ret = set_config_avec_hole(this, local,
+ &local->hole_conf, object, local->fop);
+ crypt_check_conf(&local->hole_conf);
+
+ return ret;
+}
+
+/*
+ * prepare for submit @count bytes at offset @from
+ */
+int32_t prepare_for_submit_data(call_frame_t *frame, xlator_t *this,
+ off_t from, int32_t size, struct iovec *vec,
+ int32_t vec_count, int32_t setup_gap)
+{
+ uint32_t ret;
+ crypt_local_t *local = frame->local;
+ struct object_cipher_info *object = &local->info->cinfo;
+
+ set_config_offsets(frame, this, from, size,
+ DATA_ATOM, setup_gap);
+
+ ret = set_config_avec_data(this, local,
+ &local->data_conf, object, vec, vec_count);
+ crypt_check_conf(&local->data_conf);
+
+ return ret;
+}
+
+static void free_avec(struct iovec *avec,
+ char **pool, int blocks_in_pool)
+{
+ if (!avec)
+ return;
+ GF_FREE(pool);
+ GF_FREE(avec);
+}
+
+static void free_avec_data(crypt_local_t *local)
+{
+ return free_avec(local->data_conf.avec,
+ local->data_conf.pool,
+ local->data_conf.blocks_in_pool);
+}
+
+static void free_avec_hole(crypt_local_t *local)
+{
+ return free_avec(local->hole_conf.avec,
+ local->hole_conf.pool,
+ local->hole_conf.blocks_in_pool);
+}
+
+
+static void do_parallel_submit(call_frame_t *frame, xlator_t *this,
+ atom_data_type dtype)
+{
+ crypt_local_t *local = frame->local;
+ struct avec_config *conf;
+
+ local->active_setup = dtype;
+ conf = conf_by_type(frame, dtype);
+
+ if (has_head_block(conf))
+ submit_head(frame, this);
+
+ if (has_full_blocks(conf))
+ submit_full(frame, this);
+
+ if (has_tail_block(conf))
+ submit_tail(frame, this);
+ return;
+}
+
+static void do_ordered_submit(call_frame_t *frame, xlator_t *this,
+ atom_data_type dtype)
+{
+ crypt_local_t *local = frame->local;
+ struct avec_config *conf;
+
+ local->active_setup = dtype;
+ conf = conf_by_type(frame, dtype);
+
+ if (should_submit_head_block(conf)) {
+ get_one_call_nolock(frame);
+ submit_head(frame, this);
+ }
+ else if (should_submit_full_block(conf)) {
+ get_one_call_nolock(frame);
+ submit_full(frame, this);
+ }
+ else if (should_submit_tail_block(conf)) {
+ get_one_call_nolock(frame);
+ submit_tail(frame, this);
+ }
+ else
+ gf_log("crypt", GF_LOG_DEBUG,
+ "nothing has been submitted in ordered mode");
+ return;
+}
+
+static int32_t do_writev(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *dict,
+ dict_t *xdata)
+{
+ data_t *data;
+ crypt_local_t *local = frame->local;
+ struct object_cipher_info *object = &local->info->cinfo;
+ /*
+ * extract regular file size
+ */
+ data = dict_get(dict, FSIZE_XATTR_PREFIX);
+ if (!data) {
+ gf_log("crypt", GF_LOG_WARNING, "Regular file size not found");
+ op_ret = -1;
+ op_errno = EIO;
+ goto error;
+ }
+ local->old_file_size = local->cur_file_size = data_to_uint64(data);
+
+ set_gap_at_end(frame, object, &local->data_conf, DATA_ATOM);
+
+ if (local->cur_file_size < local->data_conf.orig_offset) {
+ /*
+ * Set up hole config
+ */
+ op_errno = prepare_for_submit_hole(frame,
+ this,
+ local->cur_file_size,
+ local->data_conf.orig_offset - local->cur_file_size);
+ if (op_errno) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto error;
+ }
+ }
+ if (should_submit_hole(local))
+ submit_hole(frame, this);
+ else
+ submit_data(frame, this);
+ return 0;
+ error:
+ get_one_call_nolock(frame);
+ put_one_call_writev(frame, this);
+ return 0;
+}
+
+static int32_t crypt_writev_finodelk_cbk(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (op_ret < 0)
+ goto error;
+ /*
+ * An access has been granted,
+ * retrieve file size first
+ */
+ STACK_WIND(frame,
+ do_writev,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr,
+ local->fd,
+ FSIZE_XATTR_PREFIX,
+ NULL);
+ return 0;
+ error:
+ get_one_call_nolock(frame);
+ put_one_call_writev(frame, this);
+ return 0;
+}
+
+static int32_t writev_trivial_completion(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *buf,
+ dict_t *dict)
+{
+ crypt_local_t *local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ local->prebuf = *buf;
+ local->postbuf = *buf;
+
+ local->prebuf.ia_size = local->cur_file_size;
+ local->postbuf.ia_size = local->cur_file_size;
+
+ get_one_call(frame);
+ put_one_call_writev(frame, this);
+ return 0;
+}
+
+int crypt_writev(call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ struct iovec *vec,
+ int32_t count,
+ off_t offset,
+ uint32_t flags,
+ struct iobref *iobref,
+ dict_t *xdata)
+{
+ int32_t ret;
+ crypt_local_t *local;
+ struct crypt_inode_info *info;
+ struct gf_flock lock = {0, };
+#if DEBUG_CRYPT
+ gf_log ("crypt", GF_LOG_DEBUG, "writing %d bytes from offset %llu",
+ (int)iovec_get_size(vec, count), (long long)offset);
+#endif
+ local = crypt_alloc_local(frame, this, GF_FOP_WRITE);
+ if (!local) {
+ ret = ENOMEM;
+ goto error;
+ }
+ local->fd = fd_ref(fd);
+
+ if (iobref)
+ local->iobref = iobref_ref(iobref);
+ /*
+ * to update real file size on the server
+ */
+ local->xattr = dict_new();
+ if (!local->xattr) {
+ ret = ENOMEM;
+ goto error;
+ }
+ local->flags = flags;
+
+ info = local_get_inode_info(local, this);
+ if (info == NULL) {
+ ret = EINVAL;
+ goto error;
+ }
+ if (!object_alg_atomic(&info->cinfo)) {
+ ret = EINVAL;
+ goto error;
+ }
+ if (iovec_get_size(vec, count) == 0)
+ goto trivial;
+
+ ret = prepare_for_submit_data(frame, this, offset,
+ iovec_get_size(vec, count),
+ vec, count, 0 /* don't setup gup
+ in tail: we don't
+ know file size yet */);
+ if (ret)
+ goto error;
+
+ if (parent_is_crypt_xlator(frame, this)) {
+ data_t *data;
+ /*
+ * we are called by shinking crypt_ftruncate(),
+ * which doesn't perform hole conversion;
+ *
+ * don't ask for access:
+ * it has already been acquired
+ */
+
+ /*
+ * extract file size
+ */
+ if (!xdata) {
+ gf_log("crypt", GF_LOG_WARNING,
+ "Regular file size hasn't been passed");
+ ret = EIO;
+ goto error;
+ }
+ data = dict_get(xdata, FSIZE_XATTR_PREFIX);
+ if (!data) {
+ gf_log("crypt", GF_LOG_WARNING,
+ "Regular file size not found");
+ ret = EIO;
+ goto error;
+ }
+ local->old_file_size =
+ local->cur_file_size = data_to_uint64(data);
+
+ submit_data(frame, this);
+ return 0;
+ }
+ if (xdata)
+ local->xdata = dict_ref(xdata);
+ /*
+ * lock the file and retrieve its size
+ */
+ lock.l_len = 0;
+ lock.l_start = 0;
+ lock.l_type = F_WRLCK;
+ lock.l_whence = SEEK_SET;
+
+ STACK_WIND(frame,
+ crypt_writev_finodelk_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->finodelk,
+ this->name,
+ fd,
+ F_SETLKW,
+ &lock,
+ NULL);
+ return 0;
+ trivial:
+ STACK_WIND(frame,
+ writev_trivial_completion,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat,
+ fd,
+ NULL);
+ return 0;
+ error:
+ if (local && local->fd)
+ fd_unref(fd);
+ if (local && local->iobref)
+ iobref_unref(iobref);
+ if (local && local->xdata)
+ dict_unref(xdata);
+ if (local && local->xattr)
+ dict_unref(local->xattr);
+ if (local && local->info)
+ free_inode_info(local->info);
+
+ STACK_UNWIND_STRICT(writev, frame, -1, ret, NULL, NULL, NULL);
+ return 0;
+}
+
+int32_t prepare_for_prune(call_frame_t *frame, xlator_t *this, uint64_t offset)
+{
+ set_config_offsets(frame, this,
+ offset,
+ 0, /* count */
+ DATA_ATOM,
+ 0 /* since we prune, there is no
+ gap in tail to uptodate */);
+ return 0;
+}
+
+/*
+ * Finish the read-prune-modify sequence
+ *
+ * Can be invoked as
+ * 1) ->ftruncate_cbk() for cblock-aligned, or trivial prune
+ * 2) ->writev_cbk() for non-cblock-aligned prune
+ */
+
+static int32_t prune_complete(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *prebuf,
+ struct iatt *postbuf,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ update_local_file_params(frame, this, prebuf, postbuf);
+
+ put_one_call_ftruncate(frame, this);
+ return 0;
+}
+
+/*
+ * This is called as ->ftruncate_cbk()
+ *
+ * Perform the "write" component of the
+ * read-prune-write sequence.
+ *
+ * submuit the rest of the file
+ */
+static int32_t prune_submit_file_tail(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *prebuf,
+ struct iatt *postbuf,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+ struct avec_config *conf = &local->data_conf;
+ dict_t *dict;
+
+ if (op_ret < 0)
+ goto put_one_call;
+
+ if (local->xdata) {
+ dict_unref(local->xdata);
+ local->xdata = NULL;
+ }
+ if (xdata)
+ local->xdata = dict_ref(xdata);
+
+ dict = dict_new();
+ if (!dict) {
+ op_errno = ENOMEM;
+ goto error;
+ }
+
+ update_local_file_params(frame, this, prebuf, postbuf);
+ local->new_file_size = conf->orig_offset;
+
+ /*
+ * The rest of the file is a partial block and, hence,
+ * should be written via RMW sequence, so the crypt xlator
+ * does STACK_WIND to itself.
+ *
+ * Pass current file size to crypt_writev()
+ */
+ op_errno = dict_set(dict,
+ FSIZE_XATTR_PREFIX,
+ data_from_uint64(local->cur_file_size));
+ if (op_errno) {
+ gf_log("crypt", GF_LOG_WARNING,
+ "can not set key to update file size");
+ dict_unref(dict);
+ goto error;
+ }
+ gf_log("crypt", GF_LOG_DEBUG,
+ "passing current file size (%llu) to crypt_writev",
+ (unsigned long long)local->cur_file_size);
+ /*
+ * Padding will be filled with
+ * zeros by rmw_partial_block()
+ */
+ STACK_WIND(frame,
+ prune_complete,
+ this,
+ this->fops->writev, /* crypt_writev */
+ local->fd,
+ &local->vec,
+ 1,
+ conf->aligned_offset, /* offset to write from */
+ 0,
+ local->iobref,
+ dict);
+
+ dict_unref(dict);
+ return 0;
+ error:
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ put_one_call:
+ put_one_call_ftruncate(frame, this);
+ return 0;
+}
+
+/*
+ * This is called as a callback of ->writev() invoked in behalf
+ * of ftruncate(): it can be
+ * 1) ordered writes issued by hole conversion in the case of
+ * expanded truncate, or
+ * 2) an rmw partial data block issued by non-cblock-aligned
+ * prune.
+ */
+int32_t end_writeback_ftruncate(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *prebuf,
+ struct iatt *postbuf,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+ /*
+ * if nothing has been written,
+ * then it must be an error
+ */
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (op_ret < 0)
+ goto put_one_call;
+
+ update_local_file_params(frame, this, prebuf, postbuf);
+
+ if (data_write_in_progress(local))
+ /* case (2) */
+ goto put_one_call;
+ /* case (1) */
+ if (should_resume_submit_hole(local))
+ submit_hole(frame, this);
+ /*
+ * case of hole, when we should't resume
+ */
+ put_one_call:
+ put_one_call_ftruncate(frame, this);
+ return 0;
+}
+
+/*
+ * Perform prune and write components of the
+ * read-prune-write sequence.
+ *
+ * Called as ->readv_cbk()
+ *
+ * Pre-conditions:
+ * @vec contains the latest atom of the file
+ * (plain text)
+ */
+static int32_t prune_write(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iovec *vec,
+ int32_t count,
+ struct iatt *stbuf,
+ struct iobref *iobref,
+ dict_t *xdata)
+{
+ int32_t i;
+ size_t to_copy;
+ size_t copied = 0;
+ crypt_local_t *local = frame->local;
+ struct avec_config *conf = &local->data_conf;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ if (op_ret == -1)
+ goto put_one_call;
+
+ /*
+ * At first, uptodate head block
+ */
+ if (iovec_get_size(vec, count) < conf->off_in_head) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "Failed to uptodate head block for prune");
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto put_one_call;
+ }
+ local->vec.iov_len = conf->off_in_head;
+ local->vec.iov_base = GF_CALLOC(1, local->vec.iov_len,
+ gf_crypt_mt_data);
+
+ if (local->vec.iov_base == NULL) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "Failed to calloc head block for prune");
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto put_one_call;
+ }
+ for (i = 0; i < count; i++) {
+ to_copy = vec[i].iov_len;
+ if (to_copy > local->vec.iov_len - copied)
+ to_copy = local->vec.iov_len - copied;
+
+ memcpy((char *)local->vec.iov_base + copied,
+ vec[i].iov_base,
+ to_copy);
+ copied += to_copy;
+ if (copied == local->vec.iov_len)
+ break;
+ }
+ /*
+ * perform prune with aligned offset
+ * (i.e. at this step we prune a bit
+ * more then it is needed
+ */
+ STACK_WIND(frame,
+ prune_submit_file_tail,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate,
+ local->fd,
+ conf->aligned_offset,
+ local->xdata);
+ return 0;
+ put_one_call:
+ put_one_call_ftruncate(frame, this);
+ return 0;
+}
+
+/*
+ * Perform a read-prune-write sequence
+ */
+int32_t read_prune_write(call_frame_t *frame, xlator_t *this)
+{
+ int32_t ret = 0;
+ dict_t *dict = NULL;
+ crypt_local_t *local = frame->local;
+ struct avec_config *conf = &local->data_conf;
+ struct object_cipher_info *object = &local->info->cinfo;
+
+ set_local_io_params_ftruncate(frame, object);
+ get_one_call_nolock(frame);
+
+ if ((conf->orig_offset & (object_alg_blksize(object) - 1)) == 0) {
+ /*
+ * cblock-aligned prune:
+ * we don't need read and write components,
+ * just cut file body
+ */
+ gf_log("crypt", GF_LOG_DEBUG,
+ "prune without RMW (at offset %llu",
+ (unsigned long long)conf->orig_offset);
+
+ STACK_WIND(frame,
+ prune_complete,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate,
+ local->fd,
+ conf->orig_offset,
+ local->xdata);
+ return 0;
+ }
+ gf_log("crypt", GF_LOG_DEBUG,
+ "prune with RMW (at offset %llu",
+ (unsigned long long)conf->orig_offset);
+ /*
+ * We are about to perform the "read" component of the
+ * read-prune-write sequence. It means that we need to
+ * read encrypted data from disk and decrypt it.
+ * So, the crypt translator does STACK_WIND to itself.
+ *
+ * Pass current file size to crypt_readv()
+
+ */
+ dict = dict_new();
+ if (!dict) {
+ gf_log("crypt", GF_LOG_WARNING, "Can not alloc dict");
+ ret = ENOMEM;
+ goto exit;
+ }
+ ret = dict_set(dict,
+ FSIZE_XATTR_PREFIX,
+ data_from_uint64(local->cur_file_size));
+ if (ret) {
+ gf_log("crypt", GF_LOG_WARNING, "Can not set dict");
+ goto exit;
+ }
+ STACK_WIND(frame,
+ prune_write,
+ this,
+ this->fops->readv, /* crypt_readv */
+ local->fd,
+ get_atom_size(object), /* bytes to read */
+ conf->aligned_offset, /* offset to read from */
+ 0,
+ dict);
+ exit:
+ if (dict)
+ dict_unref(dict);
+ return ret;
+}
+
+/*
+ * File prune is more complicated than expand.
+ * First we need to read the latest atom to not lose info
+ * needed for proper update. Also we need to make sure that
+ * every component of read-prune-write sequence leaves data
+ * consistent
+ *
+ * Non-cblock aligned prune is performed as read-prune-write
+ * sequence:
+ *
+ * 1) read the latest atom;
+ * 2) perform cblock-aligned prune
+ * 3) issue a write request for the end-of-file
+ */
+int32_t prune_file(call_frame_t *frame, xlator_t *this, uint64_t offset)
+{
+ int32_t ret;
+
+ ret = prepare_for_prune(frame, this, offset);
+ if (ret)
+ return ret;
+ return read_prune_write(frame, this);
+}
+
+int32_t expand_file(call_frame_t *frame, xlator_t *this,
+ uint64_t offset)
+{
+ int32_t ret;
+ crypt_local_t *local = frame->local;
+
+ ret = prepare_for_submit_hole(frame, this,
+ local->old_file_size,
+ offset - local->old_file_size);
+ if (ret)
+ return ret;
+ submit_hole(frame, this);
+ return 0;
+}
+
+static int32_t ftruncate_trivial_completion(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *buf,
+ dict_t *dict)
+{
+ crypt_local_t *local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ local->prebuf = *buf;
+ local->postbuf = *buf;
+
+ local->prebuf.ia_size = local->cur_file_size;
+ local->postbuf.ia_size = local->cur_file_size;
+
+ get_one_call(frame);
+ put_one_call_ftruncate(frame, this);
+ return 0;
+}
+
+static int32_t do_ftruncate(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *dict,
+ dict_t *xdata)
+{
+ data_t *data;
+ crypt_local_t *local = frame->local;
+
+ if (op_ret)
+ goto error;
+ /*
+ * extract regular file size
+ */
+ data = dict_get(dict, FSIZE_XATTR_PREFIX);
+ if (!data) {
+ gf_log("crypt", GF_LOG_WARNING, "Regular file size not found");
+ op_errno = EIO;
+ goto error;
+ }
+ local->old_file_size = local->cur_file_size = data_to_uint64(data);
+
+ if (local->data_conf.orig_offset == local->cur_file_size) {
+#if DEBUG_CRYPT
+ gf_log("crypt", GF_LOG_DEBUG,
+ "trivial ftruncate (current file size %llu)",
+ (unsigned long long)local->cur_file_size);
+#endif
+ goto trivial;
+ }
+ else if (local->data_conf.orig_offset < local->cur_file_size) {
+#if DEBUG_CRYPT
+ gf_log("crypt", GF_LOG_DEBUG, "prune from %llu to %llu",
+ (unsigned long long)local->cur_file_size,
+ (unsigned long long)local->data_conf.orig_offset);
+#endif
+ op_errno = prune_file(frame,
+ this,
+ local->data_conf.orig_offset);
+ }
+ else {
+#if DEBUG_CRYPT
+ gf_log("crypt", GF_LOG_DEBUG, "expand from %llu to %llu",
+ (unsigned long long)local->cur_file_size,
+ (unsigned long long)local->data_conf.orig_offset);
+#endif
+ op_errno = expand_file(frame,
+ this,
+ local->data_conf.orig_offset);
+ }
+ if (op_errno)
+ goto error;
+ return 0;
+ trivial:
+ STACK_WIND(frame,
+ ftruncate_trivial_completion,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat,
+ local->fd,
+ NULL);
+ return 0;
+ error:
+ /*
+ * finish with ftruncate
+ */
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+
+ get_one_call_nolock(frame);
+ put_one_call_ftruncate(frame, this);
+ return 0;
+}
+
+static int32_t crypt_ftruncate_finodelk_cbk(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (op_ret < 0)
+ goto error;
+ /*
+ * An access has been granted,
+ * retrieve file size first
+ */
+ STACK_WIND(frame,
+ do_ftruncate,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr,
+ local->fd,
+ FSIZE_XATTR_PREFIX,
+ NULL);
+ return 0;
+ error:
+ get_one_call_nolock(frame);
+ put_one_call_ftruncate(frame, this);
+ return 0;
+}
+
+/*
+ * ftruncate is performed in 2 steps:
+ * . recieve file size;
+ * . expand or prune file.
+ */
+static int32_t crypt_ftruncate(call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ off_t offset,
+ dict_t *xdata)
+{
+ int32_t ret;
+ crypt_local_t *local;
+ struct crypt_inode_info *info;
+ struct gf_flock lock = {0, };
+
+ local = crypt_alloc_local(frame, this, GF_FOP_FTRUNCATE);
+ if (!local) {
+ ret = ENOMEM;
+ goto error;
+ }
+ local->xattr = dict_new();
+ if (!local->xattr) {
+ ret = ENOMEM;
+ goto error;
+ }
+ local->fd = fd_ref(fd);
+ info = local_get_inode_info(local, this);
+ if (info == NULL) {
+ ret = EINVAL;
+ goto error;
+ }
+ if (!object_alg_atomic(&info->cinfo)) {
+ ret = EINVAL;
+ goto error;
+ }
+ local->data_conf.orig_offset = offset;
+ if (xdata)
+ local->xdata = dict_ref(xdata);
+
+ lock.l_len = 0;
+ lock.l_start = 0;
+ lock.l_type = F_WRLCK;
+ lock.l_whence = SEEK_SET;
+
+ STACK_WIND(frame,
+ crypt_ftruncate_finodelk_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->finodelk,
+ this->name,
+ fd,
+ F_SETLKW,
+ &lock,
+ NULL);
+ return 0;
+ error:
+ if (local && local->fd)
+ fd_unref(fd);
+ if (local && local->xdata)
+ dict_unref(xdata);
+ if (local && local->xattr)
+ dict_unref(local->xattr);
+ if (local && local->info)
+ free_inode_info(local->info);
+
+ STACK_UNWIND_STRICT(ftruncate, frame, -1, ret, NULL, NULL, NULL);
+ return 0;
+}
+
+/* ->flush_cbk() */
+int32_t truncate_end(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ STACK_UNWIND_STRICT(truncate,
+ frame,
+ op_ret,
+ op_errno,
+ &local->prebuf,
+ &local->postbuf,
+ local->xdata);
+ return 0;
+}
+
+/* ftruncate_cbk() */
+int32_t truncate_flush(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *prebuf,
+ struct iatt *postbuf,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+ fd_t *fd = local->fd;
+ local->prebuf = *prebuf;
+ local->postbuf = *postbuf;
+
+ STACK_WIND(frame,
+ truncate_end,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->flush,
+ fd,
+ NULL);
+ fd_unref(fd);
+ return 0;
+}
+
+/*
+ * is called as ->open_cbk()
+ */
+static int32_t truncate_begin(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ fd_t *fd,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ if (op_ret < 0) {
+ fd_unref(fd);
+ STACK_UNWIND_STRICT(truncate,
+ frame,
+ op_ret,
+ op_errno, NULL, NULL, NULL);
+ return 0;
+ }
+ /*
+ * crypt_truncate() is implemented via crypt_ftruncate(),
+ * so the crypt xlator does STACK_WIND to itself here
+ */
+ STACK_WIND(frame,
+ truncate_flush,
+ this,
+ this->fops->ftruncate, /* crypt_ftruncate */
+ fd,
+ local->offset,
+ NULL);
+ return 0;
+}
+
+/*
+ * crypt_truncate() is implemented via crypt_ftruncate() as a
+ * sequence crypt_open() - crypt_ftruncate() - truncate_flush()
+ */
+int32_t crypt_truncate(call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ off_t offset,
+ dict_t *xdata)
+{
+ fd_t *fd;
+ crypt_local_t *local;
+
+#if DEBUG_CRYPT
+ gf_log(this->name, GF_LOG_DEBUG,
+ "truncate file %s at offset %llu",
+ loc->path, (unsigned long long)offset);
+#endif
+ local = crypt_alloc_local(frame, this, GF_FOP_TRUNCATE);
+ if (!local)
+ goto error;
+
+ fd = fd_create(loc->inode, frame->root->pid);
+ if (!fd) {
+ gf_log(this->name, GF_LOG_ERROR, "Can not create fd");
+ goto error;
+ }
+ local->fd = fd;
+ local->offset = offset;
+ local->xdata = xdata;
+ STACK_WIND(frame,
+ truncate_begin,
+ this,
+ this->fops->open, /* crypt_open() */
+ loc,
+ O_RDWR,
+ fd,
+ NULL);
+ return 0;
+ error:
+ STACK_UNWIND_STRICT(truncate, frame, -1, EINVAL, NULL, NULL, NULL);
+ return 0;
+}
+
+end_writeback_handler_t dispatch_end_writeback(glusterfs_fop_t fop)
+{
+ switch (fop) {
+ case GF_FOP_WRITE:
+ return end_writeback_writev;
+ case GF_FOP_FTRUNCATE:
+ return end_writeback_ftruncate;
+ default:
+ gf_log("crypt", GF_LOG_WARNING, "Bad wb operation %d", fop);
+ return NULL;
+ }
+}
+
+/*
+ * true, if the caller needs metadata string
+ */
+static int32_t is_custom_mtd(dict_t *xdata)
+{
+ data_t *data;
+ uint32_t flags;
+
+ if (!xdata)
+ return 0;
+
+ data = dict_get(xdata, MSGFLAGS_PREFIX);
+ if (!data)
+ return 0;
+ if (data->len != sizeof(uint32_t)) {
+ gf_log("crypt", GF_LOG_WARNING,
+ "Bad msgflags size (%d)", data->len);
+ return -1;
+ }
+ flags = *((uint32_t *)data->data);
+ return msgflags_check_mtd_lock(&flags);
+}
+
+static int32_t crypt_open_done(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ if (op_ret < 0)
+ gf_log(this->name, GF_LOG_WARNING, "mtd unlock failed (%d)",
+ op_errno);
+ put_one_call_open(frame);
+ return 0;
+}
+
+static void crypt_open_tail(call_frame_t *frame, xlator_t *this)
+{
+ struct gf_flock lock = {0, };
+ crypt_local_t *local = frame->local;
+
+ lock.l_type = F_UNLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ lock.l_pid = 0;
+
+ STACK_WIND(frame,
+ crypt_open_done,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->finodelk,
+ this->name,
+ local->fd,
+ F_SETLKW,
+ &lock,
+ NULL);
+}
+
+/*
+ * load private inode info at open time
+ * called as ->fgetxattr_cbk()
+ */
+static int load_mtd_open(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *dict,
+ dict_t *xdata)
+{
+ int32_t ret;
+ gf_boolean_t upload_info;
+ data_t *mtd;
+ uint64_t value = 0;
+ struct crypt_inode_info *info;
+ crypt_local_t *local = frame->local;
+ crypt_private_t *priv = this->private;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (local->fd->inode->ia_type == IA_IFLNK)
+ goto exit;
+ if (op_ret < 0)
+ goto exit;
+ /*
+ * first, check for cached info
+ */
+ ret = inode_ctx_get(local->fd->inode, this, &value);
+ if (ret != -1) {
+ info = (struct crypt_inode_info *)(long)value;
+ if (info == NULL) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "Inode info expected, but not found");
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto exit;
+ }
+ /*
+ * info has been found in the cache
+ */
+ upload_info = _gf_false;
+ }
+ else {
+ /*
+ * info hasn't been found in the cache.
+ */
+ info = alloc_inode_info(local, local->loc);
+ if (!info) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto exit;
+ }
+ init_inode_info_head(info, local->fd);
+ upload_info = _gf_true;
+ }
+ /*
+ * extract metadata
+ */
+ mtd = dict_get(dict, CRYPTO_FORMAT_PREFIX);
+ if (!mtd) {
+ local->op_ret = -1;
+ local->op_errno = ENOENT;
+ gf_log (this->name, GF_LOG_WARNING,
+ "Format string wasn't found");
+ goto exit;
+ }
+ /*
+ * authenticate metadata against the path
+ */
+ ret = open_format((unsigned char *)mtd->data,
+ mtd->len,
+ local->loc,
+ info,
+ get_master_cinfo(priv),
+ local,
+ upload_info);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = ret;
+ goto exit;
+ }
+ if (upload_info) {
+ ret = init_inode_info_tail(info, get_master_cinfo(priv));
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = ret;
+ goto exit;
+ }
+ ret = inode_ctx_put(local->fd->inode,
+ this, (uint64_t)(long)info);
+ if (ret == -1) {
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto exit;
+ }
+ }
+ if (local->custom_mtd) {
+ /*
+ * pass the metadata string to the customer
+ */
+ ret = dict_set_static_bin(local->xdata,
+ CRYPTO_FORMAT_PREFIX,
+ mtd->data,
+ mtd->len);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = ret;
+ goto exit;
+ }
+ }
+ exit:
+ if (!local->custom_mtd)
+ crypt_open_tail(frame, this);
+ else
+ put_one_call_open(frame);
+ return 0;
+}
+
+static int32_t crypt_open_finodelk_cbk(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (op_ret < 0) {
+ gf_log(this->name, GF_LOG_WARNING, "finodelk (LOCK) failed");
+ goto exit;
+ }
+ STACK_WIND(frame,
+ load_mtd_open,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr,
+ local->fd,
+ CRYPTO_FORMAT_PREFIX,
+ NULL);
+ return 0;
+ exit:
+ put_one_call_open(frame);
+ return 0;
+}
+
+/*
+ * verify metadata against the specified pathname
+ */
+static int32_t crypt_open_cbk(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ fd_t *fd,
+ dict_t *xdata)
+{
+ struct gf_flock lock = {0, };
+ crypt_local_t *local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (local->fd->inode->ia_type == IA_IFLNK)
+ goto exit;
+ if (op_ret < 0)
+ goto exit;
+ if (xdata)
+ local->xdata = dict_ref(xdata);
+ else if (local->custom_mtd){
+ local->xdata = dict_new();
+ if (!local->xdata) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ gf_log ("crypt", GF_LOG_ERROR,
+ "Can not get new dict for mtd string");
+ goto exit;
+ }
+ }
+ lock.l_len = 0;
+ lock.l_start = 0;
+ lock.l_type = local->custom_mtd ? F_WRLCK : F_RDLCK;
+ lock.l_whence = SEEK_SET;
+
+ STACK_WIND(frame,
+ crypt_open_finodelk_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->finodelk,
+ this->name,
+ fd,
+ F_SETLKW,
+ &lock,
+ NULL);
+ return 0;
+ exit:
+ put_one_call_open(frame);
+ return 0;
+}
+
+static int32_t crypt_open(call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ int32_t flags,
+ fd_t *fd,
+ dict_t *xdata)
+{
+ int32_t ret = ENOMEM;
+ crypt_local_t *local;
+
+ local = crypt_alloc_local(frame, this, GF_FOP_OPEN);
+ if (!local)
+ goto error;
+ local->loc = GF_CALLOC(1, sizeof(*loc), gf_crypt_mt_loc);
+ if (!local->loc) {
+ ret = ENOMEM;
+ goto error;
+ }
+ memset(local->loc, 0, sizeof(*local->loc));
+ ret = loc_copy(local->loc, loc);
+ if (ret) {
+ GF_FREE(local->loc);
+ goto error;
+ }
+ local->fd = fd_ref(fd);
+
+ ret = is_custom_mtd(xdata);
+ if (ret < 0) {
+ loc_wipe(local->loc);
+ GF_FREE(local->loc);
+ ret = EINVAL;
+ goto error;
+ }
+ local->custom_mtd = ret;
+
+ if ((flags & O_ACCMODE) == O_WRONLY)
+ /*
+ * we can't open O_WRONLY, because
+ * we need to do read-modify-write
+ */
+ flags = (flags & ~O_ACCMODE) | O_RDWR;
+ /*
+ * Make sure that out translated offsets
+ * and counts won't be ignored
+ */
+ flags &= ~O_APPEND;
+ get_one_call_nolock(frame);
+ STACK_WIND(frame,
+ crypt_open_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open,
+ loc,
+ flags,
+ fd,
+ xdata);
+ return 0;
+ error:
+ STACK_UNWIND_STRICT(open,
+ frame,
+ -1,
+ ret,
+ NULL,
+ NULL);
+ return 0;
+}
+
+static int32_t init_inode_info_tail(struct crypt_inode_info *info,
+ struct master_cipher_info *master)
+{
+ int32_t ret;
+ struct object_cipher_info *object = &info->cinfo;
+
+#if DEBUG_CRYPT
+ gf_log("crypt", GF_LOG_DEBUG, "Init inode info for object %s",
+ uuid_utoa(info->oid));
+#endif
+ ret = data_cipher_algs[object->o_alg][object->o_mode].set_private(info,
+ master);
+ if (ret) {
+ gf_log("crypt", GF_LOG_ERROR, "Set private info failed");
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Init inode info at ->create() time
+ */
+static void init_inode_info_create(struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ data_t *data)
+{
+ struct object_cipher_info *object;
+
+ info->nr_minor = CRYPT_XLATOR_ID;
+ memcpy(info->oid, data->data, data->len);
+
+ object = &info->cinfo;
+
+ object->o_alg = master->m_alg;
+ object->o_mode = master->m_mode;
+ object->o_block_bits = master->m_block_bits;
+ object->o_dkey_size = master->m_dkey_size;
+}
+
+static void init_inode_info_head(struct crypt_inode_info *info, fd_t *fd)
+{
+ memcpy(info->oid, fd->inode->gfid, sizeof(uuid_t));
+}
+
+static int32_t crypt_create_done(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ crypt_private_t *priv = this->private;
+ crypt_local_t *local = frame->local;
+ struct crypt_inode_info *info = local->info;
+ fd_t *local_fd = local->fd;
+ dict_t *local_xdata = local->xdata;
+ inode_t *local_inode = local->inode;
+
+ if (op_ret < 0) {
+ free_inode_info(info);
+ goto unwind;
+ }
+ op_errno = init_inode_info_tail(info, get_master_cinfo(priv));
+ if (op_errno) {
+ op_ret = -1;
+ free_inode_info(info);
+ goto unwind;
+ }
+ /*
+ * FIXME: drop major subversion number
+ */
+ op_ret = inode_ctx_put(local->fd->inode, this, (uint64_t)(long)info);
+ if (op_ret == -1) {
+ op_errno = EIO;
+ free_inode_info(info);
+ goto unwind;
+ }
+ unwind:
+ free_format(local);
+ STACK_UNWIND_STRICT(create,
+ frame,
+ op_ret,
+ op_errno,
+ local_fd,
+ local_inode,
+ &local->buf,
+ &local->prebuf,
+ &local->postbuf,
+ local_xdata);
+ fd_unref(local_fd);
+ inode_unref(local_inode);
+ if (local_xdata)
+ dict_unref(local_xdata);
+ return 0;
+}
+
+static int crypt_create_tail(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ struct gf_flock lock = {0, };
+ crypt_local_t *local = frame->local;
+ fd_t *local_fd = local->fd;
+ dict_t *local_xdata = local->xdata;
+ inode_t *local_inode = local->inode;
+
+ dict_unref(local->xattr);
+
+ if (op_ret < 0)
+ goto error;
+
+ lock.l_type = F_UNLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ lock.l_pid = 0;
+
+ STACK_WIND(frame,
+ crypt_create_done,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->finodelk,
+ this->name,
+ local->fd,
+ F_SETLKW,
+ &lock,
+ NULL);
+ return 0;
+ error:
+ free_inode_info(local->info);
+ free_format(local);
+
+ STACK_UNWIND_STRICT(create,
+ frame,
+ op_ret,
+ op_errno,
+ local_fd,
+ local_inode,
+ &local->buf,
+ &local->prebuf,
+ &local->postbuf,
+ local_xdata);
+
+ fd_unref(local_fd);
+ inode_unref(local_inode);
+ if (local_xdata)
+ dict_unref(local_xdata);
+ return 0;
+}
+
+static int32_t crypt_create_finodelk_cbk(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+ struct crypt_inode_info *info = local->info;
+
+ if (op_ret < 0)
+ goto error;
+
+ STACK_WIND(frame,
+ crypt_create_tail,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr,
+ local->fd,
+ local->xattr, /* CRYPTO_FORMAT_PREFIX */
+ 0,
+ NULL);
+ return 0;
+ error:
+ free_inode_info(info);
+ free_format(local);
+ fd_unref(local->fd);
+ dict_unref(local->xattr);
+ if (local->xdata)
+ dict_unref(local->xdata);
+
+ STACK_UNWIND_STRICT(create,
+ frame,
+ op_ret,
+ op_errno,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL);
+ return 0;
+}
+
+/*
+ * Create and store crypt-specific format on disk;
+ * Populate cache with private inode info
+ */
+static int32_t crypt_create_cbk(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ fd_t *fd,
+ inode_t *inode,
+ struct iatt *buf,
+ struct iatt *preparent,
+ struct iatt *postparent,
+ dict_t *xdata)
+{
+ struct gf_flock lock = {0, };
+ crypt_local_t *local = frame->local;
+ struct crypt_inode_info *info = local->info;
+
+ if (op_ret < 0)
+ goto error;
+ if (xdata)
+ local->xdata = dict_ref(xdata);
+ local->inode = inode_ref(inode);
+ local->buf = *buf;
+ local->prebuf = *preparent;
+ local->postbuf = *postparent;
+
+ lock.l_len = 0;
+ lock.l_start = 0;
+ lock.l_type = F_WRLCK;
+ lock.l_whence = SEEK_SET;
+
+ STACK_WIND(frame,
+ crypt_create_finodelk_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->finodelk,
+ this->name,
+ local->fd,
+ F_SETLKW,
+ &lock,
+ NULL);
+ return 0;
+ error:
+ free_inode_info(info);
+ free_format(local);
+ fd_unref(local->fd);
+ dict_unref(local->xattr);
+
+ STACK_UNWIND_STRICT(create,
+ frame,
+ op_ret,
+ op_errno,
+ NULL, NULL, NULL,
+ NULL, NULL, NULL);
+ return 0;
+}
+
+static int32_t crypt_create(call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ int32_t flags,
+ mode_t mode,
+ mode_t umask,
+ fd_t *fd,
+ dict_t *xdata)
+{
+ int ret;
+ data_t *data;
+ crypt_local_t *local;
+ crypt_private_t *priv;
+ struct master_cipher_info *master;
+ struct crypt_inode_info *info;
+
+ priv = this->private;
+ master = get_master_cinfo(priv);
+
+ if (master_alg_atomic(master)) {
+ /*
+ * We can't open O_WRONLY, because we
+ * need to do read-modify-write.
+ */
+ if ((flags & O_ACCMODE) == O_WRONLY)
+ flags = (flags & ~O_ACCMODE) | O_RDWR;
+ /*
+ * Make sure that out translated offsets
+ * and counts won't be ignored
+ */
+ flags &= ~O_APPEND;
+ }
+ local = crypt_alloc_local(frame, this, GF_FOP_CREATE);
+ if (!local) {
+ ret = ENOMEM;
+ goto error;
+ }
+ data = dict_get(xdata, "gfid-req");
+ if (!data) {
+ ret = EINVAL;
+ gf_log("crypt", GF_LOG_WARNING, "gfid not found");
+ goto error;
+ }
+ if (data->len != sizeof(uuid_t)) {
+ ret = EINVAL;
+ gf_log("crypt", GF_LOG_WARNING,
+ "bad gfid size (%d), should be %d",
+ (int)data->len, (int)sizeof(uuid_t));
+ goto error;
+ }
+ info = alloc_inode_info(local, loc);
+ if (!info){
+ ret = ENOMEM;
+ goto error;
+ }
+ /*
+ * NOTE:
+ * format has to be created BEFORE
+ * proceeding to the untrusted server
+ */
+ ret = alloc_format_create(local);
+ if (ret) {
+ free_inode_info(info);
+ goto error;
+ }
+ init_inode_info_create(info, master, data);
+
+ ret = create_format(local->format,
+ loc,
+ info,
+ master);
+ if (ret) {
+ free_inode_info(info);
+ goto error;
+ }
+ local->xattr = dict_new();
+ if (!local->xattr) {
+ free_inode_info(info);
+ free_format(local);
+ goto error;
+ }
+ ret = dict_set_static_bin(local->xattr,
+ CRYPTO_FORMAT_PREFIX,
+ local->format,
+ new_format_size());
+ if (ret) {
+ dict_unref(local->xattr);
+ free_inode_info(info);
+ free_format(local);
+ goto error;
+ }
+ ret = dict_set(local->xattr, FSIZE_XATTR_PREFIX, data_from_uint64(0));
+ if (ret) {
+ dict_unref(local->xattr);
+ free_inode_info(info);
+ free_format(local);
+ goto error;
+ }
+ local->fd = fd_ref(fd);
+
+ STACK_WIND(frame,
+ crypt_create_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->create,
+ loc,
+ flags,
+ mode,
+ umask,
+ fd,
+ xdata);
+ return 0;
+ error:
+ gf_log("crypt", GF_LOG_WARNING, "can not create file");
+ STACK_UNWIND_STRICT(create,
+ frame,
+ -1,
+ ret,
+ NULL, NULL, NULL,
+ NULL, NULL, NULL);
+ return 0;
+}
+
+/*
+ * FIXME: this should depends on the version of format string
+ */
+static int32_t filter_crypt_xattr(dict_t *dict,
+ char *key, data_t *value, void *data)
+{
+ dict_del(dict, key);
+ return 0;
+}
+
+static int32_t crypt_fsetxattr(call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ dict_foreach_fnmatch(dict, "trusted.glusterfs.crypt*",
+ filter_crypt_xattr, NULL);
+ STACK_WIND(frame,
+ default_fsetxattr_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr,
+ fd,
+ dict,
+ flags,
+ xdata);
+ return 0;
+}
+
+/*
+ * TBD: verify file metadata before wind
+ */
+static int32_t crypt_setxattr(call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ dict_foreach_fnmatch(dict, "trusted.glusterfs.crypt*",
+ filter_crypt_xattr, NULL);
+ STACK_WIND(frame,
+ default_setxattr_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setxattr,
+ loc,
+ dict,
+ flags,
+ xdata);
+ return 0;
+}
+
+/*
+ * called as flush_cbk()
+ */
+static int32_t linkop_end(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+ linkop_unwind_handler_t unwind_fn;
+ unwind_fn = linkop_unwind_dispatch(local->fop);
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (op_ret < 0 &&
+ op_errno == ENOENT &&
+ local->loc->inode->ia_type == IA_IFLNK) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ }
+ unwind_fn(frame);
+ return 0;
+}
+
+/*
+ * unpin inode on the server
+ */
+static int32_t link_flush(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ inode_t *inode,
+ struct iatt *buf,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ if (op_ret < 0)
+ goto error;
+ if (local->xdata) {
+ dict_unref(local->xdata);
+ local->xdata = NULL;
+ }
+ if (xdata)
+ local->xdata = dict_ref(xdata);
+ local->inode = inode_ref(inode);
+ local->buf = *buf;
+ local->prebuf = *preparent;
+ local->postbuf = *postparent;
+
+ STACK_WIND(frame,
+ linkop_end,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->flush,
+ local->fd,
+ NULL);
+ return 0;
+ error:
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ link_unwind(frame);
+ return 0;
+}
+
+void link_unwind(call_frame_t *frame)
+{
+ crypt_local_t *local = frame->local;
+ dict_t *xdata;
+ dict_t *xattr;
+ inode_t *inode;
+
+ if (!local) {
+ STACK_UNWIND_STRICT(link,
+ frame,
+ -1,
+ ENOMEM,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL);
+ return;
+ }
+ xdata = local->xdata;
+ xattr = local->xattr;
+ inode = local->inode;
+
+ if (local->loc){
+ loc_wipe(local->loc);
+ GF_FREE(local->loc);
+ }
+ if (local->newloc) {
+ loc_wipe(local->newloc);
+ GF_FREE(local->newloc);
+ }
+ if (local->fd)
+ fd_unref(local->fd);
+ if (local->format)
+ GF_FREE(local->format);
+
+ STACK_UNWIND_STRICT(link,
+ frame,
+ local->op_ret,
+ local->op_errno,
+ inode,
+ &local->buf,
+ &local->prebuf,
+ &local->postbuf,
+ xdata);
+ if (xdata)
+ dict_unref(xdata);
+ if (xattr)
+ dict_unref(xattr);
+ if (inode)
+ inode_unref(inode);
+}
+
+void link_wind(call_frame_t *frame, xlator_t *this)
+{
+ crypt_local_t *local = frame->local;
+
+ STACK_WIND(frame,
+ link_flush,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->link,
+ local->loc,
+ local->newloc,
+ local->xdata);
+}
+
+/*
+ * unlink()
+ */
+static int32_t unlink_flush(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ if (op_ret < 0)
+ goto error;
+ local->prebuf = *preparent;
+ local->postbuf = *postparent;
+ if (local->xdata) {
+ dict_unref(local->xdata);
+ local->xdata = NULL;
+ }
+ if (xdata)
+ local->xdata = dict_ref(xdata);
+
+ STACK_WIND(frame,
+ linkop_end,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->flush,
+ local->fd,
+ NULL);
+ return 0;
+ error:
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ unlink_unwind(frame);
+ return 0;
+}
+
+void unlink_unwind(call_frame_t *frame)
+{
+ crypt_local_t *local = frame->local;
+ dict_t *xdata;
+ dict_t *xattr;
+
+ if (!local) {
+ STACK_UNWIND_STRICT(unlink,
+ frame,
+ -1,
+ ENOMEM,
+ NULL,
+ NULL,
+ NULL);
+ return;
+ }
+ xdata = local->xdata;
+ xattr = local->xattr;
+ if (local->loc){
+ loc_wipe(local->loc);
+ GF_FREE(local->loc);
+ }
+ if (local->fd)
+ fd_unref(local->fd);
+ if (local->format)
+ GF_FREE(local->format);
+
+ STACK_UNWIND_STRICT(unlink,
+ frame,
+ local->op_ret,
+ local->op_errno,
+ &local->prebuf,
+ &local->postbuf,
+ xdata);
+ if (xdata)
+ dict_unref(xdata);
+ if (xattr)
+ dict_unref(xattr);
+}
+
+void unlink_wind(call_frame_t *frame, xlator_t *this)
+{
+ crypt_local_t *local = frame->local;
+
+ STACK_WIND(frame,
+ unlink_flush,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->unlink,
+ local->loc,
+ local->flags,
+ local->xdata);
+}
+
+void rename_unwind(call_frame_t *frame)
+{
+ crypt_local_t *local = frame->local;
+ dict_t *xdata;
+ dict_t *xattr;
+ struct iatt *prenewparent;
+ struct iatt *postnewparent;
+
+ if (!local) {
+ STACK_UNWIND_STRICT(rename,
+ frame,
+ -1,
+ ENOMEM,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL);
+ return;
+ }
+ xdata = local->xdata;
+ xattr = local->xattr;
+ prenewparent = local->prenewparent;
+ postnewparent = local->postnewparent;
+
+ if (local->loc){
+ loc_wipe(local->loc);
+ GF_FREE(local->loc);
+ }
+ if (local->newloc){
+ loc_wipe(local->newloc);
+ GF_FREE(local->newloc);
+ }
+ if (local->fd)
+ fd_unref(local->fd);
+ if (local->format)
+ GF_FREE(local->format);
+
+ STACK_UNWIND_STRICT(rename,
+ frame,
+ local->op_ret,
+ local->op_errno,
+ &local->buf,
+ &local->prebuf,
+ &local->postbuf,
+ prenewparent,
+ postnewparent,
+ xdata);
+ if (xdata)
+ dict_unref(xdata);
+ if (xattr)
+ dict_unref(xattr);
+ if (prenewparent)
+ GF_FREE(prenewparent);
+ if (postnewparent)
+ GF_FREE(postnewparent);
+}
+
+/*
+ * called as flush_cbk()
+ */
+static int32_t rename_end(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ rename_unwind(frame);
+ return 0;
+}
+
+static int32_t rename_flush(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *buf,
+ struct iatt *preoldparent,
+ struct iatt *postoldparent,
+ struct iatt *prenewparent,
+ struct iatt *postnewparent,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ if (op_ret < 0)
+ goto error;
+ dict_unref(local->xdata);
+ local->xdata = NULL;
+ if (xdata)
+ local->xdata = dict_ref(xdata);
+
+ local->buf = *buf;
+ local->prebuf = *preoldparent;
+ local->postbuf = *postoldparent;
+ if (prenewparent) {
+ local->prenewparent = GF_CALLOC(1, sizeof(*prenewparent),
+ gf_crypt_mt_iatt);
+ if (!local->prenewparent) {
+ op_errno = ENOMEM;
+ goto error;
+ }
+ *local->prenewparent = *prenewparent;
+ }
+ if (postnewparent) {
+ local->postnewparent = GF_CALLOC(1, sizeof(*postnewparent),
+ gf_crypt_mt_iatt);
+ if (!local->postnewparent) {
+ op_errno = ENOMEM;
+ goto error;
+ }
+ *local->postnewparent = *postnewparent;
+ }
+ STACK_WIND(frame,
+ rename_end,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->flush,
+ local->fd,
+ NULL);
+ return 0;
+ error:
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ rename_unwind(frame);
+ return 0;
+}
+
+void rename_wind(call_frame_t *frame, xlator_t *this)
+{
+ crypt_local_t *local = frame->local;
+
+ STACK_WIND(frame,
+ rename_flush,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rename,
+ local->loc,
+ local->newloc,
+ local->xdata);
+}
+
+static int32_t __do_linkop(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+ linkop_wind_handler_t wind_fn;
+ linkop_unwind_handler_t unwind_fn;
+
+ wind_fn = linkop_wind_dispatch(local->fop);
+ unwind_fn = linkop_unwind_dispatch(local->fop);
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if (op_ret >= 0)
+ wind_fn(frame, this);
+ else {
+ gf_log(this->name, GF_LOG_WARNING, "mtd unlock failed (%d)",
+ op_errno);
+ unwind_fn(frame);
+ }
+ return 0;
+}
+
+static int32_t do_linkop(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ struct gf_flock lock = {0, };
+ crypt_local_t *local = frame->local;
+ linkop_unwind_handler_t unwind_fn;
+
+ unwind_fn = linkop_unwind_dispatch(local->fop);
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ if(op_ret < 0)
+ goto error;
+
+ lock.l_type = F_UNLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ lock.l_pid = 0;
+
+ STACK_WIND(frame,
+ __do_linkop,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->finodelk,
+ this->name,
+ local->fd,
+ F_SETLKW,
+ &lock,
+ NULL);
+ return 0;
+ error:
+ unwind_fn(frame);
+ return 0;
+}
+
+/*
+ * Update the metadata string (against the new pathname);
+ * submit the result
+ */
+static int32_t linkop_begin(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ fd_t *fd,
+ dict_t *xdata)
+{
+ gf_boolean_t upload_info;
+ crypt_local_t *local = frame->local;
+ crypt_private_t *priv = this->private;
+ struct crypt_inode_info *info;
+ data_t *old_mtd;
+ uint32_t new_mtd_size;
+ uint64_t value = 0;
+ void (*unwind_fn)(call_frame_t *frame);
+ void (*wind_fn)(call_frame_t *frame, xlator_t *this);
+ mtd_op_t mop;
+
+ wind_fn = linkop_wind_dispatch(local->fop);
+ unwind_fn = linkop_unwind_dispatch(local->fop);
+ mop = linkop_mtdop_dispatch(local->fop);
+
+ if (local->fd->inode->ia_type == IA_IFLNK)
+ goto wind;
+ if (op_ret < 0)
+ /*
+ * verification failed
+ */
+ goto error;
+
+ old_mtd = dict_get(xdata, CRYPTO_FORMAT_PREFIX);
+ if (!old_mtd) {
+ op_errno = EIO;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Metadata string wasn't found");
+ goto error;
+ }
+ new_mtd_size = format_size(mop, old_mtd->len);
+ op_errno = alloc_format(local, new_mtd_size);
+ if (op_errno)
+ goto error;
+ /*
+ * check for cached info
+ */
+ op_ret = inode_ctx_get(fd->inode, this, &value);
+ if (op_ret != -1) {
+ info = (struct crypt_inode_info *)(long)value;
+ if (info == NULL) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Inode info was not found");
+ op_errno = EINVAL;
+ goto error;
+ }
+ /*
+ * info was found in the cache
+ */
+ local->info = info;
+ upload_info = _gf_false;
+ }
+ else {
+ /*
+ * info wasn't found in the cache;
+ */
+ info = alloc_inode_info(local, local->loc);
+ if (!info)
+ goto error;
+ init_inode_info_head(info, fd);
+ local->info = info;
+ upload_info = _gf_true;
+ }
+ op_errno = open_format((unsigned char *)old_mtd->data,
+ old_mtd->len,
+ local->loc,
+ info,
+ get_master_cinfo(priv),
+ local,
+ upload_info);
+ if (op_errno)
+ goto error;
+ if (upload_info == _gf_true) {
+ op_errno = init_inode_info_tail(info,
+ get_master_cinfo(priv));
+ if (op_errno)
+ goto error;
+ op_errno = inode_ctx_put(fd->inode, this,
+ (uint64_t)(long)(info));
+ if (op_errno == -1) {
+ op_errno = EIO;
+ goto error;
+ }
+ }
+ /*
+ * update the format string (append/update/cup a MAC)
+ */
+ op_errno = update_format(local->format,
+ (unsigned char *)old_mtd->data,
+ old_mtd->len,
+ local->mac_idx,
+ mop,
+ local->newloc,
+ info,
+ get_master_cinfo(priv),
+ local);
+ if (op_errno)
+ goto error;
+ /*
+ * store the new format string on the server
+ */
+ if (new_mtd_size) {
+ op_errno = dict_set_static_bin(local->xattr,
+ CRYPTO_FORMAT_PREFIX,
+ local->format,
+ new_mtd_size);
+ if (op_errno)
+ goto error;
+ }
+ STACK_WIND(frame,
+ do_linkop,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setxattr,
+ local->loc,
+ local->xattr,
+ 0,
+ NULL);
+ return 0;
+ wind:
+ wind_fn(frame, this);
+ return 0;
+ error:
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ unwind_fn(frame);
+ return 0;
+}
+
+static int32_t linkop_grab_local(call_frame_t *frame,
+ xlator_t *this,
+ loc_t *oldloc,
+ loc_t *newloc,
+ int flags, dict_t *xdata,
+ glusterfs_fop_t op)
+{
+ int32_t ret = ENOMEM;
+ fd_t *fd;
+ crypt_local_t *local;
+
+ local = crypt_alloc_local(frame, this, op);
+ if (!local)
+ goto error;
+ if (xdata)
+ local->xdata = dict_ref(xdata);
+
+ fd = fd_create(oldloc->inode, frame->root->pid);
+ if (!fd) {
+ gf_log(this->name, GF_LOG_ERROR, "Can not create fd");
+ goto error;
+ }
+ local->fd = fd;
+ local->flags = flags;
+ local->loc = GF_CALLOC(1, sizeof(*oldloc), gf_crypt_mt_loc);
+ if (!local->loc)
+ goto error;
+ memset(local->loc, 0, sizeof(*local->loc));
+ ret = loc_copy(local->loc, oldloc);
+ if (ret) {
+ GF_FREE(local->loc);
+ local->loc = NULL;
+ goto error;
+ }
+ if (newloc) {
+ local->newloc = GF_CALLOC(1, sizeof(*newloc), gf_crypt_mt_loc);
+ if (!local->newloc) {
+ loc_wipe(local->loc);
+ GF_FREE(local->loc);
+ goto error;
+ }
+ memset(local->newloc, 0, sizeof(*local->newloc));
+ ret = loc_copy(local->newloc, newloc);
+ if (ret) {
+ loc_wipe(local->loc);
+ GF_FREE(local->loc);
+ GF_FREE(local->newloc);
+ goto error;
+ }
+ }
+ local->xattr = dict_new();
+ if (!local->xattr) {
+ gf_log(this->name, GF_LOG_ERROR, "Can not create dict");
+ ret = ENOMEM;
+ goto error;
+ }
+ return 0;
+
+error:
+ if (local) {
+ if (local->xdata)
+ dict_unref(local->xdata);
+ if (local->fd)
+ fd_unref(local->fd);
+ local->fd = 0;
+ local->loc = NULL;
+ local->newloc = NULL;
+ local->op_ret = -1;
+ local->op_errno = ret;
+ }
+
+ return ret;
+}
+
+/*
+ * read and verify locked metadata against the old pathname (via open);
+ * update the metadata string in accordance with the new pathname;
+ * submit modified metadata;
+ * wind;
+ */
+static int32_t linkop(call_frame_t *frame,
+ xlator_t *this,
+ loc_t *oldloc,
+ loc_t *newloc,
+ int flags,
+ dict_t *xdata,
+ glusterfs_fop_t op)
+{
+ int32_t ret;
+ dict_t *dict;
+ crypt_local_t *local;
+ void (*unwind_fn)(call_frame_t *frame);
+
+ unwind_fn = linkop_unwind_dispatch(op);
+
+ ret = linkop_grab_local(frame, this, oldloc, newloc, flags, xdata, op);
+ local = frame->local;
+ if (ret)
+ goto error;
+ dict = dict_new();
+ if (!dict) {
+ gf_log(this->name, GF_LOG_ERROR, "Can not create dict");
+ ret = ENOMEM;
+ goto error;
+ }
+ /*
+ * Set a message to crypt_open() that we need
+ * locked metadata string.
+ * All link operations (link, unlink, rename)
+ * need write lock
+ */
+ msgflags_set_mtd_wlock(&local->msgflags);
+ ret = dict_set_static_bin(dict,
+ MSGFLAGS_PREFIX,
+ &local->msgflags,
+ sizeof(local->msgflags));
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR, "Can not set dict");
+ dict_unref(dict);
+ goto error;
+ }
+ /*
+ * verify metadata against the old pathname
+ * and retrieve locked metadata string
+ */
+ STACK_WIND(frame,
+ linkop_begin,
+ this,
+ this->fops->open, /* crypt_open() */
+ oldloc,
+ O_RDWR,
+ local->fd,
+ dict);
+ dict_unref(dict);
+ return 0;
+ error:
+ local->op_ret = -1;
+ local->op_errno = ret;
+ unwind_fn(frame);
+ return 0;
+}
+
+static int32_t crypt_link(call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ return linkop(frame, this, oldloc, newloc, 0, xdata, GF_FOP_LINK);
+}
+
+static int32_t crypt_unlink(call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int flags, dict_t *xdata)
+{
+ return linkop(frame, this, loc, NULL, flags, xdata, GF_FOP_UNLINK);
+}
+
+static int32_t crypt_rename(call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ return linkop(frame, this, oldloc, newloc, 0, xdata, GF_FOP_RENAME);
+}
+
+static void put_one_call_open(call_frame_t *frame)
+{
+ crypt_local_t *local = frame->local;
+ if (put_one_call(local)) {
+ fd_t *fd = local->fd;
+ loc_t *loc = local->loc;
+ dict_t *xdata = local->xdata;
+
+ STACK_UNWIND_STRICT(open,
+ frame,
+ local->op_ret,
+ local->op_errno,
+ fd,
+ xdata);
+ fd_unref(fd);
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(loc);
+ GF_FREE(loc);
+ }
+}
+
+static int32_t __crypt_readv_done(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+ fd_t *local_fd = local->fd;
+ dict_t *local_xdata = local->xdata;
+ /* read deals with data configs only */
+ struct iovec *avec = local->data_conf.avec;
+ char **pool = local->data_conf.pool;
+ int blocks_in_pool = local->data_conf.blocks_in_pool;
+ struct iobref *iobref = local->iobref;
+ struct iobref *iobref_data = local->iobref_data;
+
+ if (op_ret < 0) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "readv unlock failed (%d)", op_errno);
+ if (local->op_ret >= 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ }
+ }
+ dump_plain_text(local, avec);
+
+ gf_log("crypt", GF_LOG_DEBUG,
+ "readv: ret_to_user: %d, iovec len: %d, ia_size: %llu",
+ (int)(local->rw_count > 0 ? local->rw_count : local->op_ret),
+ (int)(local->rw_count > 0 ? iovec_get_size(avec, local->data_conf.acount) : 0),
+ (unsigned long long)local->buf.ia_size);
+
+ STACK_UNWIND_STRICT(readv,
+ frame,
+ local->rw_count > 0 ? local->rw_count : local->op_ret,
+ local->op_errno,
+ avec,
+ avec ? local->data_conf.acount : 0,
+ &local->buf,
+ local->iobref,
+ local_xdata);
+
+ free_avec(avec, pool, blocks_in_pool);
+ fd_unref(local_fd);
+ if (local_xdata)
+ dict_unref(local_xdata);
+ if (iobref)
+ iobref_unref(iobref);
+ if (iobref_data)
+ iobref_unref(iobref_data);
+ return 0;
+}
+
+static void crypt_readv_done(call_frame_t *frame, xlator_t *this)
+{
+ if (parent_is_crypt_xlator(frame, this))
+ /*
+ * don't unlock (it will be done by the parent)
+ */
+ __crypt_readv_done(frame, NULL, this, 0, 0, NULL);
+ else {
+ crypt_local_t *local = frame->local;
+ struct gf_flock lock = {0, };
+
+ lock.l_type = F_UNLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ lock.l_pid = 0;
+
+ STACK_WIND(frame,
+ __crypt_readv_done,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->finodelk,
+ this->name,
+ local->fd,
+ F_SETLKW,
+ &lock,
+ NULL);
+ }
+}
+
+static void put_one_call_readv(call_frame_t *frame, xlator_t *this)
+{
+ crypt_local_t *local = frame->local;
+ if (put_one_call(local))
+ crypt_readv_done(frame, this);
+}
+
+static int32_t __crypt_writev_done(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+ fd_t *local_fd = local->fd;
+ dict_t *local_xdata = local->xdata;
+ int32_t ret_to_user;
+
+ if (local->xattr)
+ dict_unref(local->xattr);
+ /*
+ * Calculate amout of butes to be returned
+ * to user. We need to subtract paddings that
+ * have been written as a part of atom.
+ */
+ /*
+ * subtract head padding
+ */
+ if (local->rw_count == 0)
+ /*
+ * Nothing has been written, it must be an error
+ */
+ ret_to_user = local->op_ret;
+ else if (local->rw_count <= local->data_conf.off_in_head) {
+ gf_log("crypt", GF_LOG_WARNING, "Incomplete write");
+ ret_to_user = 0;
+ }
+ else
+ ret_to_user = local->rw_count -
+ local->data_conf.off_in_head;
+ /*
+ * subtract tail padding
+ */
+ if (ret_to_user > local->data_conf.orig_size)
+ ret_to_user = local->data_conf.orig_size;
+
+ if (local->iobref)
+ iobref_unref(local->iobref);
+ if (local->iobref_data)
+ iobref_unref(local->iobref_data);
+ free_avec_data(local);
+ free_avec_hole(local);
+
+ gf_log("crypt", GF_LOG_DEBUG,
+ "writev: ret_to_user: %d", ret_to_user);
+
+ STACK_UNWIND_STRICT(writev,
+ frame,
+ ret_to_user,
+ local->op_errno,
+ &local->prebuf,
+ &local->postbuf,
+ local_xdata);
+ fd_unref(local_fd);
+ if (local_xdata)
+ dict_unref(local_xdata);
+ return 0;
+}
+
+static int32_t crypt_writev_done(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ if (op_ret < 0)
+ gf_log("crypt", GF_LOG_WARNING, "can not update file size");
+
+ if (parent_is_crypt_xlator(frame, this))
+ /*
+ * don't unlock (it will be done by the parent)
+ */
+ __crypt_writev_done(frame, NULL, this, 0, 0, NULL);
+ else {
+ struct gf_flock lock = {0, };
+
+ lock.l_type = F_UNLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ lock.l_pid = 0;
+
+ STACK_WIND(frame,
+ __crypt_writev_done,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->finodelk,
+ this->name,
+ local->fd,
+ F_SETLKW,
+ &lock,
+ NULL);
+ }
+ return 0;
+}
+
+static void put_one_call_writev(call_frame_t *frame, xlator_t *this)
+{
+ crypt_local_t *local = frame->local;
+ if (put_one_call(local)) {
+ if (local->update_disk_file_size) {
+ int32_t ret;
+ /*
+ * update file size, unlock the file and unwind
+ */
+ ret = dict_set(local->xattr,
+ FSIZE_XATTR_PREFIX,
+ data_from_uint64(local->cur_file_size));
+ if (ret) {
+ gf_log("crypt", GF_LOG_WARNING,
+ "can not set key to update file size");
+ crypt_writev_done(frame, NULL,
+ this, 0, 0, NULL);
+ return;
+ }
+ gf_log("crypt", GF_LOG_DEBUG,
+ "Updating disk file size to %llu",
+ (unsigned long long)local->cur_file_size);
+ STACK_WIND(frame,
+ crypt_writev_done,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr,
+ local->fd,
+ local->xattr, /* CRYPTO_FORMAT_PREFIX */
+ 0,
+ NULL);
+ }
+ else
+ crypt_writev_done(frame, NULL, this, 0, 0, NULL);
+ }
+}
+
+static int32_t __crypt_ftruncate_done(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+ fd_t *local_fd = local->fd;
+ dict_t *local_xdata = local->xdata;
+ char *iobase = local->vec.iov_base;
+
+ if (op_ret < 0) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "ftruncate unlock failed (%d)", op_errno);
+ if (local->op_ret >= 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ }
+ }
+ if (local->iobref_data)
+ iobref_unref(local->iobref_data);
+ free_avec_data(local);
+ free_avec_hole(local);
+
+ gf_log("crypt", GF_LOG_DEBUG,
+ "ftruncate, return to user: presize=%llu, postsize=%llu",
+ (unsigned long long)local->prebuf.ia_size,
+ (unsigned long long)local->postbuf.ia_size);
+
+ STACK_UNWIND_STRICT(ftruncate,
+ frame,
+ local->op_ret < 0 ? -1 : 0,
+ local->op_errno,
+ &local->prebuf,
+ &local->postbuf,
+ local_xdata);
+ fd_unref(local_fd);
+ if (local_xdata)
+ dict_unref(local_xdata);
+ if (iobase)
+ GF_FREE(iobase);
+ return 0;
+}
+
+static int32_t crypt_ftruncate_done(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+ struct gf_flock lock = {0, };
+
+ dict_unref(local->xattr);
+ if (op_ret < 0)
+ gf_log("crypt", GF_LOG_WARNING, "can not update file size");
+
+ lock.l_type = F_UNLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ lock.l_pid = 0;
+
+ STACK_WIND(frame,
+ __crypt_ftruncate_done,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->finodelk,
+ this->name,
+ local->fd,
+ F_SETLKW,
+ &lock,
+ NULL);
+ return 0;
+}
+
+static void put_one_call_ftruncate(call_frame_t *frame, xlator_t *this)
+{
+ crypt_local_t *local = frame->local;
+ if (put_one_call(local)) {
+ if (local->update_disk_file_size) {
+ int32_t ret;
+ /*
+ * update file size, unlock the file and unwind
+ */
+ ret = dict_set(local->xattr,
+ FSIZE_XATTR_PREFIX,
+ data_from_uint64(local->cur_file_size));
+ if (ret) {
+ gf_log("crypt", GF_LOG_WARNING,
+ "can not set key to update file size");
+ crypt_ftruncate_done(frame, NULL,
+ this, 0, 0, NULL);
+ return;
+ }
+ gf_log("crypt", GF_LOG_DEBUG,
+ "Updating disk file size to %llu",
+ (unsigned long long)local->cur_file_size);
+ STACK_WIND(frame,
+ crypt_ftruncate_done,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr,
+ local->fd,
+ local->xattr, /* CRYPTO_FORMAT_PREFIX */
+ 0,
+ NULL);
+ }
+ else
+ crypt_ftruncate_done(frame, NULL, this, 0, 0, NULL);
+ }
+}
+
+/*
+ * load regular file size for some FOPs
+ */
+static int32_t load_file_size(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ dict_t *dict,
+ dict_t *xdata)
+{
+ data_t *data;
+ crypt_local_t *local = frame->local;
+
+ dict_t *local_xdata = local->xdata;
+ inode_t *local_inode = local->inode;
+
+ if (op_ret < 0)
+ goto unwind;
+ /*
+ * load regular file size
+ */
+ data = dict_get(dict, FSIZE_XATTR_PREFIX);
+ if (!data) {
+ if (local->xdata)
+ dict_unref(local->xdata);
+ gf_log("crypt", GF_LOG_WARNING, "Regular file size not found");
+ op_ret = -1;
+ op_errno = EIO;
+ goto unwind;
+ }
+ local->buf.ia_size = data_to_uint64(data);
+
+ gf_log(this->name, GF_LOG_DEBUG,
+ "FOP %d: Translate regular file to %llu",
+ local->fop,
+ (unsigned long long)local->buf.ia_size);
+ unwind:
+ if (local->fd)
+ fd_unref(local->fd);
+ if (local->loc) {
+ loc_wipe(local->loc);
+ GF_FREE(local->loc);
+ }
+ switch (local->fop) {
+ case GF_FOP_FSTAT:
+ STACK_UNWIND_STRICT(fstat,
+ frame,
+ op_ret,
+ op_errno,
+ op_ret >= 0 ? &local->buf : NULL,
+ local->xdata);
+ break;
+ case GF_FOP_STAT:
+ STACK_UNWIND_STRICT(stat,
+ frame,
+ op_ret,
+ op_errno,
+ op_ret >= 0 ? &local->buf : NULL,
+ local->xdata);
+ break;
+ case GF_FOP_LOOKUP:
+ STACK_UNWIND_STRICT(lookup,
+ frame,
+ op_ret,
+ op_errno,
+ op_ret >= 0 ? local->inode : NULL,
+ op_ret >= 0 ? &local->buf : NULL,
+ local->xdata,
+ op_ret >= 0 ? &local->postbuf : NULL);
+ break;
+ case GF_FOP_READ:
+ STACK_UNWIND_STRICT(readv,
+ frame,
+ op_ret,
+ op_errno,
+ NULL,
+ 0,
+ op_ret >= 0 ? &local->buf : NULL,
+ NULL,
+ NULL);
+ break;
+ default:
+ gf_log(this->name, GF_LOG_WARNING,
+ "Improper file operation %d", local->fop);
+ }
+ if (local_xdata)
+ dict_unref(local_xdata);
+ if (local_inode)
+ inode_unref(local_inode);
+ return 0;
+}
+
+static int32_t crypt_stat_common_cbk(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *buf, dict_t *xdata)
+{
+ crypt_local_t *local = frame->local;
+
+ if (op_ret < 0)
+ goto unwind;
+ if (!IA_ISREG(buf->ia_type))
+ goto unwind;
+
+ local->buf = *buf;
+ if (xdata)
+ local->xdata = dict_ref(xdata);
+
+ switch (local->fop) {
+ case GF_FOP_FSTAT:
+ STACK_WIND(frame,
+ load_file_size,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr,
+ local->fd,
+ FSIZE_XATTR_PREFIX,
+ NULL);
+ break;
+ case GF_FOP_STAT:
+ STACK_WIND(frame,
+ load_file_size,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr,
+ local->loc,
+ FSIZE_XATTR_PREFIX,
+ NULL);
+ break;
+ default:
+ gf_log (this->name, GF_LOG_WARNING,
+ "Improper file operation %d", local->fop);
+ }
+ return 0;
+ unwind:
+ if (local->fd)
+ fd_unref(local->fd);
+ if (local->loc) {
+ loc_wipe(local->loc);
+ GF_FREE(local->loc);
+ }
+ switch (local->fop) {
+ case GF_FOP_FSTAT:
+ STACK_UNWIND_STRICT(fstat,
+ frame,
+ op_ret,
+ op_errno,
+ op_ret >= 0 ? buf : NULL,
+ op_ret >= 0 ? xdata : NULL);
+ break;
+ case GF_FOP_STAT:
+ STACK_UNWIND_STRICT(stat,
+ frame,
+ op_ret,
+ op_errno,
+ op_ret >= 0 ? buf : NULL,
+ op_ret >= 0 ? xdata : NULL);
+ break;
+ default:
+ gf_log (this->name, GF_LOG_WARNING,
+ "Improper file operation %d", local->fop);
+ }
+ return 0;
+}
+
+static int32_t crypt_fstat(call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd, dict_t *xdata)
+{
+ crypt_local_t *local;
+
+ local = crypt_alloc_local(frame, this, GF_FOP_FSTAT);
+ if (!local)
+ goto error;
+ local->fd = fd_ref(fd);
+ STACK_WIND(frame,
+ crypt_stat_common_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat,
+ fd,
+ xdata);
+ return 0;
+ error:
+ STACK_UNWIND_STRICT(fstat,
+ frame,
+ -1,
+ ENOMEM,
+ NULL,
+ NULL);
+ return 0;
+}
+
+static int32_t crypt_stat(call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc, dict_t *xdata)
+{
+ int32_t ret;
+ crypt_local_t *local;
+
+ local = crypt_alloc_local(frame, this, GF_FOP_STAT);
+ if (!local)
+ goto error;
+ local->loc = GF_CALLOC(1, sizeof(*loc), gf_crypt_mt_loc);
+ if (!local->loc)
+ goto error;
+ memset(local->loc, 0, sizeof(*local->loc));
+ ret = loc_copy(local->loc, loc);
+ if (ret) {
+ GF_FREE(local->loc);
+ goto error;
+ }
+ STACK_WIND(frame,
+ crypt_stat_common_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->stat,
+ loc,
+ xdata);
+ return 0;
+ error:
+ STACK_UNWIND_STRICT(stat,
+ frame,
+ -1,
+ ENOMEM,
+ NULL,
+ NULL);
+ return 0;
+}
+
+static int32_t crypt_lookup_cbk(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ inode_t *inode,
+ struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+{
+ crypt_local_t *local = frame->local;
+
+ if (op_ret < 0)
+ goto unwind;
+ if (!IA_ISREG(buf->ia_type))
+ goto unwind;
+
+ local->inode = inode_ref(inode);
+ local->buf = *buf;
+ local->postbuf = *postparent;
+ if (xdata)
+ local->xdata = dict_ref(xdata);
+ uuid_copy(local->loc->gfid, buf->ia_gfid);
+
+ STACK_WIND(frame,
+ load_file_size,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr,
+ local->loc,
+ FSIZE_XATTR_PREFIX,
+ NULL);
+ return 0;
+ unwind:
+ loc_wipe(local->loc);
+ GF_FREE(local->loc);
+ STACK_UNWIND_STRICT(lookup,
+ frame,
+ op_ret,
+ op_errno,
+ inode,
+ buf,
+ xdata,
+ postparent);
+ return 0;
+}
+
+static int32_t crypt_lookup(call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc, dict_t *xdata)
+{
+ int32_t ret;
+ crypt_local_t *local;
+
+ local = crypt_alloc_local(frame, this, GF_FOP_LOOKUP);
+ if (!local)
+ goto error;
+ local->loc = GF_CALLOC(1, sizeof(*loc), gf_crypt_mt_loc);
+ if (!local->loc)
+ goto error;
+ memset(local->loc, 0, sizeof(*local->loc));
+ ret = loc_copy(local->loc, loc);
+ if (ret) {
+ GF_FREE(local->loc);
+ goto error;
+ }
+ gf_log(this->name, GF_LOG_DEBUG, "Lookup %s", loc->path);
+ STACK_WIND(frame,
+ crypt_lookup_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup,
+ loc,
+ xdata);
+ return 0;
+ error:
+ STACK_UNWIND_STRICT(lookup,
+ frame,
+ -1,
+ ENOMEM,
+ NULL,
+ NULL,
+ NULL,
+ NULL);
+ return 0;
+}
+
+/*
+ * for every regular directory entry find its real file size
+ * and update stat's buf properly
+ */
+static int32_t crypt_readdirp_cbk(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ gf_dirent_t *entries, dict_t *xdata)
+{
+ gf_dirent_t *entry = NULL;
+
+ if (op_ret < 0)
+ goto unwind;
+
+ list_for_each_entry (entry, (&entries->list), list) {
+ data_t *data;
+
+ if (!IA_ISREG(entry->d_stat.ia_type))
+ continue;
+ data = dict_get(entry->dict, FSIZE_XATTR_PREFIX);
+ if (!data){
+ gf_log("crypt", GF_LOG_WARNING,
+ "Regular file size of direntry not found");
+ op_errno = EIO;
+ op_ret = -1;
+ break;
+ }
+ entry->d_stat.ia_size = data_to_uint64(data);
+ }
+ unwind:
+ STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata);
+ return 0;
+}
+
+/*
+ * ->readdirp() fills in-core inodes, so we need to set proper
+ * file sizes for all directory entries of the parent @fd.
+ * Actual updates take place in ->crypt_readdirp_cbk()
+ */
+static int32_t crypt_readdirp(call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset,
+ dict_t *xdata)
+{
+ int32_t ret = ENOMEM;
+
+ if (!xdata) {
+ xdata = dict_new();
+ if (!xdata)
+ goto error;
+ }
+ else
+ dict_ref(xdata);
+ /*
+ * make sure that we'll have real file sizes at ->readdirp_cbk()
+ */
+ ret = dict_set(xdata, FSIZE_XATTR_PREFIX, data_from_uint64(0));
+ if (ret) {
+ dict_unref(xdata);
+ goto error;
+ }
+ STACK_WIND(frame,
+ crypt_readdirp_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp,
+ fd,
+ size,
+ offset,
+ xdata);
+ dict_unref(xdata);
+ return 0;
+ error:
+ STACK_UNWIND_STRICT(readdirp, frame, -1, ret, NULL, NULL);
+ return 0;
+}
+
+static int32_t crypt_access(call_frame_t *frame,
+ xlator_t *this,
+ loc_t *loc,
+ int32_t mask, dict_t *xdata)
+{
+ gf_log(this->name, GF_LOG_WARNING,
+ "NFS mounts of encrypted volumes are unsupported");
+ STACK_UNWIND_STRICT(access, frame, -1, EPERM, NULL);
+ return 0;
+}
+
+int32_t master_set_block_size (xlator_t *this, crypt_private_t *priv,
+ dict_t *options)
+{
+ uint64_t block_size = 0;
+ struct master_cipher_info *master = get_master_cinfo(priv);
+
+ if (options != NULL)
+ GF_OPTION_RECONF("block-size", block_size, options,
+ size_uint64, error);
+ else
+ GF_OPTION_INIT("block-size", block_size, size_uint64, error);
+
+ switch (block_size) {
+ case 512:
+ master->m_block_bits = 9;
+ break;
+ case 1024:
+ master->m_block_bits = 10;
+ break;
+ case 2048:
+ master->m_block_bits = 11;
+ break;
+ case 4096:
+ master->m_block_bits = 12;
+ break;
+ default:
+ gf_log("crypt", GF_LOG_ERROR,
+ "FATAL: unsupported block size %llu",
+ (unsigned long long)block_size);
+ goto error;
+ }
+ return 0;
+ error:
+ return -1;
+}
+
+int32_t master_set_alg(xlator_t *this, crypt_private_t *priv)
+{
+ struct master_cipher_info *master = get_master_cinfo(priv);
+ master->m_alg = AES_CIPHER_ALG;
+ return 0;
+}
+
+int32_t master_set_mode(xlator_t *this, crypt_private_t *priv)
+{
+ struct master_cipher_info *master = get_master_cinfo(priv);
+ master->m_mode = XTS_CIPHER_MODE;
+ return 0;
+}
+
+/*
+ * set key size in bits to the master info
+ * Pre-conditions: cipher mode in the master info is uptodate.
+ */
+static int master_set_data_key_size (xlator_t *this, crypt_private_t *priv,
+ dict_t *options)
+{
+ int32_t ret;
+ uint64_t key_size = 0;
+ struct master_cipher_info *master = get_master_cinfo(priv);
+
+ if (options != NULL)
+ GF_OPTION_RECONF("data-key-size", key_size, options,
+ uint64, error);
+ else
+ GF_OPTION_INIT("data-key-size", key_size, uint64, error);
+
+ ret = data_cipher_algs[master->m_alg][master->m_mode].check_key(key_size);
+ if (ret) {
+ gf_log("crypt", GF_LOG_ERROR,
+ "FATAL: wrong bin key size %llu for alg %d mode %d",
+ (unsigned long long)key_size,
+ (int)master->m_alg,
+ (int)master->m_mode);
+ goto error;
+ }
+ master->m_dkey_size = key_size;
+ return 0;
+ error:
+ return -1;
+}
+
+static int is_hex(char *s) {
+ return ('0' <= *s && *s <= '9') || ('a' <= *s && *s <= 'f');
+}
+
+static int parse_hex_buf(xlator_t *this, char *src, unsigned char *dst,
+ int hex_size)
+{
+ int i;
+ int hex_byte = 0;
+
+ for (i = 0; i < (hex_size / 2); i++) {
+ if (!is_hex(src + i*2) || !is_hex(src + i*2 + 1)) {
+ gf_log("crypt", GF_LOG_ERROR,
+ "FATAL: not hex symbol in key");
+ return -1;
+ }
+ if (sscanf(src + i*2, "%2x", &hex_byte) != 1) {
+ gf_log("crypt", GF_LOG_ERROR,
+ "FATAL: can not parse hex key");
+ return -1;
+ }
+ dst[i] = hex_byte & 0xff;
+ }
+ return 0;
+}
+
+/*
+ * Parse options;
+ * install master volume key
+ */
+int32_t master_set_master_vol_key(xlator_t *this, crypt_private_t *priv)
+{
+ int32_t ret;
+ FILE *file = NULL;
+
+ int32_t key_size;
+ char *opt_key_file_pathname = NULL;
+
+ unsigned char bin_buf[MASTER_VOL_KEY_SIZE];
+ char hex_buf[2 * MASTER_VOL_KEY_SIZE];
+
+ struct master_cipher_info *master = get_master_cinfo(priv);
+ /*
+ * extract master key passed via option
+ */
+ GF_OPTION_INIT("master-key", opt_key_file_pathname, path, bad_key);
+
+ if (!opt_key_file_pathname) {
+ gf_log(this->name, GF_LOG_ERROR, "FATAL: missing master key");
+ return -1;
+ }
+ gf_log(this->name, GF_LOG_DEBUG, "handling file key %s",
+ opt_key_file_pathname);
+
+ file = fopen(opt_key_file_pathname, "r");
+ if (file == NULL) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "FATAL: can not open file with master key");
+ return -1;
+ }
+ /*
+ * extract hex key
+ */
+ key_size = fread(hex_buf, 1, sizeof(hex_buf), file);
+ if (key_size < sizeof(hex_buf)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "FATAL: master key is too short");
+ goto bad_key;
+ }
+ ret = parse_hex_buf(this, hex_buf, bin_buf, key_size);
+ if (ret)
+ goto bad_key;
+ memcpy(master->m_key, bin_buf, MASTER_VOL_KEY_SIZE);
+ memset(hex_buf, 0, sizeof(hex_buf));
+ fclose(file);
+
+ memset(bin_buf, 0, sizeof(bin_buf));
+ return 0;
+ bad_key:
+ gf_log(this->name, GF_LOG_ERROR, "FATAL: bad master key");
+ if (file)
+ fclose(file);
+ memset(bin_buf, 0, sizeof(bin_buf));
+ return -1;
+}
+
+/*
+ * Derive volume key for object-id authentication
+ */
+int32_t master_set_nmtd_vol_key(xlator_t *this, crypt_private_t *priv)
+{
+ return get_nmtd_vol_key(get_master_cinfo(priv));
+}
+
+int32_t crypt_init_xlator(xlator_t *this)
+{
+ int32_t ret;
+ crypt_private_t *priv = this->private;
+
+ ret = master_set_alg(this, priv);
+ if (ret)
+ return ret;
+ ret = master_set_mode(this, priv);
+ if (ret)
+ return ret;
+ ret = master_set_block_size(this, priv, NULL);
+ if (ret)
+ return ret;
+ ret = master_set_data_key_size(this, priv, NULL);
+ if (ret)
+ return ret;
+ ret = master_set_master_vol_key(this, priv);
+ if (ret)
+ return ret;
+ return master_set_nmtd_vol_key(this, priv);
+}
+
+static int32_t crypt_alloc_private(xlator_t *this)
+{
+ this->private = GF_CALLOC(1, sizeof(crypt_private_t), gf_crypt_mt_priv);
+ if (!this->private) {
+ gf_log("crypt", GF_LOG_ERROR,
+ "Can not allocate memory for private data");
+ return ENOMEM;
+ }
+ return 0;
+}
+
+static void crypt_free_private(xlator_t *this)
+{
+ crypt_private_t *priv = this->private;
+ if (priv) {
+ memset(priv, 0, sizeof(*priv));
+ GF_FREE(priv);
+ }
+}
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init (this, gf_crypt_mt_end);
+
+ if (ret != 0) {
+ gf_log(this->name, GF_LOG_ERROR, "Memory accounting init"
+ "failed");
+ return ret;
+ }
+
+ return ret;
+}
+
+int32_t reconfigure (xlator_t *this, dict_t *options)
+{
+ int32_t ret = -1;
+ crypt_private_t *priv = NULL;
+
+ GF_VALIDATE_OR_GOTO ("crypt", this, error);
+ GF_VALIDATE_OR_GOTO (this->name, this->private, error);
+ GF_VALIDATE_OR_GOTO (this->name, options, error);
+
+ priv = this->private;
+
+ ret = master_set_block_size(this, priv, options);
+ if (ret) {
+ gf_log("this->name", GF_LOG_ERROR,
+ "Failed to reconfure block size");
+ goto error;
+ }
+ ret = master_set_data_key_size(this, priv, options);
+ if (ret) {
+ gf_log("this->name", GF_LOG_ERROR,
+ "Failed to reconfure data key size");
+ goto error;
+ }
+ return 0;
+ error:
+ return ret;
+}
+
+int32_t init(xlator_t *this)
+{
+ int32_t ret;
+
+ if (!this->children || this->children->next) {
+ gf_log ("crypt", GF_LOG_ERROR,
+ "FATAL: crypt should have exactly one child");
+ return EINVAL;
+ }
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
+ }
+ ret = crypt_alloc_private(this);
+ if (ret)
+ return ret;
+ ret = crypt_init_xlator(this);
+ if (ret)
+ goto error;
+ this->local_pool = mem_pool_new(crypt_local_t, 64);
+ if (!this->local_pool) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to create local_t's memory pool");
+ ret = ENOMEM;
+ goto error;
+ }
+ gf_log ("crypt", GF_LOG_INFO, "crypt xlator loaded");
+ return 0;
+ error:
+ crypt_free_private(this);
+ return ret;
+}
+
+void fini (xlator_t *this)
+{
+ crypt_free_private(this);
+}
+
+struct xlator_fops fops = {
+ .readv = crypt_readv,
+ .writev = crypt_writev,
+ .truncate = crypt_truncate,
+ .ftruncate = crypt_ftruncate,
+ .setxattr = crypt_setxattr,
+ .fsetxattr = crypt_fsetxattr,
+ .link = crypt_link,
+ .unlink = crypt_unlink,
+ .rename = crypt_rename,
+ .open = crypt_open,
+ .create = crypt_create,
+ .stat = crypt_stat,
+ .fstat = crypt_fstat,
+ .lookup = crypt_lookup,
+ .readdirp = crypt_readdirp,
+ .access = crypt_access
+};
+
+struct xlator_cbks cbks = {
+ .forget = crypt_forget
+};
+
+struct volume_options options[] = {
+ { .key = {"master-key"},
+ .type = GF_OPTION_TYPE_PATH,
+ .description = "Pathname of regular file which contains master volume key"
+ },
+ { .key = {"data-key-size"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .description = "Data key size (bits)",
+ .min = 256,
+ .max = 512,
+ .default_value = "256",
+ },
+ { .key = {"block-size"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .description = "Atom size (bits)",
+ .min = 512,
+ .max = 4096,
+ .default_value = "4096"
+ },
+ { .key = {NULL} },
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff --git a/xlators/encryption/crypt/src/crypt.h b/xlators/encryption/crypt/src/crypt.h
new file mode 100644
index 000000000..eb7291f13
--- /dev/null
+++ b/xlators/encryption/crypt/src/crypt.h
@@ -0,0 +1,908 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CRYPT_H__
+#define __CRYPT_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+#include <openssl/aes.h>
+#include <openssl/evp.h>
+#include <openssl/sha.h>
+#include <openssl/hmac.h>
+#include <openssl/cmac.h>
+#include <openssl/modes.h>
+#include "crypt-mem-types.h"
+#include "compat.h"
+
+#define CRYPT_XLATOR_ID (0)
+
+#define MAX_IOVEC_BITS (3)
+#define MAX_IOVEC (1 << MAX_IOVEC_BITS)
+#define KEY_FACTOR_BITS (6)
+
+#define DEBUG_CRYPT (0)
+#define TRIVIAL_TFM (0)
+
+#define CRYPT_MIN_BLOCK_BITS (9)
+#define CRYPT_MAX_BLOCK_BITS (12)
+
+#define MASTER_VOL_KEY_SIZE (32)
+#define NMTD_VOL_KEY_SIZE (16)
+
+#if defined(__NetBSD__)
+typedef off_t loff_t;
+#endif
+
+#if defined(GF_DARWIN_HOST_OS)
+typedef uint64_t loff_t;
+#endif
+
+struct crypt_key {
+ uint32_t len;
+ const char *label;
+};
+
+/*
+ * Add new key types to the end of this
+ * enumeration but before LAST_KEY_TYPE
+ */
+typedef enum {
+ MASTER_VOL_KEY,
+ NMTD_VOL_KEY,
+ NMTD_LINK_KEY,
+ EMTD_FILE_KEY,
+ DATA_FILE_KEY_256,
+ DATA_FILE_KEY_512,
+ LAST_KEY_TYPE
+}crypt_key_type;
+
+struct kderive_context {
+ const unsigned char *pkey;/* parent key */
+ uint32_t pkey_len; /* parent key size, bits */
+ uint32_t ckey_len; /* child key size, bits */
+ unsigned char *fid; /* fixed input data, NIST 800-108, 5.1 */
+ uint32_t fid_len; /* fid len, bytes */
+ unsigned char *out; /* contains child keying material */
+ uint32_t out_len; /* out len, bytes */
+};
+
+typedef enum {
+ DATA_ATOM,
+ HOLE_ATOM,
+ LAST_DATA_TYPE
+}atom_data_type;
+
+typedef enum {
+ HEAD_ATOM,
+ TAIL_ATOM,
+ FULL_ATOM,
+ LAST_LOCALITY_TYPE
+}atom_locality_type;
+
+typedef enum {
+ MTD_CREATE,
+ MTD_APPEND,
+ MTD_OVERWRITE,
+ MTD_CUT,
+ MTD_LAST_OP
+} mtd_op_t;
+
+struct xts128_context {
+ void *key1, *key2;
+ block128_f block1,block2;
+};
+
+struct object_cipher_info {
+ cipher_alg_t o_alg;
+ cipher_mode_t o_mode;
+ uint32_t o_block_bits;
+ uint32_t o_dkey_size; /* raw data key size in bits */
+ union {
+ struct {
+ unsigned char ivec[16];
+ AES_KEY dkey[2];
+ AES_KEY tkey; /* key used for tweaking */
+ XTS128_CONTEXT xts;
+ } aes_xts;
+ } u;
+};
+
+struct master_cipher_info {
+ /*
+ * attributes inherited by newly created regular files
+ */
+ cipher_alg_t m_alg;
+ cipher_mode_t m_mode;
+ uint32_t m_block_bits;
+ uint32_t m_dkey_size; /* raw key size in bits */
+ /*
+ * master key
+ */
+ unsigned char m_key[MASTER_VOL_KEY_SIZE];
+ /*
+ * volume key for oid authentication
+ */
+ unsigned char m_nmtd_key[NMTD_VOL_KEY_SIZE];
+};
+
+/*
+* This info is not changed during file's life
+ */
+struct crypt_inode_info {
+#if DEBUG_CRYPT
+ loc_t *loc; /* pathname that the file has been
+ opened, or created with */
+#endif
+ uint16_t nr_minor;
+ uuid_t oid;
+ struct object_cipher_info cinfo;
+};
+
+/*
+ * this should locate in secure memory
+ */
+typedef struct {
+ struct master_cipher_info master;
+} crypt_private_t;
+
+static inline struct master_cipher_info *get_master_cinfo(crypt_private_t *priv)
+{
+ return &priv->master;
+}
+
+static inline struct object_cipher_info *get_object_cinfo(struct crypt_inode_info
+ *info)
+{
+ return &info->cinfo;
+}
+
+/*
+ * this describes layouts and properties
+ * of atoms in an aligned vector
+ */
+struct avec_config {
+ uint32_t atom_size;
+ atom_data_type type;
+ size_t orig_size;
+ off_t orig_offset;
+ size_t expanded_size;
+ off_t aligned_offset;
+
+ uint32_t off_in_head;
+ uint32_t off_in_tail;
+ uint32_t gap_in_tail;
+ uint32_t nr_full_blocks;
+
+ struct iovec *avec; /* aligned vector */
+ uint32_t acount; /* number of avec components. The same
+ * as number of occupied logical blocks */
+ char **pool;
+ uint32_t blocks_in_pool;
+ uint32_t cursor; /* makes sense only for ordered writes,
+ * so there is no races on this counter.
+ *
+ * Cursor is per-config object, we don't
+ * reset cursor for atoms of different
+ * localities (head, tail, full)
+ */
+};
+
+
+typedef struct {
+ glusterfs_fop_t fop; /* code of FOP this local info built for */
+ fd_t *fd;
+ inode_t *inode;
+ loc_t *loc;
+ int32_t mac_idx;
+ loc_t *newloc;
+ int32_t flags;
+ int32_t wbflags;
+ struct crypt_inode_info *info;
+ struct iobref *iobref;
+ struct iobref *iobref_data;
+ off_t offset;
+
+ uint64_t old_file_size; /* per FOP, retrieved under lock held */
+ uint64_t cur_file_size; /* per iteration, before issuing IOs */
+ uint64_t new_file_size; /* per iteration, after issuing IOs */
+
+ uint64_t io_offset; /* offset of IOs issued per iteration */
+ uint64_t io_offset_nopad; /* offset of user's data in the atom */
+ uint32_t io_size; /* size of IOs issued per iteration */
+ uint32_t io_size_nopad; /* size of user's data in the IOs */
+ uint32_t eof_padding_size; /* size od EOF padding in the IOs */
+
+ gf_lock_t call_lock; /* protect nr_calls from many cbks */
+ int32_t nr_calls;
+
+ atom_data_type active_setup; /* which setup (hole or date)
+ is currently active */
+ /* data setup */
+ struct avec_config data_conf;
+
+ /* hole setup */
+ int hole_conv_in_proggress;
+ gf_lock_t hole_lock; /* protect hole config from many cbks */
+ int hole_handled;
+ struct avec_config hole_conf;
+ struct iatt buf;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ struct iatt *prenewparent;
+ struct iatt *postnewparent;
+ int32_t op_ret;
+ int32_t op_errno;
+ int32_t rw_count; /* total read or written */
+ gf_lock_t rw_count_lock; /* protect the counter above */
+ unsigned char *format; /* for create, update format string */
+ uint32_t format_size;
+ uint32_t msgflags; /* messages for crypt_open() */
+ dict_t *xdata;
+ dict_t *xattr;
+ struct iovec vec; /* contains last file's atom for
+ read-prune-write sequence */
+ gf_boolean_t custom_mtd;
+ /*
+ * the next 3 fields are used by readdir and friends
+ */
+ gf_dirent_t *de; /* directory entry */
+ char *de_path; /* pathname of directory entry */
+ uint32_t de_prefix_len; /* lenght of the parent's pathname */
+ gf_dirent_t *entries;
+
+ uint32_t update_disk_file_size:1;
+} crypt_local_t;
+
+/* This represents a (read)modify-write atom */
+struct rmw_atom {
+ atom_locality_type locality;
+ /*
+ * read-modify-write sequence of the atom
+ */
+ int32_t (*rmw)(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iovec *vec,
+ int32_t count,
+ struct iatt *stbuf,
+ struct iobref *iobref,
+ dict_t *xdata);
+ /*
+ * offset of the logical block in a file
+ */
+ loff_t (*offset_at)(call_frame_t *frame,
+ struct object_cipher_info *object);
+ /*
+ * IO offset in an atom
+ */
+ uint32_t (*offset_in)(call_frame_t *frame,
+ struct object_cipher_info *object);
+ /*
+ * number of bytes of plain text of this atom that user
+ * wants to read/write.
+ * It can be smaller than atom_size in the case of head
+ * or tail atoms.
+ */
+ uint32_t (*io_size_nopad)(call_frame_t *frame,
+ struct object_cipher_info *object);
+ /*
+ * which iovec represents the atom
+ */
+ struct iovec *(*get_iovec)(call_frame_t *frame, uint32_t count);
+ /*
+ * how many bytes of partial block should be uptodated by
+ * reading from disk.
+ * This is used to perform a read component of RMW (read-modify-write).
+ */
+ uint32_t (*count_to_uptodate)(call_frame_t *frame, struct object_cipher_info *object);
+ struct avec_config *(*get_config)(call_frame_t *frame);
+};
+
+struct data_cipher_alg {
+ gf_boolean_t atomic; /* true means that algorithm requires
+ to pad data before cipher transform */
+ gf_boolean_t should_pad; /* true means that algorithm requires
+ to pad the end of file with extra-data */
+ uint32_t blkbits; /* blksize = 1 << blkbits */
+ /*
+ * any preliminary sanity checks goes here
+ */
+ int32_t (*init)(void);
+ /*
+ * set alg-mode specific inode info
+ */
+ int32_t (*set_private)(struct crypt_inode_info *info,
+ struct master_cipher_info *master);
+ /*
+ * check alg-mode specific data key
+ */
+ int32_t (*check_key)(uint32_t key_size);
+ void (*set_iv)(off_t offset, struct object_cipher_info *object);
+ int32_t (*encrypt)(const unsigned char *from, unsigned char *to,
+ size_t length, off_t offset, const int enc,
+ struct object_cipher_info *object);
+};
+
+/*
+ * version-dependent metadata loader
+ */
+struct crypt_mtd_loader {
+ /*
+ * return core format size
+ */
+ size_t (*format_size)(mtd_op_t op, size_t old_size);
+ /*
+ * pack version-specific metadata of an object
+ * at ->create()
+ */
+ int32_t (*create_format)(unsigned char *wire,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master);
+ /*
+ * extract version-specific metadata of an object
+ * at ->open() time
+ */
+ int32_t (*open_format)(unsigned char *wire,
+ int32_t len,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ crypt_local_t *local,
+ gf_boolean_t load_info);
+ int32_t (*update_format)(unsigned char *new,
+ unsigned char *old,
+ size_t old_len,
+ int32_t mac_idx,
+ mtd_op_t op,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ crypt_local_t *local);
+};
+
+typedef int32_t (*end_writeback_handler_t)(call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *prebuf,
+ struct iatt *postbuf,
+ dict_t *xdata);
+typedef void (*linkop_wind_handler_t)(call_frame_t *frame, xlator_t *this);
+typedef void (*linkop_unwind_handler_t)(call_frame_t *frame);
+
+
+/* Declarations */
+
+/* keys.c */
+extern struct crypt_key crypt_keys[LAST_KEY_TYPE];
+int32_t get_nmtd_vol_key(struct master_cipher_info *master);
+int32_t get_nmtd_link_key(loc_t *loc,
+ struct master_cipher_info *master,
+ unsigned char *result);
+int32_t get_emtd_file_key(struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ unsigned char *result);
+int32_t get_data_file_key(struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ uint32_t keysize,
+ unsigned char *key);
+/* data.c */
+extern struct data_cipher_alg data_cipher_algs[LAST_CIPHER_ALG][LAST_CIPHER_MODE];
+void encrypt_aligned_iov(struct object_cipher_info *object,
+ struct iovec *vec,
+ int count,
+ off_t off);
+void decrypt_aligned_iov(struct object_cipher_info *object,
+ struct iovec *vec,
+ int count,
+ off_t off);
+int32_t align_iov_by_atoms(xlator_t *this,
+ crypt_local_t *local,
+ struct object_cipher_info *object,
+ struct iovec *vec /* input vector */,
+ int32_t count /* number of vec components */,
+ struct iovec *avec /* aligned vector */,
+ char **blocks /* pool of blocks */,
+ uint32_t *blocks_allocated,
+ struct avec_config *conf);
+int32_t set_config_avec_data(xlator_t *this,
+ crypt_local_t *local,
+ struct avec_config *conf,
+ struct object_cipher_info *object,
+ struct iovec *vec,
+ int32_t vec_count);
+int32_t set_config_avec_hole(xlator_t *this,
+ crypt_local_t *local,
+ struct avec_config *conf,
+ struct object_cipher_info *object,
+ glusterfs_fop_t fop);
+void set_gap_at_end(call_frame_t *frame, struct object_cipher_info *object,
+ struct avec_config *conf, atom_data_type dtype);
+void set_config_offsets(call_frame_t *frame,
+ xlator_t *this,
+ uint64_t offset,
+ uint64_t count,
+ atom_data_type dtype,
+ int32_t setup_gap_in_tail);
+
+/* metadata.c */
+extern struct crypt_mtd_loader mtd_loaders [LAST_MTD_LOADER];
+
+int32_t alloc_format(crypt_local_t *local, size_t size);
+int32_t alloc_format_create(crypt_local_t *local);
+void free_format(crypt_local_t *local);
+size_t format_size(mtd_op_t op, size_t old_size);
+size_t new_format_size(void);
+int32_t open_format(unsigned char *str, int32_t len, loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master, crypt_local_t *local,
+ gf_boolean_t load_info);
+int32_t update_format(unsigned char *new, unsigned char *old,
+ size_t old_len, int32_t mac_idx, mtd_op_t op, loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ crypt_local_t *local);
+int32_t create_format(unsigned char *wire,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master);
+
+/* atom.c */
+struct rmw_atom *atom_by_types(atom_data_type data,
+ atom_locality_type locality);
+void submit_partial(call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ atom_locality_type ltype);
+void submit_full(call_frame_t *frame, xlator_t *this);
+
+/* crypt.c */
+
+end_writeback_handler_t dispatch_end_writeback(glusterfs_fop_t fop);
+static size_t iovec_get_size(struct iovec *vec, uint32_t count);
+void set_local_io_params_writev(call_frame_t *frame,
+ struct object_cipher_info *object,
+ struct rmw_atom *atom, off_t io_offset,
+ uint32_t io_size);
+void link_wind(call_frame_t *frame, xlator_t *this);
+void unlink_wind(call_frame_t *frame, xlator_t *this);
+void link_unwind(call_frame_t *frame);
+void unlink_unwind(call_frame_t *frame);
+void rename_wind(call_frame_t *frame, xlator_t *this);
+void rename_unwind(call_frame_t *frame);
+
+/* Inline functions */
+
+static inline size_t iovec_get_size(struct iovec *vec, uint32_t count)
+{
+ int i;
+ size_t size = 0;
+ for (i = 0; i < count; i++)
+ size += vec[i].iov_len;
+ return size;
+}
+
+static inline int32_t crypt_xlator_id(void)
+{
+ return CRYPT_XLATOR_ID;
+}
+
+static inline mtd_loader_id current_mtd_loader(void)
+{
+ return MTD_LOADER_V1;
+}
+
+static inline uint32_t master_key_size (void)
+{
+ return crypt_keys[MASTER_VOL_KEY].len >> 3;
+}
+
+static inline uint32_t nmtd_vol_key_size (void)
+{
+ return crypt_keys[NMTD_VOL_KEY].len >> 3;
+}
+
+static inline uint32_t alg_mode_blkbits(cipher_alg_t alg,
+ cipher_mode_t mode)
+{
+ return data_cipher_algs[alg][mode].blkbits;
+}
+
+static inline uint32_t alg_mode_blksize(cipher_alg_t alg,
+ cipher_mode_t mode)
+{
+ return 1 << alg_mode_blkbits(alg, mode);
+}
+
+static inline gf_boolean_t alg_mode_atomic(cipher_alg_t alg,
+ cipher_mode_t mode)
+{
+ return data_cipher_algs[alg][mode].atomic;
+}
+
+static inline gf_boolean_t alg_mode_should_pad(cipher_alg_t alg,
+ cipher_mode_t mode)
+{
+ return data_cipher_algs[alg][mode].should_pad;
+}
+
+static inline uint32_t master_alg_blksize(struct master_cipher_info *mr)
+{
+ return alg_mode_blksize(mr->m_alg, mr->m_mode);
+}
+
+static inline uint32_t master_alg_blkbits(struct master_cipher_info *mr)
+{
+ return alg_mode_blkbits(mr->m_alg, mr->m_mode);
+}
+
+static inline gf_boolean_t master_alg_atomic(struct master_cipher_info *mr)
+{
+ return alg_mode_atomic(mr->m_alg, mr->m_mode);
+}
+
+static inline gf_boolean_t master_alg_should_pad(struct master_cipher_info *mr)
+{
+ return alg_mode_should_pad(mr->m_alg, mr->m_mode);
+}
+
+static inline uint32_t object_alg_blksize(struct object_cipher_info *ob)
+{
+ return alg_mode_blksize(ob->o_alg, ob->o_mode);
+}
+
+static inline uint32_t object_alg_blkbits(struct object_cipher_info *ob)
+{
+ return alg_mode_blkbits(ob->o_alg, ob->o_mode);
+}
+
+static inline gf_boolean_t object_alg_atomic(struct object_cipher_info *ob)
+{
+ return alg_mode_atomic(ob->o_alg, ob->o_mode);
+}
+
+static inline gf_boolean_t object_alg_should_pad(struct object_cipher_info *ob)
+{
+ return alg_mode_should_pad(ob->o_alg, ob->o_mode);
+}
+
+static inline uint32_t aes_raw_key_size(struct master_cipher_info *master)
+{
+ return master->m_dkey_size >> 3;
+}
+
+static inline struct avec_config *get_hole_conf(call_frame_t *frame)
+{
+ return &(((crypt_local_t *)frame->local)->hole_conf);
+}
+
+static inline struct avec_config *get_data_conf(call_frame_t *frame)
+{
+ return &(((crypt_local_t *)frame->local)->data_conf);
+}
+
+static inline int32_t get_atom_bits (struct object_cipher_info *object)
+{
+ return object->o_block_bits;
+}
+
+static inline int32_t get_atom_size (struct object_cipher_info *object)
+{
+ return 1 << get_atom_bits(object);
+}
+
+static inline int32_t has_head_block(struct avec_config *conf)
+{
+ return conf->off_in_head ||
+ (conf->acount == 1 && conf->off_in_tail);
+}
+
+static inline int32_t has_tail_block(struct avec_config *conf)
+{
+ return conf->off_in_tail && conf->acount > 1;
+}
+
+static inline int32_t has_full_blocks(struct avec_config *conf)
+{
+ return conf->nr_full_blocks;
+}
+
+static inline int32_t should_submit_head_block(struct avec_config *conf)
+{
+ return has_head_block(conf) && (conf->cursor == 0);
+}
+
+static inline int32_t should_submit_tail_block(struct avec_config *conf)
+{
+ return has_tail_block(conf) && (conf->cursor == conf->acount - 1);
+}
+
+static inline int32_t should_submit_full_block(struct avec_config *conf)
+{
+ uint32_t start = has_head_block(conf) ? 1 : 0;
+
+ return has_full_blocks(conf) &&
+ conf->cursor >= start &&
+ conf->cursor < start + conf->nr_full_blocks;
+}
+
+#if DEBUG_CRYPT
+static inline void crypt_check_input_len(size_t len,
+ struct object_cipher_info *object)
+{
+ if (object_alg_should_pad(object) && (len & (object_alg_blksize(object) - 1)))
+ gf_log ("crypt", GF_LOG_DEBUG, "bad input len: %d", (int)len);
+}
+
+static inline void check_head_block(struct avec_config *conf)
+{
+ if (!has_head_block(conf))
+ gf_log("crypt", GF_LOG_DEBUG, "not a head atom");
+}
+
+static inline void check_tail_block(struct avec_config *conf)
+{
+ if (!has_tail_block(conf))
+ gf_log("crypt", GF_LOG_DEBUG, "not a tail atom");
+}
+
+static inline void check_full_block(struct avec_config *conf)
+{
+ if (!has_full_blocks(conf))
+ gf_log("crypt", GF_LOG_DEBUG, "not a full atom");
+}
+
+static inline void check_cursor_head(struct avec_config *conf)
+{
+ if (!has_head_block(conf))
+ gf_log("crypt",
+ GF_LOG_DEBUG, "Illegal call of head atom method");
+ else if (conf->cursor != 0)
+ gf_log("crypt",
+ GF_LOG_DEBUG, "Cursor (%d) is not at head atom",
+ conf->cursor);
+}
+
+static inline void check_cursor_full(struct avec_config *conf)
+{
+ if (!has_full_blocks(conf))
+ gf_log("crypt",
+ GF_LOG_DEBUG, "Illegal call of full atom method");
+ if (has_head_block(conf) && (conf->cursor == 0))
+ gf_log("crypt",
+ GF_LOG_DEBUG, "Cursor is not at full atom");
+}
+
+/*
+ * FIXME: use avec->iov_len to check setup
+ */
+static inline int data_local_invariant(crypt_local_t *local)
+{
+ return 0;
+}
+
+#else
+#define crypt_check_input_len(len, object) noop
+#define check_head_block(conf) noop
+#define check_tail_block(conf) noop
+#define check_full_block(conf) noop
+#define check_cursor_head(conf) noop
+#define check_cursor_full(conf) noop
+
+#endif /* DEBUG_CRYPT */
+
+static inline struct avec_config *conf_by_type(call_frame_t *frame,
+ atom_data_type dtype)
+{
+ struct avec_config *conf = NULL;
+
+ switch (dtype) {
+ case HOLE_ATOM:
+ conf = get_hole_conf(frame);
+ break;
+ case DATA_ATOM:
+ conf = get_data_conf(frame);
+ break;
+ default:
+ gf_log("crypt", GF_LOG_DEBUG, "bad atom type");
+ }
+ return conf;
+}
+
+static inline uint32_t nr_calls_head(struct avec_config *conf)
+{
+ return has_head_block(conf) ? 1 : 0;
+}
+
+static inline uint32_t nr_calls_tail(struct avec_config *conf)
+{
+ return has_tail_block(conf) ? 1 : 0;
+}
+
+static inline uint32_t nr_calls_full(struct avec_config *conf)
+{
+ switch(conf->type) {
+ case HOLE_ATOM:
+ return has_full_blocks(conf);
+ case DATA_ATOM:
+ return has_full_blocks(conf) ?
+ logical_blocks_occupied(0,
+ conf->nr_full_blocks,
+ MAX_IOVEC_BITS) : 0;
+ default:
+ gf_log("crypt", GF_LOG_DEBUG, "bad atom data type");
+ return 0;
+ }
+}
+
+static inline uint32_t nr_calls(struct avec_config *conf)
+{
+ return nr_calls_head(conf) + nr_calls_tail(conf) + nr_calls_full(conf);
+}
+
+static inline uint32_t nr_calls_data(call_frame_t *frame)
+{
+ return nr_calls(get_data_conf(frame));
+}
+
+static inline uint32_t nr_calls_hole(call_frame_t *frame)
+{
+ return nr_calls(get_hole_conf(frame));
+}
+
+static inline void get_one_call_nolock(call_frame_t *frame)
+{
+ crypt_local_t *local = frame->local;
+
+ ++local->nr_calls;
+
+ //gf_log("crypt", GF_LOG_DEBUG, "get %d calls", 1);
+}
+
+static inline void get_one_call(call_frame_t *frame)
+{
+ crypt_local_t *local = frame->local;
+
+ LOCK(&local->call_lock);
+ get_one_call_nolock(frame);
+ UNLOCK(&local->call_lock);
+}
+
+static inline void get_nr_calls_nolock(call_frame_t *frame, int32_t nr)
+{
+ crypt_local_t *local = frame->local;
+
+ local->nr_calls += nr;
+
+ //gf_log("crypt", GF_LOG_DEBUG, "get %d calls", nr);
+}
+
+static inline void get_nr_calls(call_frame_t *frame, int32_t nr)
+{
+ crypt_local_t *local = frame->local;
+
+ LOCK(&local->call_lock);
+ get_nr_calls_nolock(frame, nr);
+ UNLOCK(&local->call_lock);
+}
+
+static inline int put_one_call(crypt_local_t *local)
+{
+ uint32_t last = 0;
+
+ LOCK(&local->call_lock);
+ if (--local->nr_calls == 0)
+ last = 1;
+
+ //gf_log("crypt", GF_LOG_DEBUG, "put %d calls", 1);
+
+ UNLOCK(&local->call_lock);
+ return last;
+}
+
+static inline int is_appended_write(call_frame_t *frame)
+{
+ crypt_local_t *local = frame->local;
+ struct avec_config *conf = get_data_conf(frame);
+
+ return conf->orig_offset + conf->orig_size > local->old_file_size;
+}
+
+static inline int is_ordered_mode(call_frame_t *frame)
+{
+#if 0
+ crypt_local_t *local = frame->local;
+ return local->fop == GF_FOP_FTRUNCATE ||
+ (local->fop == GF_FOP_WRITE && is_appended_write(frame));
+#endif
+ return 1;
+}
+
+static inline int32_t hole_conv_completed(crypt_local_t *local)
+{
+ struct avec_config *conf = &local->hole_conf;
+ return conf->cursor == conf->acount;
+}
+
+static inline int32_t data_write_in_progress(crypt_local_t *local)
+{
+ return local->active_setup == DATA_ATOM;
+}
+
+static inline int32_t parent_is_crypt_xlator(call_frame_t *frame,
+ xlator_t *this)
+{
+ return frame->parent->this == this;
+}
+
+static inline linkop_wind_handler_t linkop_wind_dispatch(glusterfs_fop_t fop)
+{
+ switch(fop){
+ case GF_FOP_LINK:
+ return link_wind;
+ case GF_FOP_UNLINK:
+ return unlink_wind;
+ case GF_FOP_RENAME:
+ return rename_wind;
+ default:
+ gf_log("crypt", GF_LOG_ERROR, "Bad link operation %d", fop);
+ return NULL;
+ }
+}
+
+static inline linkop_unwind_handler_t linkop_unwind_dispatch(glusterfs_fop_t fop)
+{
+ switch(fop){
+ case GF_FOP_LINK:
+ return link_unwind;
+ case GF_FOP_UNLINK:
+ return unlink_unwind;
+ case GF_FOP_RENAME:
+ return rename_unwind;
+ default:
+ gf_log("crypt", GF_LOG_ERROR, "Bad link operation %d", fop);
+ return NULL;
+ }
+}
+
+static inline mtd_op_t linkop_mtdop_dispatch(glusterfs_fop_t fop)
+{
+ switch (fop) {
+ case GF_FOP_LINK:
+ return MTD_APPEND;
+ case GF_FOP_UNLINK:
+ return MTD_CUT;
+ case GF_FOP_RENAME:
+ return MTD_OVERWRITE;
+ default:
+ gf_log("crypt", GF_LOG_WARNING, "Bad link operation %d", fop);
+ return MTD_LAST_OP;
+ }
+}
+
+#endif /* __CRYPT_H__ */
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff --git a/xlators/encryption/crypt/src/data.c b/xlators/encryption/crypt/src/data.c
new file mode 100644
index 000000000..762fa554a
--- /dev/null
+++ b/xlators/encryption/crypt/src/data.c
@@ -0,0 +1,769 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "defaults.h"
+#include "crypt-common.h"
+#include "crypt.h"
+
+static void set_iv_aes_xts(off_t offset, struct object_cipher_info *object)
+{
+ unsigned char *ivec;
+
+ ivec = object->u.aes_xts.ivec;
+
+ /* convert the tweak into a little-endian byte
+ * array (IEEE P1619/D16, May 2007, section 5.1)
+ */
+
+ *((uint64_t *)ivec) = htole64(offset);
+
+ /* ivec is padded with zeroes */
+}
+
+static int32_t aes_set_keys_common(unsigned char *raw_key, uint32_t key_size,
+ AES_KEY *keys)
+{
+ int32_t ret;
+
+ ret = AES_set_encrypt_key(raw_key,
+ key_size,
+ &keys[AES_ENCRYPT]);
+ if (ret) {
+ gf_log("crypt", GF_LOG_ERROR, "Set encrypt key failed");
+ return ret;
+ }
+ ret = AES_set_decrypt_key(raw_key,
+ key_size,
+ &keys[AES_DECRYPT]);
+ if (ret) {
+ gf_log("crypt", GF_LOG_ERROR, "Set decrypt key failed");
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * set private cipher info for xts mode
+ */
+static int32_t set_private_aes_xts(struct crypt_inode_info *info,
+ struct master_cipher_info *master)
+{
+ int ret;
+ struct object_cipher_info *object = get_object_cinfo(info);
+ unsigned char *data_key;
+ uint32_t subkey_size;
+
+ /* init tweak value */
+ memset(object->u.aes_xts.ivec, 0, 16);
+
+ data_key = GF_CALLOC(1, object->o_dkey_size, gf_crypt_mt_key);
+ if (!data_key)
+ return ENOMEM;
+
+ /*
+ * retrieve data keying meterial
+ */
+ ret = get_data_file_key(info, master, object->o_dkey_size, data_key);
+ if (ret) {
+ gf_log("crypt", GF_LOG_ERROR, "Failed to retrieve data key");
+ GF_FREE(data_key);
+ return ret;
+ }
+ /*
+ * parse compound xts key
+ */
+ subkey_size = object->o_dkey_size >> 4; /* (xts-key-size-in-bytes / 2) */
+ /*
+ * install key for data encryption
+ */
+ ret = aes_set_keys_common(data_key,
+ subkey_size << 3, object->u.aes_xts.dkey);
+ if (ret) {
+ GF_FREE(data_key);
+ return ret;
+ }
+ /*
+ * set up key used to encrypt tweaks
+ */
+ ret = AES_set_encrypt_key(data_key + subkey_size,
+ object->o_dkey_size / 2,
+ &object->u.aes_xts.tkey);
+ if (ret < 0)
+ gf_log("crypt", GF_LOG_ERROR, "Set tweak key failed");
+
+ GF_FREE(data_key);
+ return ret;
+}
+
+static int32_t aes_xts_init(void)
+{
+ cassert(AES_BLOCK_SIZE == (1 << AES_BLOCK_BITS));
+ return 0;
+}
+
+static int32_t check_key_aes_xts(uint32_t keysize)
+{
+ switch(keysize) {
+ case 256:
+ case 512:
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+static int32_t encrypt_aes_xts(const unsigned char *from,
+ unsigned char *to, size_t length,
+ off_t offset, const int enc,
+ struct object_cipher_info *object)
+{
+ XTS128_CONTEXT ctx;
+ if (enc) {
+ ctx.key1 = &object->u.aes_xts.dkey[AES_ENCRYPT];
+ ctx.block1 = (block128_f)AES_encrypt;
+ }
+ else {
+ ctx.key1 = &object->u.aes_xts.dkey[AES_DECRYPT];
+ ctx.block1 = (block128_f)AES_decrypt;
+ }
+ ctx.key2 = &object->u.aes_xts.tkey;
+ ctx.block2 = (block128_f)AES_encrypt;
+
+ return CRYPTO_xts128_encrypt(&ctx,
+ object->u.aes_xts.ivec,
+ from,
+ to,
+ length, enc);
+}
+
+/*
+ * Cipher input chunk @from of length @len;
+ * @to: result of cipher transform;
+ * @off: offset in a file (must be cblock-aligned);
+ */
+static void cipher_data(struct object_cipher_info *object,
+ char *from,
+ char *to,
+ off_t off,
+ size_t len,
+ const int enc)
+{
+ crypt_check_input_len(len, object);
+
+#if TRIVIAL_TFM && DEBUG_CRYPT
+ return;
+#endif
+ data_cipher_algs[object->o_alg][object->o_mode].set_iv(off, object);
+ data_cipher_algs[object->o_alg][object->o_mode].encrypt
+ ((const unsigned char *)from,
+ (unsigned char *)to,
+ len,
+ off,
+ enc,
+ object);
+}
+
+#define MAX_CIPHER_CHUNK (1 << 30)
+
+/*
+ * Do cipher (encryption/decryption) transform of a
+ * continuous region of memory.
+ *
+ * @len: a number of bytes to transform;
+ * @buf: data to transform;
+ * @off: offset in a file, should be block-aligned
+ * for atomic cipher modes and ksize-aligned
+ * for other modes).
+ * @dir: direction of transform (encrypt/decrypt).
+ */
+static void cipher_region(struct object_cipher_info *object,
+ char *from,
+ char *to,
+ off_t off,
+ size_t len,
+ int dir)
+{
+ while (len > 0) {
+ size_t to_cipher;
+
+ to_cipher = len;
+ if (to_cipher > MAX_CIPHER_CHUNK)
+ to_cipher = MAX_CIPHER_CHUNK;
+
+ /* this will reset IV */
+ cipher_data(object,
+ from,
+ to,
+ off,
+ to_cipher,
+ dir);
+ from += to_cipher;
+ to += to_cipher;
+ off += to_cipher;
+ len -= to_cipher;
+ }
+}
+
+/*
+ * Do cipher transform (encryption/decryption) of
+ * plaintext/ciphertext represented by @vec.
+ *
+ * Pre-conditions: @vec represents a continuous piece
+ * of data in a file at offset @off to be ciphered
+ * (encrypted/decrypted).
+ * @count is the number of vec's components. All the
+ * components must be block-aligned, the caller is
+ * responsible for this. @dir is "direction" of
+ * transform (encrypt/decrypt).
+ */
+static void cipher_aligned_iov(struct object_cipher_info *object,
+ struct iovec *vec,
+ int count,
+ off_t off,
+ int32_t dir)
+{
+ int i;
+ int len = 0;
+
+ for (i = 0; i < count; i++) {
+ cipher_region(object,
+ vec[i].iov_base,
+ vec[i].iov_base,
+ off + len,
+ vec[i].iov_len,
+ dir);
+ len += vec[i].iov_len;
+ }
+}
+
+void encrypt_aligned_iov(struct object_cipher_info *object,
+ struct iovec *vec,
+ int count,
+ off_t off)
+{
+ cipher_aligned_iov(object, vec, count, off, 1);
+}
+
+void decrypt_aligned_iov(struct object_cipher_info *object,
+ struct iovec *vec,
+ int count,
+ off_t off)
+{
+ cipher_aligned_iov(object, vec, count, off, 0);
+}
+
+#if DEBUG_CRYPT
+static void compound_stream(struct iovec *vec, int count, char *buf, off_t skip)
+{
+ int i;
+ int off = 0;
+ for (i = 0; i < count; i++) {
+ memcpy(buf + off,
+ vec[i].iov_base + skip,
+ vec[i].iov_len - skip);
+
+ off += (vec[i].iov_len - skip);
+ skip = 0;
+ }
+}
+
+static void check_iovecs(struct iovec *vec, int cnt,
+ struct iovec *avec, int acnt, uint32_t off_in_head)
+{
+ char *s1, *s2;
+ uint32_t size, asize;
+
+ size = iovec_get_size(vec, cnt);
+ asize = iovec_get_size(avec, acnt) - off_in_head;
+ if (size != asize) {
+ gf_log("crypt", GF_LOG_DEBUG, "size %d is not eq asize %d",
+ size, asize);
+ return;
+ }
+ s1 = GF_CALLOC(1, size, gf_crypt_mt_data);
+ if (!s1) {
+ gf_log("crypt", GF_LOG_DEBUG, "Can not allocate stream ");
+ return;
+ }
+ s2 = GF_CALLOC(1, asize, gf_crypt_mt_data);
+ if (!s2) {
+ GF_FREE(s1);
+ gf_log("crypt", GF_LOG_DEBUG, "Can not allocate stream ");
+ return;
+ }
+ compound_stream(vec, cnt, s1, 0);
+ compound_stream(avec, acnt, s2, off_in_head);
+ if (memcmp(s1, s2, size))
+ gf_log("crypt", GF_LOG_DEBUG, "chunks of different data");
+ GF_FREE(s1);
+ GF_FREE(s2);
+}
+
+#else
+#define check_iovecs(vec, count, avec, avecn, off) noop
+#endif /* DEBUG_CRYPT */
+
+static char *data_alloc_block(xlator_t *this, crypt_local_t *local,
+ int32_t block_size)
+{
+ struct iobuf *iobuf = NULL;
+
+ iobuf = iobuf_get2(this->ctx->iobuf_pool, block_size);
+ if (!iobuf) {
+ gf_log("crypt", GF_LOG_ERROR,
+ "Failed to get iobuf");
+ return NULL;
+ }
+ if (!local->iobref_data) {
+ local->iobref_data = iobref_new();
+ if (!local->iobref_data) {
+ gf_log("crypt", GF_LOG_ERROR,
+ "Failed to get iobref");
+ iobuf_unref(iobuf);
+ return NULL;
+ }
+ }
+ iobref_add(local->iobref_data, iobuf);
+ return iobuf->ptr;
+}
+
+/*
+ * Compound @avec, which represent the same data
+ * chunk as @vec, but has aligned components of
+ * specified block size. Alloc blocks, if needed.
+ * In particular, incomplete head and tail blocks
+ * must be allocated.
+ * Put number of allocated blocks to @num_blocks.
+ *
+ * Example:
+ *
+ * input: data chunk represented by 4 components
+ * [AB],[BC],[CD],[DE];
+ * output: 5 logical blocks (0, 1, 2, 3, 4).
+ *
+ * A B C D E
+ * *-----*+------*-+---*----+--------+-*
+ * | || | | | | | |
+ * *-+-----+*------+-*---+----*--------*-+------*
+ * 0 1 2 3 4
+ *
+ * 0 - incomplete compound (head);
+ * 1, 2 - full compound;
+ * 3 - full non-compound (the case of reuse);
+ * 4 - incomplete non-compound (tail).
+ */
+int32_t align_iov_by_atoms(xlator_t *this,
+ crypt_local_t *local,
+ struct object_cipher_info *object,
+ struct iovec *vec /* input vector */,
+ int32_t count /* number of vec components */,
+ struct iovec *avec /* aligned vector */,
+ char **blocks /* pool of blocks */,
+ uint32_t *blocks_allocated,
+ struct avec_config *conf)
+{
+ int vecn = 0; /* number of the current component in vec */
+ int avecn = 0; /* number of the current component in avec */
+ off_t vec_off = 0; /* offset in the current vec component,
+ * i.e. the number of bytes have already
+ * been copied */
+ int32_t block_size = get_atom_size(object);
+ size_t to_process; /* number of vec's bytes to copy and(or) re-use */
+ int32_t off_in_head = conf->off_in_head;
+
+ to_process = iovec_get_size(vec, count);
+
+ while (to_process > 0) {
+ if (off_in_head ||
+ vec[vecn].iov_len - vec_off < block_size) {
+ /*
+ * less than block_size:
+ * the case of incomplete (head or tail),
+ * or compound block
+ */
+ size_t copied = 0;
+ /*
+ * populate the pool with a new block
+ */
+ blocks[*blocks_allocated] = data_alloc_block(this,
+ local,
+ block_size);
+ if (!blocks[*blocks_allocated])
+ return -ENOMEM;
+ memset(blocks[*blocks_allocated], 0, off_in_head);
+ /*
+ * fill the block with vec components
+ */
+ do {
+ size_t to_copy;
+
+ to_copy = vec[vecn].iov_len - vec_off;
+ if (to_copy > block_size - off_in_head)
+ to_copy = block_size - off_in_head;
+
+ memcpy(blocks[*blocks_allocated] + off_in_head + copied,
+ vec[vecn].iov_base + vec_off,
+ to_copy);
+
+ copied += to_copy;
+ to_process -= to_copy;
+
+ vec_off += to_copy;
+ if (vec_off == vec[vecn].iov_len) {
+ /* finished with this vecn */
+ vec_off = 0;
+ vecn++;
+ }
+ } while (copied < (block_size - off_in_head) && to_process > 0);
+ /*
+ * update avec
+ */
+ avec[avecn].iov_len = off_in_head + copied;
+ avec[avecn].iov_base = blocks[*blocks_allocated];
+
+ (*blocks_allocated)++;
+ off_in_head = 0;
+ } else {
+ /*
+ * the rest of the current vec component
+ * is not less than block_size, so reuse
+ * the memory buffer of the component.
+ */
+ size_t to_reuse;
+ to_reuse = (to_process > block_size ?
+ block_size :
+ to_process);
+ avec[avecn].iov_len = to_reuse;
+ avec[avecn].iov_base = vec[vecn].iov_base + vec_off;
+
+ vec_off += to_reuse;
+ if (vec_off == vec[vecn].iov_len) {
+ /* finished with this vecn */
+ vec_off = 0;
+ vecn++;
+ }
+ to_process -= to_reuse;
+ }
+ avecn++;
+ }
+ check_iovecs(vec, count, avec, avecn, conf->off_in_head);
+ return 0;
+}
+
+/*
+ * allocate and setup aligned vector for data submission
+ * Pre-condition: @conf is set.
+ */
+int32_t set_config_avec_data(xlator_t *this,
+ crypt_local_t *local,
+ struct avec_config *conf,
+ struct object_cipher_info *object,
+ struct iovec *vec,
+ int32_t vec_count)
+{
+ int32_t ret = ENOMEM;
+ struct iovec *avec;
+ char **pool;
+ uint32_t blocks_in_pool = 0;
+
+ conf->type = DATA_ATOM;
+
+ avec = GF_CALLOC(conf->acount, sizeof(*avec), gf_crypt_mt_iovec);
+ if (!avec)
+ return ret;
+ pool = GF_CALLOC(conf->acount, sizeof(pool), gf_crypt_mt_char);
+ if (!pool) {
+ GF_FREE(avec);
+ return ret;
+ }
+ if (!vec) {
+ /*
+ * degenerated case: no data
+ */
+ pool[0] = data_alloc_block(this, local, get_atom_size(object));
+ if (!pool[0])
+ goto free;
+ blocks_in_pool = 1;
+ avec->iov_base = pool[0];
+ avec->iov_len = conf->off_in_tail;
+ }
+ else {
+ ret = align_iov_by_atoms(this, local, object, vec, vec_count,
+ avec, pool, &blocks_in_pool, conf);
+ if (ret)
+ goto free;
+ }
+ conf->avec = avec;
+ conf->pool = pool;
+ conf->blocks_in_pool = blocks_in_pool;
+ return 0;
+ free:
+ GF_FREE(avec);
+ GF_FREE(pool);
+ return ret;
+}
+
+/*
+ * allocate and setup aligned vector for hole submission
+ */
+int32_t set_config_avec_hole(xlator_t *this,
+ crypt_local_t *local,
+ struct avec_config *conf,
+ struct object_cipher_info *object,
+ glusterfs_fop_t fop)
+{
+ uint32_t i, idx;
+ struct iovec *avec;
+ char **pool;
+ uint32_t num_blocks;
+ uint32_t blocks_in_pool = 0;
+
+ conf->type = HOLE_ATOM;
+
+ num_blocks = conf->acount -
+ (conf->nr_full_blocks ? conf->nr_full_blocks - 1 : 0);
+
+ switch (fop) {
+ case GF_FOP_WRITE:
+ /*
+ * hole goes before data
+ */
+ if (num_blocks == 1 && conf->off_in_tail != 0)
+ /*
+ * we won't submit a hole which fits into
+ * a data atom: this part of hole will be
+ * submitted with data write
+ */
+ return 0;
+ break;
+ case GF_FOP_FTRUNCATE:
+ /*
+ * expanding truncate, hole goes after data,
+ * and will be submited in any case.
+ */
+ break;
+ default:
+ gf_log("crypt", GF_LOG_WARNING,
+ "bad file operation %d", fop);
+ return 0;
+ }
+ avec = GF_CALLOC(num_blocks, sizeof(*avec), gf_crypt_mt_iovec);
+ if (!avec)
+ return ENOMEM;
+ pool = GF_CALLOC(num_blocks, sizeof(pool), gf_crypt_mt_char);
+ if (!pool) {
+ GF_FREE(avec);
+ return ENOMEM;
+ }
+ for (i = 0; i < num_blocks; i++) {
+ pool[i] = data_alloc_block(this, local, get_atom_size(object));
+ if (pool[i] == NULL)
+ goto free;
+ blocks_in_pool++;
+ }
+ if (has_head_block(conf)) {
+ /* set head block */
+ idx = 0;
+ avec[idx].iov_base = pool[idx];
+ avec[idx].iov_len = get_atom_size(object);
+ memset(avec[idx].iov_base + conf->off_in_head,
+ 0,
+ get_atom_size(object) - conf->off_in_head);
+ }
+ if (has_tail_block(conf)) {
+ /* set tail block */
+ idx = num_blocks - 1;
+ avec[idx].iov_base = pool[idx];
+ avec[idx].iov_len = get_atom_size(object);
+ memset(avec[idx].iov_base, 0, conf->off_in_tail);
+ }
+ if (has_full_blocks(conf)) {
+ /* set full block */
+ idx = conf->off_in_head ? 1 : 0;
+ avec[idx].iov_base = pool[idx];
+ avec[idx].iov_len = get_atom_size(object);
+ /*
+ * since we re-use the buffer,
+ * zeroes will be set every time
+ * before encryption, see submit_full()
+ */
+ }
+ conf->avec = avec;
+ conf->pool = pool;
+ conf->blocks_in_pool = blocks_in_pool;
+ return 0;
+ free:
+ GF_FREE(avec);
+ GF_FREE(pool);
+ return ENOMEM;
+}
+
+/* A helper for setting up config of partial atoms (which
+ * participate in read-modify-write sequence).
+ *
+ * Calculate and setup precise amount of "extra-bytes"
+ * that should be uptodated at the end of partial (not
+ * necessarily tail!) block.
+ *
+ * Pre-condition: local->old_file_size is valid!
+ * @conf contains setup, which is enough for correct calculation
+ * of has_tail_block(), ->get_offset().
+ */
+void set_gap_at_end(call_frame_t *frame, struct object_cipher_info *object,
+ struct avec_config *conf, atom_data_type dtype)
+{
+ uint32_t to_block;
+ crypt_local_t *local = frame->local;
+ uint64_t old_file_size = local->old_file_size;
+ struct rmw_atom *partial = atom_by_types(dtype,
+ has_tail_block(conf) ?
+ TAIL_ATOM : HEAD_ATOM);
+
+ if (old_file_size <= partial->offset_at(frame, object))
+ to_block = 0;
+ else {
+ to_block = old_file_size - partial->offset_at(frame, object);
+ if (to_block > get_atom_size(object))
+ to_block = get_atom_size(object);
+ }
+ if (to_block > conf->off_in_tail)
+ conf->gap_in_tail = to_block - conf->off_in_tail;
+ else
+ /*
+ * nothing to uptodate
+ */
+ conf->gap_in_tail = 0;
+}
+
+/*
+ * fill struct avec_config with offsets layouts
+ */
+void set_config_offsets(call_frame_t *frame,
+ xlator_t *this,
+ uint64_t offset,
+ uint64_t count,
+ atom_data_type dtype,
+ int32_t set_gap)
+{
+ crypt_local_t *local;
+ struct object_cipher_info *object;
+ struct avec_config *conf;
+ uint32_t resid;
+
+ uint32_t atom_size;
+ uint32_t atom_bits;
+
+ size_t orig_size;
+ off_t orig_offset;
+ size_t expanded_size;
+ off_t aligned_offset;
+
+ uint32_t off_in_head = 0;
+ uint32_t off_in_tail = 0;
+ uint32_t nr_full_blocks;
+ int32_t size_full_blocks;
+
+ uint32_t acount; /* number of alifned components to write.
+ * The same as number of occupied logical
+ * blocks (atoms)
+ */
+ local = frame->local;
+ object = &local->info->cinfo;
+ conf = (dtype == DATA_ATOM ?
+ get_data_conf(frame) : get_hole_conf(frame));
+
+ orig_offset = offset;
+ orig_size = count;
+
+ atom_size = get_atom_size(object);
+ atom_bits = get_atom_bits(object);
+
+ /*
+ * Round-down the start,
+ * round-up the end.
+ */
+ resid = offset & (uint64_t)(atom_size - 1);
+
+ if (resid)
+ off_in_head = resid;
+ aligned_offset = offset - off_in_head;
+ expanded_size = orig_size + off_in_head;
+
+ /* calculate tail,
+ expand size forward */
+ resid = (offset + orig_size) & (uint64_t)(atom_size - 1);
+
+ if (resid) {
+ off_in_tail = resid;
+ expanded_size += (atom_size - off_in_tail);
+ }
+ /*
+ * calculate number of occupied blocks
+ */
+ acount = expanded_size >> atom_bits;
+ /*
+ * calculate number of full blocks
+ */
+ size_full_blocks = expanded_size;
+ if (off_in_head)
+ size_full_blocks -= atom_size;
+ if (off_in_tail && size_full_blocks > 0)
+ size_full_blocks -= atom_size;
+ nr_full_blocks = size_full_blocks >> atom_bits;
+
+ conf->atom_size = atom_size;
+ conf->orig_size = orig_size;
+ conf->orig_offset = orig_offset;
+ conf->expanded_size = expanded_size;
+ conf->aligned_offset = aligned_offset;
+
+ conf->off_in_head = off_in_head;
+ conf->off_in_tail = off_in_tail;
+ conf->nr_full_blocks = nr_full_blocks;
+ conf->acount = acount;
+ /*
+ * Finally, calculate precise amount of
+ * "extra-bytes" that should be uptodated
+ * at the end.
+ * Only if RMW is expected.
+ */
+ if (off_in_tail && set_gap)
+ set_gap_at_end(frame, object, conf, dtype);
+}
+
+struct data_cipher_alg data_cipher_algs[LAST_CIPHER_ALG][LAST_CIPHER_MODE] = {
+ [AES_CIPHER_ALG][XTS_CIPHER_MODE] =
+ { .atomic = _gf_true,
+ .should_pad = _gf_true,
+ .blkbits = AES_BLOCK_BITS,
+ .init = aes_xts_init,
+ .set_private = set_private_aes_xts,
+ .check_key = check_key_aes_xts,
+ .set_iv = set_iv_aes_xts,
+ .encrypt = encrypt_aes_xts
+ }
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff --git a/xlators/encryption/crypt/src/keys.c b/xlators/encryption/crypt/src/keys.c
new file mode 100644
index 000000000..4a1d3bb5a
--- /dev/null
+++ b/xlators/encryption/crypt/src/keys.c
@@ -0,0 +1,302 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "defaults.h"
+#include "crypt-common.h"
+#include "crypt.h"
+
+/* Key hierarchy
+
+ +----------------+
+ | MASTER_VOL_KEY |
+ +-------+--------+
+ |
+ |
+ +----------------+----------------+
+ | | |
+ | | |
+ +-------+------+ +-------+-------+ +------+--------+
+ | NMTD_VOL_KEY | | EMTD_FILE_KEY | | DATA_FILE_KEY |
+ +-------+------+ +---------------+ +---------------+
+ |
+ |
+ +-------+-------+
+ | NMTD_LINK_KEY |
+ +---------------+
+
+ */
+
+#if DEBUG_CRYPT
+static void check_prf_iters(uint32_t num_iters)
+{
+ if (num_iters == 0)
+ gf_log ("crypt", GF_LOG_DEBUG,
+ "bad number of prf iterations : %d", num_iters);
+}
+#else
+#define check_prf_iters(num_iters) noop
+#endif /* DEBUG_CRYPT */
+
+unsigned char crypt_fake_oid[16] =
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+/*
+ * derive key in the counter mode using
+ * sha256-based HMAC as PRF, see
+ * NIST Special Publication 800-108, 5.1)
+ */
+
+#define PRF_OUTPUT_SIZE SHA256_DIGEST_LENGTH
+
+static int32_t kderive_init(struct kderive_context *ctx,
+ const unsigned char *pkey, /* parent key */
+ uint32_t pkey_size, /* parent key size */
+ const unsigned char *idctx, /* id-context */
+ uint32_t idctx_size,
+ crypt_key_type type /* type of child key */)
+{
+ unsigned char *pos;
+ uint32_t llen = strlen(crypt_keys[type].label);
+ /*
+ * Compoud the fixed input data for KDF:
+ * [i]_2 || Label || 0x00 || Id-Context || [L]_2),
+ * NIST SP 800-108, 5.1
+ */
+ ctx->fid_len =
+ sizeof(uint32_t) +
+ llen +
+ 1 +
+ idctx_size +
+ sizeof(uint32_t);
+
+ ctx->fid = GF_CALLOC(ctx->fid_len, 1, gf_crypt_mt_key);
+ if (!ctx->fid)
+ return ENOMEM;
+ ctx->out_len = round_up(crypt_keys[type].len >> 3,
+ PRF_OUTPUT_SIZE);
+ ctx->out = GF_CALLOC(ctx->out_len, 1, gf_crypt_mt_key);
+ if (!ctx->out) {
+ GF_FREE(ctx->fid);
+ return ENOMEM;
+ }
+ ctx->pkey = pkey;
+ ctx->pkey_len = pkey_size;
+ ctx->ckey_len = crypt_keys[type].len;
+
+ pos = ctx->fid;
+
+ /* counter will be set up in kderive_rfn() */
+ pos += sizeof(uint32_t);
+
+ memcpy(pos, crypt_keys[type].label, llen);
+ pos += llen;
+
+ /* set up zero octet */
+ *pos = 0;
+ pos += 1;
+
+ memcpy(pos, idctx, idctx_size);
+ pos += idctx_size;
+
+ *((uint32_t *)pos) = htobe32(ctx->ckey_len);
+
+ return 0;
+}
+
+static void kderive_update(struct kderive_context *ctx)
+{
+ uint32_t i;
+ HMAC_CTX hctx;
+ unsigned char *pos = ctx->out;
+ uint32_t *p_iter = (uint32_t *)ctx->fid;
+ uint32_t num_iters = ctx->out_len / PRF_OUTPUT_SIZE;
+
+ check_prf_iters(num_iters);
+
+ HMAC_CTX_init(&hctx);
+ for (i = 0; i < num_iters; i++) {
+ /*
+ * update the iteration number in the fid
+ */
+ *p_iter = htobe32(i);
+ HMAC_Init_ex(&hctx,
+ ctx->pkey, ctx->pkey_len >> 3,
+ EVP_sha256(),
+ NULL);
+ HMAC_Update(&hctx, ctx->fid, ctx->fid_len);
+ HMAC_Final(&hctx, pos, NULL);
+
+ pos += PRF_OUTPUT_SIZE;
+ }
+ HMAC_CTX_cleanup(&hctx);
+}
+
+static void kderive_final(struct kderive_context *ctx, unsigned char *child)
+{
+ memcpy(child, ctx->out, ctx->ckey_len >> 3);
+ GF_FREE(ctx->fid);
+ GF_FREE(ctx->out);
+ memset(ctx, 0, sizeof(*ctx));
+}
+
+/*
+ * derive per-volume key for object ids aithentication
+ */
+int32_t get_nmtd_vol_key(struct master_cipher_info *master)
+{
+ int32_t ret;
+ struct kderive_context ctx;
+
+ ret = kderive_init(&ctx,
+ master->m_key,
+ master_key_size(),
+ crypt_fake_oid, sizeof(uuid_t), NMTD_VOL_KEY);
+ if (ret)
+ return ret;
+ kderive_update(&ctx);
+ kderive_final(&ctx, master->m_nmtd_key);
+ return 0;
+}
+
+/*
+ * derive per-link key for aithentication of non-encrypted
+ * meta-data (nmtd)
+ */
+int32_t get_nmtd_link_key(loc_t *loc,
+ struct master_cipher_info *master,
+ unsigned char *result)
+{
+ int32_t ret;
+ struct kderive_context ctx;
+
+ ret = kderive_init(&ctx,
+ master->m_nmtd_key,
+ nmtd_vol_key_size(),
+ (const unsigned char *)loc->path,
+ strlen(loc->path), NMTD_LINK_KEY);
+ if (ret)
+ return ret;
+ kderive_update(&ctx);
+ kderive_final(&ctx, result);
+ return 0;
+}
+
+/*
+ * derive per-file key for encryption and authentication
+ * of encrypted part of metadata (emtd)
+ */
+int32_t get_emtd_file_key(struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ unsigned char *result)
+{
+ int32_t ret;
+ struct kderive_context ctx;
+
+ ret = kderive_init(&ctx,
+ master->m_key,
+ master_key_size(),
+ info->oid, sizeof(uuid_t), EMTD_FILE_KEY);
+ if (ret)
+ return ret;
+ kderive_update(&ctx);
+ kderive_final(&ctx, result);
+ return 0;
+}
+
+static int32_t data_key_type_by_size(uint32_t keysize, crypt_key_type *type)
+{
+ int32_t ret = 0;
+ switch (keysize) {
+ case 256:
+ *type = DATA_FILE_KEY_256;
+ break;
+ case 512:
+ *type = DATA_FILE_KEY_512;
+ break;
+ default:
+ gf_log("crypt", GF_LOG_ERROR, "Unsupported data key size %d",
+ keysize);
+ ret = ENOTSUP;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * derive per-file key for data encryption
+ */
+int32_t get_data_file_key(struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ uint32_t keysize,
+ unsigned char *key)
+{
+ int32_t ret;
+ struct kderive_context ctx;
+ crypt_key_type type;
+
+ ret = data_key_type_by_size(keysize, &type);
+ if (ret)
+ return ret;
+ ret = kderive_init(&ctx,
+ master->m_key,
+ master_key_size(),
+ info->oid, sizeof(uuid_t), type);
+ if (ret)
+ return ret;
+ kderive_update(&ctx);
+ kderive_final(&ctx, key);
+ return 0;
+}
+
+/*
+ * NOTE: Don't change existing keys: it will break compatibility;
+ */
+struct crypt_key crypt_keys[LAST_KEY_TYPE] = {
+ [MASTER_VOL_KEY] =
+ { .len = MASTER_VOL_KEY_SIZE << 3,
+ .label = "volume-master",
+ },
+ [NMTD_VOL_KEY] =
+ { .len = NMTD_VOL_KEY_SIZE << 3,
+ .label = "volume-nmtd-key-generation"
+ },
+ [NMTD_LINK_KEY] =
+ { .len = 128,
+ .label = "link-nmtd-authentication"
+ },
+ [EMTD_FILE_KEY] =
+ { .len = 128,
+ .label = "file-emtd-encryption-and-auth"
+ },
+ [DATA_FILE_KEY_256] =
+ { .len = 256,
+ .label = "file-data-encryption-256"
+ },
+ [DATA_FILE_KEY_512] =
+ { .len = 512,
+ .label = "file-data-encryption-512"
+ }
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff --git a/xlators/encryption/crypt/src/metadata.c b/xlators/encryption/crypt/src/metadata.c
new file mode 100644
index 000000000..36b14c055
--- /dev/null
+++ b/xlators/encryption/crypt/src/metadata.c
@@ -0,0 +1,605 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "defaults.h"
+#include "crypt-common.h"
+#include "crypt.h"
+#include "metadata.h"
+
+int32_t alloc_format(crypt_local_t *local, size_t size)
+{
+ if (size > 0) {
+ local->format = GF_CALLOC(1, size, gf_crypt_mt_mtd);
+ if (!local->format)
+ return ENOMEM;
+ }
+ local->format_size = size;
+ return 0;
+}
+
+int32_t alloc_format_create(crypt_local_t *local)
+{
+ return alloc_format(local, new_format_size());
+}
+
+void free_format(crypt_local_t *local)
+{
+ GF_FREE(local->format);
+}
+
+/*
+ * Check compatibility with extracted metadata
+ */
+static int32_t check_file_metadata(struct crypt_inode_info *info)
+{
+ struct object_cipher_info *object = &info->cinfo;
+
+ if (info->nr_minor != CRYPT_XLATOR_ID) {
+ gf_log("crypt", GF_LOG_WARNING,
+ "unsupported minor subversion %d", info->nr_minor);
+ return EINVAL;
+ }
+ if (object->o_alg > LAST_CIPHER_ALG) {
+ gf_log("crypt", GF_LOG_WARNING,
+ "unsupported cipher algorithm %d",
+ object->o_alg);
+ return EINVAL;
+ }
+ if (object->o_mode > LAST_CIPHER_MODE) {
+ gf_log("crypt", GF_LOG_WARNING,
+ "unsupported cipher mode %d",
+ object->o_mode);
+ return EINVAL;
+ }
+ if (object->o_block_bits < CRYPT_MIN_BLOCK_BITS ||
+ object->o_block_bits > CRYPT_MAX_BLOCK_BITS) {
+ gf_log("crypt", GF_LOG_WARNING, "unsupported block bits %d",
+ object->o_block_bits);
+ return EINVAL;
+ }
+ /* TBD: check data key size */
+ return 0;
+}
+
+static size_t format_size_v1(mtd_op_t op, size_t old_size)
+{
+
+ switch (op) {
+ case MTD_CREATE:
+ return sizeof(struct mtd_format_v1);
+ case MTD_OVERWRITE:
+ return old_size;
+ case MTD_APPEND:
+ return old_size + NMTD_8_MAC_SIZE;
+ case MTD_CUT:
+ if (old_size > sizeof(struct mtd_format_v1))
+ return old_size - NMTD_8_MAC_SIZE;
+ else
+ return 0;
+ default:
+ gf_log("crypt", GF_LOG_WARNING, "Bad mtd operation");
+ return 0;
+ }
+}
+
+/*
+ * Calculate size of the updated format string.
+ * Returned zero means that we don't need to update the format string.
+ */
+size_t format_size(mtd_op_t op, size_t old_size)
+{
+ size_t versioned;
+
+ versioned = mtd_loaders[current_mtd_loader()].format_size(op,
+ old_size - sizeof(struct crypt_format));
+ if (versioned != 0)
+ return versioned + sizeof(struct crypt_format);
+ return 0;
+}
+
+/*
+ * size of the format string of newly created file (nr_links = 1)
+ */
+size_t new_format_size(void)
+{
+ return format_size(MTD_CREATE, 0);
+}
+
+/*
+ * Calculate per-link MAC by pathname
+ */
+static int32_t calc_link_mac_v1(struct mtd_format_v1 *fmt,
+ loc_t *loc,
+ unsigned char *result,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master)
+{
+ int32_t ret;
+ unsigned char nmtd_link_key[16];
+ CMAC_CTX *cctx;
+ size_t len;
+
+ ret = get_nmtd_link_key(loc, master, nmtd_link_key);
+ if (ret) {
+ gf_log("crypt", GF_LOG_ERROR, "Can not get nmtd link key");
+ return -1;
+ }
+ cctx = CMAC_CTX_new();
+ if (!cctx) {
+ gf_log("crypt", GF_LOG_ERROR, "CMAC_CTX_new failed");
+ return -1;
+ }
+ ret = CMAC_Init(cctx, nmtd_link_key, sizeof(nmtd_link_key),
+ EVP_aes_128_cbc(), 0);
+ if (!ret) {
+ gf_log("crypt", GF_LOG_ERROR, "CMAC_Init failed");
+ CMAC_CTX_free(cctx);
+ return -1;
+ }
+ ret = CMAC_Update(cctx, get_NMTD_V1(info), SIZE_OF_NMTD_V1);
+ if (!ret) {
+ gf_log("crypt", GF_LOG_ERROR, "CMAC_Update failed");
+ CMAC_CTX_free(cctx);
+ return -1;
+ }
+ ret = CMAC_Final(cctx, result, &len);
+ CMAC_CTX_free(cctx);
+ if (!ret) {
+ gf_log("crypt", GF_LOG_ERROR, "CMAC_Final failed");
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Create per-link MAC of index @idx by pathname
+ */
+static int32_t create_link_mac_v1(struct mtd_format_v1 *fmt,
+ uint32_t idx,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master)
+{
+ int32_t ret;
+ unsigned char *mac;
+ unsigned char cmac[16];
+
+ mac = get_NMTD_V1_MAC(fmt) + idx * SIZE_OF_NMTD_V1_MAC;
+
+ ret = calc_link_mac_v1(fmt, loc, cmac, info, master);
+ if (ret)
+ return -1;
+ memcpy(mac, cmac, SIZE_OF_NMTD_V1_MAC);
+ return 0;
+}
+
+static int32_t create_format_v1(unsigned char *wire,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master)
+{
+ int32_t ret;
+ struct mtd_format_v1 *fmt;
+ unsigned char mtd_key[16];
+ AES_KEY EMTD_KEY;
+ unsigned char nmtd_link_key[16];
+ uint32_t ad;
+ GCM128_CONTEXT *gctx;
+
+ fmt = (struct mtd_format_v1 *)wire;
+
+ fmt->minor_id = info->nr_minor;
+ fmt->alg_id = AES_CIPHER_ALG;
+ fmt->dkey_factor = master->m_dkey_size >> KEY_FACTOR_BITS;
+ fmt->block_bits = master->m_block_bits;
+ fmt->mode_id = master->m_mode;
+ /*
+ * retrieve keys for the parts of metadata
+ */
+ ret = get_emtd_file_key(info, master, mtd_key);
+ if (ret)
+ return ret;
+ ret = get_nmtd_link_key(loc, master, nmtd_link_key);
+ if (ret)
+ return ret;
+
+ AES_set_encrypt_key(mtd_key, sizeof(mtd_key)*8, &EMTD_KEY);
+
+ gctx = CRYPTO_gcm128_new(&EMTD_KEY, (block128_f)AES_encrypt);
+
+ /* TBD: Check return values */
+
+ CRYPTO_gcm128_setiv(gctx, info->oid, sizeof(uuid_t));
+
+ ad = htole32(MTD_LOADER_V1);
+ ret = CRYPTO_gcm128_aad(gctx, (const unsigned char *)&ad, sizeof(ad));
+ if (ret) {
+ gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_aad failed");
+ CRYPTO_gcm128_release(gctx);
+ return ret;
+ }
+ ret = CRYPTO_gcm128_encrypt(gctx,
+ get_EMTD_V1(fmt),
+ get_EMTD_V1(fmt),
+ SIZE_OF_EMTD_V1);
+ if (ret) {
+ gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_encrypt failed");
+ CRYPTO_gcm128_release(gctx);
+ return ret;
+ }
+ /*
+ * set MAC of encrypted part of metadata
+ */
+ CRYPTO_gcm128_tag(gctx, get_EMTD_V1_MAC(fmt), SIZE_OF_EMTD_V1_MAC);
+ CRYPTO_gcm128_release(gctx);
+ /*
+ * set the first MAC of non-encrypted part of metadata
+ */
+ return create_link_mac_v1(fmt, 0, loc, info, master);
+}
+
+/*
+ * Called by fops:
+ * ->create();
+ * ->link();
+ *
+ * Pack common and version-specific parts of file's metadata
+ * Pre-conditions: @info contains valid object-id.
+ */
+int32_t create_format(unsigned char *wire,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master)
+{
+ struct crypt_format *fmt = (struct crypt_format *)wire;
+
+ fmt->loader_id = current_mtd_loader();
+
+ wire += sizeof(struct crypt_format);
+ return mtd_loaders[current_mtd_loader()].create_format(wire, loc,
+ info, master);
+}
+
+/*
+ * Append or overwrite per-link mac of @mac_idx index
+ * in accordance with the new pathname
+ */
+int32_t appov_link_mac_v1(unsigned char *new,
+ unsigned char *old,
+ uint32_t old_size,
+ int32_t mac_idx,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ crypt_local_t *local)
+{
+ memcpy(new, old, old_size);
+ return create_link_mac_v1((struct mtd_format_v1 *)new, mac_idx,
+ loc, info, master);
+}
+
+/*
+ * Cut per-link mac of @mac_idx index
+ */
+static int32_t cut_link_mac_v1(unsigned char *new,
+ unsigned char *old,
+ uint32_t old_size,
+ int32_t mac_idx,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ crypt_local_t *local)
+{
+ memcpy(new,
+ old,
+ sizeof(struct mtd_format_v1) + NMTD_8_MAC_SIZE * (mac_idx - 1));
+
+ memcpy(new + sizeof(struct mtd_format_v1) + NMTD_8_MAC_SIZE * (mac_idx - 1),
+ old + sizeof(struct mtd_format_v1) + NMTD_8_MAC_SIZE * mac_idx,
+ old_size - (sizeof(struct mtd_format_v1) + NMTD_8_MAC_SIZE * mac_idx));
+ return 0;
+}
+
+int32_t update_format_v1(unsigned char *new,
+ unsigned char *old,
+ size_t old_len,
+ int32_t mac_idx, /* of old name */
+ mtd_op_t op,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ crypt_local_t *local)
+{
+ switch (op) {
+ case MTD_APPEND:
+ mac_idx = 1 + (old_len - sizeof(struct mtd_format_v1))/8;
+ case MTD_OVERWRITE:
+ return appov_link_mac_v1(new, old, old_len, mac_idx,
+ loc, info, master, local);
+ case MTD_CUT:
+ return cut_link_mac_v1(new, old, old_len, mac_idx,
+ loc, info, master, local);
+ default:
+ gf_log("crypt", GF_LOG_ERROR, "Bad mtd operation %d", op);
+ return -1;
+ }
+}
+
+/*
+ * Called by fops:
+ *
+ * ->link()
+ * ->unlink()
+ * ->rename()
+ *
+ */
+int32_t update_format(unsigned char *new,
+ unsigned char *old,
+ size_t old_len,
+ int32_t mac_idx,
+ mtd_op_t op,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ crypt_local_t *local)
+{
+ if (!new)
+ return 0;
+ memcpy(new, old, sizeof(struct crypt_format));
+
+ old += sizeof(struct crypt_format);
+ new += sizeof(struct crypt_format);
+ old_len -= sizeof(struct crypt_format);
+
+ return mtd_loaders[current_mtd_loader()].update_format(new, old,
+ old_len,
+ mac_idx, op,
+ loc, info,
+ master, local);
+}
+
+/*
+ * Perform preliminary checks of found metadata
+ * Return < 0 on errors;
+ * Return number of object-id MACs (>= 1) on success
+ */
+int32_t check_format_v1(uint32_t len, unsigned char *wire)
+{
+ uint32_t nr_links;
+
+ if (len < sizeof(struct mtd_format_v1)) {
+ gf_log("crypt", GF_LOG_ERROR,
+ "v1-loader: bad metadata size %d", len);
+ goto error;
+ }
+ len -= sizeof(struct mtd_format_v1);
+ if (len % sizeof(nmtd_8_mac_t)) {
+ gf_log("crypt", GF_LOG_ERROR,
+ "v1-loader: bad metadata format");
+ goto error;
+ }
+ nr_links = 1 + len / sizeof(nmtd_8_mac_t);
+ if (nr_links > _POSIX_LINK_MAX)
+ goto error;
+ return nr_links;
+ error:
+ return EIO;
+}
+
+/*
+ * Verify per-link MAC specified by index @idx
+ *
+ * return:
+ * -1 on errors;
+ * 0 on failed verification;
+ * 1 on sucessful verification
+ */
+static int32_t verify_link_mac_v1(struct mtd_format_v1 *fmt,
+ uint32_t idx /* index of the mac to verify */,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master)
+{
+ int32_t ret;
+ unsigned char *mac;
+ unsigned char cmac[16];
+
+ mac = get_NMTD_V1_MAC(fmt) + idx * SIZE_OF_NMTD_V1_MAC;
+
+ ret = calc_link_mac_v1(fmt, loc, cmac, info, master);
+ if (ret)
+ return -1;
+ if (memcmp(cmac, mac, SIZE_OF_NMTD_V1_MAC))
+ return 0;
+ return 1;
+}
+
+/*
+ * Lookup per-link MAC by pathname.
+ *
+ * return index of the MAC, if it was found;
+ * return < 0 on errors, or if the MAC wasn't found
+ */
+static int32_t lookup_link_mac_v1(struct mtd_format_v1 *fmt,
+ uint32_t nr_macs,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master)
+{
+ int32_t ret;
+ uint32_t idx;
+
+ for (idx = 0; idx < nr_macs; idx++) {
+ ret = verify_link_mac_v1(fmt, idx, loc, info, master);
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return idx;
+ }
+ return -ENOENT;
+}
+
+/*
+ * Extract version-specific part of metadata
+ */
+static int32_t open_format_v1(unsigned char *wire,
+ int32_t len,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ crypt_local_t *local,
+ gf_boolean_t load_info)
+{
+ int32_t ret;
+ int32_t num_nmtd_macs;
+ struct mtd_format_v1 *fmt;
+ unsigned char mtd_key[16];
+ AES_KEY EMTD_KEY;
+ GCM128_CONTEXT *gctx;
+ uint32_t ad;
+ emtd_8_mac_t gmac;
+ struct object_cipher_info *object;
+
+ num_nmtd_macs = check_format_v1(len, wire);
+ if (num_nmtd_macs <= 0)
+ return EIO;
+ fmt = (struct mtd_format_v1 *)wire;
+
+ ret = lookup_link_mac_v1(fmt, num_nmtd_macs, loc, info, master);
+ if (ret < 0) {
+ gf_log("crypt", GF_LOG_ERROR, "NMTD verification failed");
+ return EINVAL;
+ }
+ local->mac_idx = ret;
+ if (load_info == _gf_false)
+ /* the case of partial open */
+ return 0;
+
+ object = &info->cinfo;
+
+ ret = get_emtd_file_key(info, master, mtd_key);
+ if (ret) {
+ gf_log("crypt", GF_LOG_ERROR, "Can not retrieve metadata key");
+ return ret;
+ }
+ /*
+ * decrypt encrypted meta-data
+ */
+ ret = AES_set_encrypt_key(mtd_key, sizeof(mtd_key)*8, &EMTD_KEY);
+ if (ret < 0) {
+ gf_log("crypt", GF_LOG_ERROR, "Can not set encrypt key");
+ return ret;
+ }
+ gctx = CRYPTO_gcm128_new(&EMTD_KEY, (block128_f)AES_encrypt);
+ if (!gctx) {
+ gf_log("crypt", GF_LOG_ERROR, "Can not alloc gcm context");
+ return ENOMEM;
+ }
+ CRYPTO_gcm128_setiv(gctx, info->oid, sizeof(uuid_t));
+
+ ad = htole32(MTD_LOADER_V1);
+ ret = CRYPTO_gcm128_aad(gctx, (const unsigned char *)&ad, sizeof(ad));
+ if (ret) {
+ gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_aad failed");
+ CRYPTO_gcm128_release(gctx);
+ return ret;
+ }
+ ret = CRYPTO_gcm128_decrypt(gctx,
+ get_EMTD_V1(fmt),
+ get_EMTD_V1(fmt),
+ SIZE_OF_EMTD_V1);
+ if (ret) {
+ gf_log("crypt", GF_LOG_ERROR, " CRYPTO_gcm128_decrypt failed");
+ CRYPTO_gcm128_release(gctx);
+ return ret;
+ }
+ /*
+ * verify metadata
+ */
+ CRYPTO_gcm128_tag(gctx, gmac, sizeof(gmac));
+ CRYPTO_gcm128_release(gctx);
+ if (memcmp(gmac, get_EMTD_V1_MAC(fmt), SIZE_OF_EMTD_V1_MAC)) {
+ gf_log("crypt", GF_LOG_ERROR, "EMTD verification failed");
+ return EINVAL;
+ }
+ /*
+ * load verified metadata to the private part of inode
+ */
+ info->nr_minor = fmt->minor_id;
+
+ object->o_alg = fmt->alg_id;
+ object->o_dkey_size = fmt->dkey_factor << KEY_FACTOR_BITS;
+ object->o_block_bits = fmt->block_bits;
+ object->o_mode = fmt->mode_id;
+
+ return check_file_metadata(info);
+}
+
+/*
+ * perform metadata authentication against @loc->path;
+ * extract crypt-specific attribtes and populate @info
+ * with them (optional)
+ */
+int32_t open_format(unsigned char *str,
+ int32_t len,
+ loc_t *loc,
+ struct crypt_inode_info *info,
+ struct master_cipher_info *master,
+ crypt_local_t *local,
+ gf_boolean_t load_info)
+{
+ struct crypt_format *fmt;
+ if (len < sizeof(*fmt)) {
+ gf_log("crypt", GF_LOG_ERROR, "Bad core format");
+ return EIO;
+ }
+ fmt = (struct crypt_format *)str;
+
+ if (fmt->loader_id >= LAST_MTD_LOADER) {
+ gf_log("crypt", GF_LOG_ERROR,
+ "Unsupported loader id %d", fmt->loader_id);
+ return EINVAL;
+ }
+ str += sizeof(*fmt);
+ len -= sizeof(*fmt);
+
+ return mtd_loaders[fmt->loader_id].open_format(str,
+ len,
+ loc,
+ info,
+ master,
+ local,
+ load_info);
+}
+
+struct crypt_mtd_loader mtd_loaders [LAST_MTD_LOADER] = {
+ [MTD_LOADER_V1] =
+ {.format_size = format_size_v1,
+ .create_format = create_format_v1,
+ .open_format = open_format_v1,
+ .update_format = update_format_v1
+ }
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff --git a/xlators/encryption/crypt/src/metadata.h b/xlators/encryption/crypt/src/metadata.h
new file mode 100644
index 000000000..a92f149ef
--- /dev/null
+++ b/xlators/encryption/crypt/src/metadata.h
@@ -0,0 +1,74 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __METADATA_H__
+#define __METADATA_H__
+
+#define NMTD_8_MAC_SIZE (8)
+#define EMTD_8_MAC_SIZE (8)
+
+typedef uint8_t nmtd_8_mac_t[NMTD_8_MAC_SIZE];
+typedef uint8_t emtd_8_mac_t[EMTD_8_MAC_SIZE] ;
+
+/*
+ * Version "v1" of file's metadata.
+ * Metadata of this version has 4 components:
+ *
+ * 1) EMTD (Encrypted part of MeTaData);
+ * 2) NMTD (Non-encrypted part of MeTaData);
+ * 3) EMTD_MAC; (EMTD Message Authentication Code);
+ * 4) Array of per-link NMTD MACs (for every (hard)link it includes
+ * exactly one MAC)
+ */
+struct mtd_format_v1 {
+ /* EMTD, encrypted part of meta-data */
+ uint8_t alg_id; /* cipher algorithm id (only AES for now) */
+ uint8_t mode_id; /* cipher mode id; (only XTS for now) */
+ uint8_t block_bits; /* encoded block size */
+ uint8_t minor_id; /* client translator id */
+ uint8_t dkey_factor; /* encoded size of the data key */
+ /* MACs */
+ emtd_8_mac_t gmac; /* MAC of the encrypted meta-data, 8 bytes */
+ nmtd_8_mac_t omac; /* per-link MACs of the non-encrypted
+ * meta-data: at least one such MAC is always
+ * present */
+} __attribute__((packed));
+
+/*
+ * NMTD, the non-encrypted part of metadata of version "v1"
+ * is file's gfid, which is generated on trusted machines.
+ */
+#define SIZE_OF_NMTD_V1 (sizeof(uuid_t))
+#define SIZE_OF_EMTD_V1 (offsetof(struct mtd_format_v1, gmac) - \
+ offsetof(struct mtd_format_v1, alg_id))
+#define SIZE_OF_NMTD_V1_MAC (NMTD_8_MAC_SIZE)
+#define SIZE_OF_EMTD_V1_MAC (EMTD_8_MAC_SIZE)
+
+static inline unsigned char *get_EMTD_V1(struct mtd_format_v1 *format)
+{
+ return &format->alg_id;
+}
+
+static inline unsigned char *get_NMTD_V1(struct crypt_inode_info *info)
+{
+ return info->oid;
+}
+
+static inline unsigned char *get_EMTD_V1_MAC(struct mtd_format_v1 *format)
+{
+ return format->gmac;
+}
+
+static inline unsigned char *get_NMTD_V1_MAC(struct mtd_format_v1 *format)
+{
+ return format->omac;
+}
+
+#endif /* __METADATA_H__ */
diff --git a/xlators/encryption/rot-13/src/Makefile.am b/xlators/encryption/rot-13/src/Makefile.am
index 91c9d93e2..94e8d18e7 100644
--- a/xlators/encryption/rot-13/src/Makefile.am
+++ b/xlators/encryption/rot-13/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = rot-13.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/encryption
-rot_13_la_LDFLAGS = -module -avoidversion
+rot_13_la_LDFLAGS = -module -avoid-version
rot_13_la_SOURCES = rot-13.c
rot_13_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/encryption/rot-13/src/rot-13.c b/xlators/encryption/rot-13/src/rot-13.c
index ba7d88a5f..1bcfe0192 100644
--- a/xlators/encryption/rot-13/src/rot-13.c
+++ b/xlators/encryption/rot-13/src/rot-13.c
@@ -22,13 +22,13 @@
#include "rot-13.h"
/*
- * This is a rot13 ``encryption'' xlator. It rot13's data when
- * writing to disk and rot13's it back when reading it.
+ * This is a rot13 ``encryption'' xlator. It rot13's data when
+ * writing to disk and rot13's it back when reading it.
* This xlator is meant as an example, NOT FOR PRODUCTION
* USE ;) (hence no error-checking)
*/
-void
+void
rot13 (char *buf, int len)
{
int i;
@@ -61,7 +61,7 @@ rot13_readv_cbk (call_frame_t *frame,
struct iobref *iobref, dict_t *xdata)
{
rot_13_private_t *priv = (rot_13_private_t *)this->private;
-
+
if (priv->decrypt_read)
rot13_iovec (vector, count);
@@ -128,7 +128,7 @@ init (xlator_t *this)
rot_13_private_t *priv = NULL;
if (!this->children || this->children->next) {
- gf_log ("rot13", GF_LOG_ERROR,
+ gf_log ("rot13", GF_LOG_ERROR,
"FATAL: rot13 should have exactly one child");
return -1;
}
@@ -150,6 +150,7 @@ init (xlator_t *this)
if (gf_string2boolean (data->data, &priv->encrypt_write) == -1) {
gf_log (this->name, GF_LOG_ERROR,
"encrypt-write takes only boolean options");
+ GF_FREE (priv);
return -1;
}
}
@@ -159,6 +160,7 @@ init (xlator_t *this)
if (gf_string2boolean (data->data, &priv->decrypt_read) == -1) {
gf_log (this->name, GF_LOG_ERROR,
"decrypt-read takes only boolean options");
+ GF_FREE (priv);
return -1;
}
}
@@ -186,15 +188,14 @@ struct xlator_fops fops = {
.writev = rot13_writev
};
-struct xlator_cbks cbks = {
-};
+struct xlator_cbks cbks;
struct volume_options options[] = {
- { .key = {"encrypt-write"},
+ { .key = {"encrypt-write"},
.type = GF_OPTION_TYPE_BOOL
},
- { .key = {"decrypt-read"},
- .type = GF_OPTION_TYPE_BOOL
+ { .key = {"decrypt-read"},
+ .type = GF_OPTION_TYPE_BOOL
},
{ .key = {NULL} },
};
diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am
index be48f93fa..1fdd474c2 100644
--- a/xlators/features/Makefile.am
+++ b/xlators/features/Makefile.am
@@ -1,3 +1,4 @@
-SUBDIRS = locks quota read-only mac-compat quiesce marker index # trash path-converter # filter
+SUBDIRS = locks quota read-only mac-compat quiesce marker index barrier \
+ protect compress changelog gfid-access $(GLUPY_SUBDIR) qemu-block # trash path-converter # filter
CLEANFILES =
diff --git a/xlators/features/barrier/Makefile.am b/xlators/features/barrier/Makefile.am
new file mode 100644
index 000000000..a985f42a8
--- /dev/null
+++ b/xlators/features/barrier/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/barrier/src/Makefile.am b/xlators/features/barrier/src/Makefile.am
new file mode 100644
index 000000000..8859be328
--- /dev/null
+++ b/xlators/features/barrier/src/Makefile.am
@@ -0,0 +1,16 @@
+xlator_LTLIBRARIES = barrier.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+barrier_la_LDFLAGS = -module -avoid-version
+
+barrier_la_SOURCES = barrier.c
+
+barrier_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = barrier.h barrier-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/barrier/src/barrier-mem-types.h b/xlators/features/barrier/src/barrier-mem-types.h
new file mode 100644
index 000000000..36647a669
--- /dev/null
+++ b/xlators/features/barrier/src/barrier-mem-types.h
@@ -0,0 +1,20 @@
+/*
+ Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BARRIER_MEM_TYPES_H__
+#define __BARRIER_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+enum gf_barrier_mem_types_ {
+ gf_barrier_mt_priv_t = gf_common_mt_end + 1,
+ gf_barrier_mt_end
+};
+#endif
diff --git a/xlators/features/barrier/src/barrier.c b/xlators/features/barrier/src/barrier.c
new file mode 100644
index 000000000..5edb9cdd3
--- /dev/null
+++ b/xlators/features/barrier/src/barrier.c
@@ -0,0 +1,658 @@
+/*
+ Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "barrier.h"
+#include "defaults.h"
+#include "call-stub.h"
+
+#include "statedump.h"
+
+int32_t
+barrier_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf,
+ dict_t *xdata)
+{
+ BARRIER_FOP_CBK (writev, out, frame, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+out:
+ return 0;
+}
+
+int32_t
+barrier_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ BARRIER_FOP_CBK (fremovexattr, out, frame, this, op_ret, op_errno,
+ xdata);
+out:
+ return 0;
+}
+
+int32_t
+barrier_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ BARRIER_FOP_CBK (removexattr, out, frame, this, op_ret, op_errno,
+ xdata);
+out:
+ return 0;
+}
+
+int32_t
+barrier_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ BARRIER_FOP_CBK (truncate, out, frame, this, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+out:
+ return 0;
+}
+
+int32_t
+barrier_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ BARRIER_FOP_CBK (ftruncate, out, frame, this, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+out:
+ return 0;
+}
+
+int32_t
+barrier_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
+{
+ BARRIER_FOP_CBK (rename, out, frame, this, op_ret, op_errno, buf,
+ preoldparent, postoldparent, prenewparent,
+ postnewparent, xdata);
+out:
+ return 0;
+}
+
+int32_t
+barrier_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ BARRIER_FOP_CBK (rmdir, out, frame, this, op_ret, op_errno, preparent,
+ postparent, xdata);
+out:
+ return 0;
+}
+
+int32_t
+barrier_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ BARRIER_FOP_CBK (unlink, out, frame, this, op_ret, op_errno, preparent,
+ postparent, xdata);
+out:
+ return 0;
+}
+
+int32_t
+barrier_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ BARRIER_FOP_CBK (fsync, out, frame, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+out:
+ return 0;
+}
+
+int32_t
+barrier_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t off, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
+{
+ if (!(flags & O_SYNC)) {
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev,
+ fd, vector, count, off, flags, iobref, xdata);
+
+ return 0;
+ }
+
+ STACK_WIND (frame, barrier_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count,
+ off, flags, iobref, xdata);
+ return 0;
+}
+
+int32_t
+barrier_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_fremovexattr_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fremovexattr,
+ fd, name, xdata);
+ return 0;
+}
+
+int32_t
+barrier_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_removexattr_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->removexattr,
+ loc, name, xdata);
+ return 0;
+}
+
+int32_t
+barrier_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_truncate_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->truncate,
+ loc, offset, xdata);
+ return 0;
+}
+
+int32_t
+barrier_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_rename_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->rename,
+ oldloc, newloc, xdata);
+ return 0;
+}
+
+int
+barrier_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_rmdir_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->rmdir,
+ loc, flags, xdata);
+ return 0;
+}
+
+int32_t
+barrier_unlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflag, dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_unlink_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->unlink,
+ loc, xflag, xdata);
+ return 0;
+}
+
+int32_t
+barrier_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_ftruncate_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->ftruncate,
+ fd, offset, xdata);
+ return 0;
+}
+
+int32_t
+barrier_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t flags, dict_t *xdata)
+{
+ STACK_WIND (frame, barrier_fsync_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsync,
+ fd, flags, xdata);
+ return 0;
+}
+
+call_stub_t *
+__barrier_dequeue (xlator_t *this, struct list_head *queue)
+{
+ call_stub_t *stub = NULL;
+ barrier_priv_t *priv = NULL;
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (list_empty (queue))
+ goto out;
+
+ stub = list_entry (queue->next, call_stub_t, list);
+ list_del_init (&stub->list);
+
+out:
+ return stub;
+}
+
+void
+barrier_dequeue_all (xlator_t *this, struct list_head *queue)
+{
+ call_stub_t *stub = NULL;
+
+ gf_log (this->name, GF_LOG_INFO, "Dequeuing all the barriered fops");
+
+ /* TODO: Start the below task in a new thread */
+ while ((stub = __barrier_dequeue (this, queue)))
+ call_resume (stub);
+
+ gf_log (this->name, GF_LOG_INFO, "Dequeuing the barriered fops is "
+ "finished");
+ return;
+}
+
+void
+barrier_timeout (void *data)
+{
+ xlator_t *this = NULL;
+ barrier_priv_t *priv = NULL;
+ struct list_head queue = {0,};
+
+ this = data;
+ THIS = this;
+ priv = this->private;
+
+ INIT_LIST_HEAD (&queue);
+
+ gf_log (this->name, GF_LOG_CRITICAL, "Disabling barrier because of "
+ "the barrier timeout.");
+
+ LOCK (&priv->lock);
+ {
+ __barrier_disable (this, &queue);
+ }
+ UNLOCK (&priv->lock);
+
+ barrier_dequeue_all (this, &queue);
+
+ return;
+}
+
+void
+__barrier_enqueue (xlator_t *this, call_stub_t *stub)
+{
+ barrier_priv_t *priv = NULL;
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ list_add_tail (&stub->list, &priv->queue);
+ priv->queue_size++;
+
+ return;
+}
+
+void
+__barrier_disable (xlator_t *this, struct list_head *queue)
+{
+ GF_UNUSED int ret = 0;
+ barrier_priv_t *priv = NULL;
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (priv->timer) {
+ ret = gf_timer_call_cancel (this->ctx, priv->timer);
+ priv->timer = NULL;
+ }
+
+ list_splice_init (&priv->queue, queue);
+ priv->queue_size = 0;
+ priv->barrier_enabled = _gf_false;
+}
+
+int
+__barrier_enable (xlator_t *this, barrier_priv_t *priv)
+{
+ int ret = -1;
+
+ priv->timer = gf_timer_call_after (this->ctx, priv->timeout,
+ barrier_timeout, (void *) this);
+ if (!priv->timer) {
+ gf_log (this->name, GF_LOG_CRITICAL, "Couldn't add barrier "
+ "timeout event.");
+ goto out;
+ }
+
+ priv->barrier_enabled = _gf_true;
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+notify (xlator_t *this, int event, void *data, ...)
+{
+ barrier_priv_t *priv = NULL;
+ dict_t *dict = NULL;
+ gf_boolean_t past = _gf_false;
+ int ret = -1;
+ gf_boolean_t barrier_enabled = _gf_false;
+ struct list_head queue = {0,};
+
+ priv = this->private;
+ GF_ASSERT (priv);
+ INIT_LIST_HEAD (&queue);
+
+ switch (event) {
+ case GF_EVENT_TRANSLATOR_OP:
+ {
+ dict = data;
+ GF_OPTION_RECONF ("barrier", barrier_enabled, dict,
+ bool, out);
+
+ LOCK (&priv->lock);
+ {
+ past = priv->barrier_enabled;
+
+ switch (past) {
+ case _gf_false:
+ if (barrier_enabled) {
+ ret = __barrier_enable (this,priv);
+ if (ret)
+ goto unlock;
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Already disabled.");
+ goto unlock;
+ }
+ break;
+
+ case _gf_true:
+ if (!barrier_enabled) {
+ __barrier_disable(this, &queue);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Already enabled");
+ goto unlock;
+ }
+ break;
+ }
+ ret = 0;
+ }
+unlock:
+ UNLOCK (&priv->lock);
+
+ if (!list_empty (&queue))
+ barrier_dequeue_all (this, &queue);
+ // missing break is intentional
+ }
+ default:
+ {
+ default_notify (this, event, data);
+ ret = 0;
+ goto out;
+ }
+ }
+out:
+ return ret;
+}
+
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+ barrier_priv_t *priv = NULL;
+ gf_boolean_t past = _gf_false;
+ int ret = -1;
+ gf_boolean_t barrier_enabled = _gf_false;
+ uint32_t timeout = {0,};
+ struct list_head queue = {0,};
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ GF_OPTION_RECONF ("barrier", barrier_enabled, options, bool, out);
+ GF_OPTION_RECONF ("barrier-timeout", timeout, options, time, out);
+
+ INIT_LIST_HEAD (&queue);
+
+ LOCK (&priv->lock);
+ {
+ past = priv->barrier_enabled;
+
+ switch (past) {
+ case _gf_false:
+ if (barrier_enabled) {
+ ret = __barrier_enable (this, priv);
+ if (ret) {
+ goto unlock;
+ }
+ }
+ break;
+
+ case _gf_true:
+ if (!barrier_enabled) {
+ __barrier_disable (this, &queue);
+
+ }
+ break;
+ }
+ priv->timeout.tv_sec = timeout;
+ ret = 0;
+ }
+unlock:
+ UNLOCK (&priv->lock);
+
+ if (!list_empty (&queue))
+ barrier_dequeue_all (this, &queue);
+
+out:
+ return ret;
+}
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ ret = xlator_mem_acct_init (this, gf_barrier_mt_end + 1);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Memory accounting "
+ "initialization failed.");
+
+ return ret;
+}
+
+int
+init (xlator_t *this)
+{
+ int ret = -1;
+ barrier_priv_t *priv = NULL;
+ uint32_t timeout = {0,};
+
+ if (!this->children || this->children->next) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "'barrier' not configured with exactly one child");
+ goto out;
+ }
+
+ if (!this->parents)
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
+
+ priv = GF_CALLOC (1, sizeof (*priv), gf_barrier_mt_priv_t);
+ if (!priv)
+ goto out;
+
+ LOCK_INIT (&priv->lock);
+
+ GF_OPTION_INIT ("barrier", priv->barrier_enabled, bool, out);
+ GF_OPTION_INIT ("barrier-timeout", timeout, time, out);
+ priv->timeout.tv_sec = timeout;
+
+ INIT_LIST_HEAD (&priv->queue);
+
+ if (priv->barrier_enabled) {
+ ret = __barrier_enable (this, priv);
+ if (ret == -1)
+ goto out;
+ }
+
+ this->private = priv;
+ ret = 0;
+out:
+ return ret;
+}
+
+void
+fini (xlator_t *this)
+{
+ barrier_priv_t *priv = NULL;
+ struct list_head queue = {0,};
+
+ priv = this->private;
+ if (!priv)
+ goto out;
+
+ INIT_LIST_HEAD (&queue);
+
+ gf_log (this->name, GF_LOG_INFO, "Disabling barriering and dequeuing "
+ "all the queued fops");
+ LOCK (&priv->lock);
+ {
+ __barrier_disable (this, &queue);
+ }
+ UNLOCK (&priv->lock);
+
+ if (!list_empty (&queue))
+ barrier_dequeue_all (this, &queue);
+
+ this->private = NULL;
+
+ LOCK_DESTROY (&priv->lock);
+ GF_FREE (priv);
+out:
+ return;
+}
+
+static void
+barrier_dump_stub (call_stub_t *stub, char *prefix)
+{
+ char key[GF_DUMP_MAX_BUF_LEN] = {0,};
+
+ gf_proc_dump_build_key (key, prefix, "fop");
+ gf_proc_dump_write (key, "%s", gf_fop_list[stub->fop]);
+
+ gf_proc_dump_build_key (key, prefix, "gfid");
+ gf_proc_dump_write (key, "%s", uuid_utoa (stub->args.loc.gfid));
+
+ if (stub->args.loc.path) {
+ gf_proc_dump_build_key (key, prefix, "path");
+ gf_proc_dump_write (key, "%s", stub->args.loc.path);
+ }
+ if (stub->args.loc.name) {
+ gf_proc_dump_build_key (key, prefix, "name");
+ gf_proc_dump_write (key, "%s", stub->args.loc.name);
+ }
+
+ return;
+}
+
+static void
+__barrier_dump_queue (barrier_priv_t *priv)
+{
+ call_stub_t *stub = NULL;
+ char key[GF_DUMP_MAX_BUF_LEN] = {0,};
+ int i = 0;
+
+ GF_VALIDATE_OR_GOTO ("barrier", priv, out);
+
+ list_for_each_entry (stub, &priv->queue, list) {
+ snprintf (key, sizeof (key), "stub.%d", i++);
+ gf_proc_dump_add_section (key);
+ barrier_dump_stub(stub, key);
+ }
+
+out:
+ return;
+}
+
+int
+barrier_dump_priv (xlator_t *this)
+{
+ int ret = -1;
+ char key[GF_DUMP_MAX_BUF_LEN] = {0,};
+ barrier_priv_t *priv = NULL;
+
+ GF_VALIDATE_OR_GOTO ("barrier", this, out);
+
+ priv = this->private;
+ if (!priv)
+ return 0;
+
+ gf_proc_dump_build_key (key, "xlator.features.barrier", "priv");
+ gf_proc_dump_add_section (key);
+
+ LOCK (&priv->lock);
+ {
+ gf_proc_dump_build_key (key, "barrier", "enabled");
+ gf_proc_dump_write (key, "%d", priv->barrier_enabled);
+ gf_proc_dump_build_key (key, "barrier", "timeout");
+ gf_proc_dump_write (key, "%"PRId64, priv->timeout.tv_sec);
+ if (priv->barrier_enabled) {
+ gf_proc_dump_build_key (key, "barrier", "queue_size");
+ gf_proc_dump_write (key, "%d", priv->queue_size);
+ __barrier_dump_queue (priv);
+ }
+ }
+ UNLOCK (&priv->lock);
+
+out:
+ return ret;
+}
+
+struct xlator_fops fops = {
+
+ /* Barrier Class fops */
+ .rmdir = barrier_rmdir,
+ .unlink = barrier_unlink,
+ .rename = barrier_rename,
+ .removexattr = barrier_removexattr,
+ .fremovexattr = barrier_fremovexattr,
+ .truncate = barrier_truncate,
+ .ftruncate = barrier_ftruncate,
+ .fsync = barrier_fsync,
+
+ /* Writes with only O_SYNC flag */
+ .writev = barrier_writev,
+};
+
+struct xlator_dumpops dumpops = {
+ .priv = barrier_dump_priv,
+};
+
+struct xlator_cbks cbks;
+
+struct volume_options options[] = {
+ { .key = {"barrier"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "When \"on\", blocks acknowledgements to application "
+ "for file operations such as rmdir, rename, unlink, "
+ "removexattr, fremovexattr, truncate, ftruncate, "
+ "write (with O_SYNC), fsync. It is turned \"off\" by "
+ "default."
+ },
+ { .key = {"barrier-timeout"},
+ .type = GF_OPTION_TYPE_TIME,
+ .default_value = "120",
+ .description = "After 'timeout' seconds since the time 'barrier' "
+ "option was set to \"on\", acknowledgements to file "
+ "operations are no longer blocked and previously "
+ "blocked acknowledgements are sent to the application"
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/features/barrier/src/barrier.h b/xlators/features/barrier/src/barrier.h
new file mode 100644
index 000000000..8face9f65
--- /dev/null
+++ b/xlators/features/barrier/src/barrier.h
@@ -0,0 +1,91 @@
+/*
+ Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __BARRIER_H__
+#define __BARRIER_H__
+
+#include "barrier-mem-types.h"
+#include "xlator.h"
+#include "timer.h"
+#include "call-stub.h"
+
+#define BARRIER_SAFE_ASSIGN(lock, to, value) \
+ do { \
+ LOCK (&(lock)); \
+ { \
+ to = value; \
+ } \
+ UNLOCK (&(lock)); \
+ } while (0)
+
+#define BARRIER_FOP_CBK(fop_name, label, frame, this, params ...) \
+ do { \
+ barrier_priv_t *_priv = NULL; \
+ call_stub_t *_stub = NULL; \
+ gf_boolean_t _barrier_enabled= _gf_false; \
+ struct list_head queue = {0, }; \
+ \
+ INIT_LIST_HEAD (&queue); \
+ \
+ _priv = this->private; \
+ GF_ASSERT (_priv); \
+ \
+ LOCK (&_priv->lock); \
+ { \
+ if (_priv->barrier_enabled) { \
+ _barrier_enabled = _priv->barrier_enabled;\
+ \
+ _stub = fop_##fop_name##_cbk_stub \
+ (frame, \
+ default_##fop_name##_cbk_resume,\
+ params); \
+ if (!_stub) { \
+ __barrier_disable (this, &queue);\
+ goto unlock; \
+ } \
+ \
+ __barrier_enqueue (this, _stub); \
+ } \
+ } \
+unlock: \
+ UNLOCK (&_priv->lock); \
+ \
+ if (_stub) \
+ goto label; \
+ \
+ if (_barrier_enabled && !_stub) { \
+ gf_log (this->name, GF_LOG_CRITICAL, \
+ "Failed to barrier FOPs, disabling " \
+ "barrier. FOP: %s, ERROR: %s", \
+ #fop_name, strerror (ENOMEM)); \
+ barrier_dequeue_all (this, &queue); \
+ } \
+ \
+ STACK_UNWIND_STRICT (fop_name, frame, params); \
+ goto label; \
+ } while (0)
+
+typedef struct {
+ gf_timer_t *timer;
+ gf_boolean_t barrier_enabled;
+ gf_lock_t lock;
+ struct list_head queue;
+ struct timespec timeout;
+ uint32_t queue_size;
+} barrier_priv_t;
+
+int __barrier_enable (xlator_t *this, barrier_priv_t *priv);
+void __barrier_enqueue (xlator_t *this, call_stub_t *stub);
+void __barrier_disable (xlator_t *this, struct list_head *queue);
+void barrier_timeout (void *data);
+void barrier_dequeue_all (xlator_t *this, struct list_head *queue);
+call_stub_t *__barrier_dequeue (xlator_t *this, struct list_head *queue);
+
+#endif
diff --git a/xlators/features/changelog/Makefile.am b/xlators/features/changelog/Makefile.am
new file mode 100644
index 000000000..153bb6850
--- /dev/null
+++ b/xlators/features/changelog/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src lib
+
+CLEANFILES =
diff --git a/xlators/features/changelog/lib/Makefile.am b/xlators/features/changelog/lib/Makefile.am
new file mode 100644
index 000000000..a985f42a8
--- /dev/null
+++ b/xlators/features/changelog/lib/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/changelog/lib/examples/c/get-changes.c b/xlators/features/changelog/lib/examples/c/get-changes.c
new file mode 100644
index 000000000..14562585a
--- /dev/null
+++ b/xlators/features/changelog/lib/examples/c/get-changes.c
@@ -0,0 +1,87 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+/**
+ * get set of new changes every 10 seconds (just print the file names)
+ *
+ * Compile it using:
+ * gcc -o getchanges `pkg-config --cflags libgfchangelog` get-changes.c \
+ * `pkg-config --libs libgfchangelog`
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/un.h>
+#include <limits.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <errno.h>
+
+#include "changelog.h"
+
+#define handle_error(fn) \
+ printf ("%s (reason: %s)\n", fn, strerror (errno))
+
+int
+main (int argc, char ** argv)
+{
+ int i = 0;
+ int ret = 0;
+ ssize_t nr_changes = 0;
+ ssize_t changes = 0;
+ char fbuf[PATH_MAX] = {0,};
+
+ /* get changes for brick "/home/vshankar/export/yow/yow-1" */
+ ret = gf_changelog_register ("/home/vshankar/export/yow/yow-1",
+ "/tmp/scratch", "/tmp/change.log", 9, 5);
+ if (ret) {
+ handle_error ("register failed");
+ goto out;
+ }
+
+ while (1) {
+ i = 0;
+ nr_changes = gf_changelog_scan ();
+ if (nr_changes < 0) {
+ handle_error ("scan(): ");
+ break;
+ }
+
+ if (nr_changes == 0)
+ goto next;
+
+ printf ("Got %ld changelog files\n", nr_changes);
+
+ while ( (changes =
+ gf_changelog_next_change (fbuf, PATH_MAX)) > 0) {
+ printf ("changelog file [%d]: %s\n", ++i, fbuf);
+
+ /* process changelog */
+ /* ... */
+ /* ... */
+ /* ... */
+ /* done processing */
+
+ ret = gf_changelog_done (fbuf);
+ if (ret)
+ handle_error ("gf_changelog_done");
+ }
+
+ if (changes == -1)
+ handle_error ("gf_changelog_next_change");
+
+ next:
+ sleep (10);
+ }
+
+ out:
+ return ret;
+}
diff --git a/xlators/features/changelog/lib/examples/python/changes.py b/xlators/features/changelog/lib/examples/python/changes.py
new file mode 100644
index 000000000..d21db8eab
--- /dev/null
+++ b/xlators/features/changelog/lib/examples/python/changes.py
@@ -0,0 +1,32 @@
+#!/usr/bin/python
+
+import os
+import sys
+import time
+import libgfchangelog
+
+cl = libgfchangelog.Changes()
+
+def get_changes(brick, scratch_dir, log_file, log_level, interval):
+ change_list = []
+ try:
+ cl.cl_register(brick, scratch_dir, log_file, log_level)
+ while True:
+ cl.cl_scan()
+ change_list = cl.cl_getchanges()
+ if change_list:
+ print change_list
+ for change in change_list:
+ print('done with %s' % (change))
+ cl.cl_done(change)
+ time.sleep(interval)
+ except OSError:
+ ex = sys.exc_info()[1]
+ print ex
+
+if __name__ == '__main__':
+ if len(sys.argv) != 5:
+ print("usage: %s <brick> <scratch-dir> <log-file> <fetch-interval>"
+ % (sys.argv[0]))
+ sys.exit(1)
+ get_changes(sys.argv[1], sys.argv[2], sys.argv[3], 9, int(sys.argv[4]))
diff --git a/xlators/features/changelog/lib/examples/python/libgfchangelog.py b/xlators/features/changelog/lib/examples/python/libgfchangelog.py
new file mode 100644
index 000000000..68ec3baf1
--- /dev/null
+++ b/xlators/features/changelog/lib/examples/python/libgfchangelog.py
@@ -0,0 +1,64 @@
+import os
+from ctypes import *
+from ctypes.util import find_library
+
+class Changes(object):
+ libgfc = CDLL(find_library("gfchangelog"), use_errno=True)
+
+ @classmethod
+ def geterrno(cls):
+ return get_errno()
+
+ @classmethod
+ def raise_oserr(cls):
+ errn = cls.geterrno()
+ raise OSError(errn, os.strerror(errn))
+
+ @classmethod
+ def _get_api(cls, call):
+ return getattr(cls.libgfc, call)
+
+ @classmethod
+ def cl_register(cls, brick, path, log_file, log_level, retries = 0):
+ ret = cls._get_api('gf_changelog_register')(brick, path,
+ log_file, log_level, retries)
+ if ret == -1:
+ cls.raise_oserr()
+
+ @classmethod
+ def cl_scan(cls):
+ ret = cls._get_api('gf_changelog_scan')()
+ if ret == -1:
+ cls.raise_oserr()
+
+ @classmethod
+ def cl_startfresh(cls):
+ ret = cls._get_api('gf_changelog_start_fresh')()
+ if ret == -1:
+ cls.raise_oserr()
+
+ @classmethod
+ def cl_getchanges(cls):
+ """ remove hardcoding for path name length """
+ def clsort(f):
+ return f.split('.')[-1]
+ changes = []
+ buf = create_string_buffer('\0', 4096)
+ call = cls._get_api('gf_changelog_next_change')
+
+ while True:
+ ret = call(buf, 4096)
+ if ret in (0, -1):
+ break;
+ changes.append(buf.raw[:ret-1])
+ if ret == -1:
+ cls.raise_oserr()
+ # cleanup tracker
+ cls.cl_startfresh()
+ return sorted(changes, key=clsort)
+
+ @classmethod
+ def cl_done(cls, clfile):
+ ret = cls._get_api('gf_changelog_done')(clfile)
+ if ret == -1:
+ cls.raise_oserr()
diff --git a/xlators/features/changelog/lib/src/Makefile.am b/xlators/features/changelog/lib/src/Makefile.am
new file mode 100644
index 000000000..28d5a70aa
--- /dev/null
+++ b/xlators/features/changelog/lib/src/Makefile.am
@@ -0,0 +1,38 @@
+libgfchangelog_la_CFLAGS = -Wall $(GF_CFLAGS) $(GF_DARWIN_LIBGLUSTERFS_CFLAGS) \
+ -DDATADIR=\"$(localstatedir)\"
+
+libgfchangelog_la_CPPFLAGS = $(GF_CPPFLAGS) -D__USE_FILE_OFFSET64 -fpic \
+ -I../../../src/ -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/xlators/features/changelog/src \
+ -DDATADIR=\"$(localstatedir)\"
+
+libgfchangelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+ $(GF_GLUSTERFS_LIBS)
+
+libgfchangelog_la_LDFLAGS = $(GF_LDFLAGS) -version-info $(LIBGFCHANGELOG_LT_VERSION)
+
+libgfchangelogdir = $(includedir)/glusterfs/gfchangelog
+lib_LTLIBRARIES = libgfchangelog.la
+
+CONTRIB_BUILDDIR = $(top_builddir)/contrib
+
+libgfchangelog_la_SOURCES = gf-changelog.c gf-changelog-process.c \
+ gf-changelog-helpers.c gf-history-changelog.c \
+ $(CONTRIBDIR)/uuid/clear.c \
+ $(CONTRIBDIR)/uuid/copy.c $(CONTRIBDIR)/uuid/gen_uuid.c \
+ $(CONTRIBDIR)/uuid/pack.c $(CONTRIBDIR)/uuid/parse.c \
+ $(CONTRIBDIR)/uuid/unparse.c $(CONTRIBDIR)/uuid/uuid_time.c \
+ $(CONTRIBDIR)/uuid/compare.c $(CONTRIBDIR)/uuid/isnull.c \
+ $(CONTRIBDIR)/uuid/unpack.c
+
+noinst_HEADERS = gf-changelog-helpers.h $(CONTRIBDIR)/uuid/uuidd.h \
+ $(CONTRIBDIR)/uuid/uuid.h $(CONTRIBDIR)/uuid/uuidP.h \
+ $(CONTRIB_BUILDDIR)/uuid/uuid_types.h
+
+libgfchangelog_HEADERS = changelog.h
+
+CLEANFILES =
+CONFIG_CLEAN_FILES = $(CONTRIB_BUILDDIR)/uuid/uuid_types.h
+
+$(top_builddir)/libglusterfs/src/libglusterfs.la:
+ $(MAKE) -C $(top_builddir)/libglusterfs/src/ all
diff --git a/xlators/features/changelog/lib/src/changelog.h b/xlators/features/changelog/lib/src/changelog.h
new file mode 100644
index 000000000..5cddfb583
--- /dev/null
+++ b/xlators/features/changelog/lib/src/changelog.h
@@ -0,0 +1,31 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GF_CHANGELOG_H
+#define _GF_CHANGELOG_H
+
+/* API set */
+
+int
+gf_changelog_register (char *brick_path, char *scratch_dir,
+ char *log_file, int log_levl, int max_reconnects);
+ssize_t
+gf_changelog_scan ();
+
+int
+gf_changelog_start_fresh ();
+
+ssize_t
+gf_changelog_next_change (char *bufptr, size_t maxlen);
+
+int
+gf_changelog_done (char *file);
+
+#endif
diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.c b/xlators/features/changelog/lib/src/gf-changelog-helpers.c
new file mode 100644
index 000000000..1eef8bf04
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.c
@@ -0,0 +1,180 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "changelog-mem-types.h"
+#include "gf-changelog-helpers.h"
+
+ssize_t gf_changelog_read_path (int fd, char *buffer, size_t bufsize)
+{
+ return read (fd, buffer, bufsize);
+}
+
+size_t
+gf_changelog_write (int fd, char *buffer, size_t len)
+{
+ ssize_t size = 0;
+ size_t writen = 0;
+
+ while (writen < len) {
+ size = write (fd,
+ buffer + writen, len - writen);
+ if (size <= 0)
+ break;
+
+ writen += size;
+ }
+
+ return writen;
+}
+
+void
+gf_rfc3986_encode (unsigned char *s, char *enc, char *estr)
+{
+ for (; *s; s++) {
+ if (estr[*s])
+ sprintf(enc, "%c", estr[*s]);
+ else
+ sprintf(enc, "%%%02X", *s);
+ while (*++enc);
+ }
+}
+
+/**
+ * thread safe version of readline with buffering
+ * (taken from Unix Network Programming Volume I, W.R. Stevens)
+ *
+ * This is favoured over fgets() as we'd need to ftruncate()
+ * (see gf_changelog_scan() API) to record new changelog files.
+ * stream open functions does have a truncate like api (although
+ * that can be done via @fflush(fp), @ftruncate(fd) and @fseek(fp),
+ * but this involves mixing POSIX file descriptors and stream FILE *).
+ *
+ * NOTE: This implmentation still does work with more than one fd's
+ * used to perform gf_readline(). For this very reason it's not
+ * made a part of libglusterfs.
+ */
+
+static pthread_key_t rl_key;
+static pthread_once_t rl_once = PTHREAD_ONCE_INIT;
+
+static void
+readline_destructor (void *ptr)
+{
+ GF_FREE (ptr);
+}
+
+static void
+readline_once (void)
+{
+ pthread_key_create (&rl_key, readline_destructor);
+}
+
+static ssize_t
+my_read (read_line_t *tsd, int fd, char *ptr)
+{
+ if (tsd->rl_cnt <= 0) {
+ if ( (tsd->rl_cnt = read (fd, tsd->rl_buf, MAXLINE)) < 0 )
+ return -1;
+ else if (tsd->rl_cnt == 0)
+ return 0;
+ tsd->rl_bufptr = tsd->rl_buf;
+ }
+
+ tsd->rl_cnt--;
+ *ptr = *tsd->rl_bufptr++;
+ return 1;
+}
+
+static int
+gf_readline_init_once (read_line_t **tsd)
+{
+ if (pthread_once (&rl_once, readline_once) != 0)
+ return -1;
+
+ *tsd = pthread_getspecific (rl_key);
+ if (*tsd)
+ goto out;
+
+ *tsd = GF_CALLOC (1, sizeof (**tsd),
+ gf_changelog_mt_libgfchangelog_rl_t);
+ if (!*tsd)
+ return -1;
+
+ if (pthread_setspecific (rl_key, *tsd) != 0)
+ return -1;
+
+ out:
+ return 0;
+}
+
+ssize_t
+gf_readline (int fd, void *vptr, size_t maxlen)
+{
+ size_t n = 0;
+ size_t rc = 0;
+ char c = ' ';
+ char *ptr = NULL;
+ read_line_t *tsd = NULL;
+
+ if (gf_readline_init_once (&tsd))
+ return -1;
+
+ ptr = vptr;
+ for (n = 1; n < maxlen; n++) {
+ if ( (rc = my_read (tsd, fd, &c)) == 1 ) {
+ *ptr++ = c;
+ if (c == '\n')
+ break;
+ } else if (rc == 0) {
+ *ptr = '\0';
+ return (n - 1);
+ } else
+ return -1;
+ }
+
+ *ptr = '\0';
+ return n;
+
+}
+
+off_t
+gf_lseek (int fd, off_t offset, int whence)
+{
+ off_t off = 0;
+ read_line_t *tsd = NULL;
+
+ if (gf_readline_init_once (&tsd))
+ return -1;
+
+ if ( (off = lseek (fd, offset, whence)) == -1)
+ return -1;
+
+ tsd->rl_cnt = 0;
+ tsd->rl_bufptr = tsd->rl_buf;
+
+ return off;
+}
+
+int
+gf_ftruncate (int fd, off_t length)
+{
+ read_line_t *tsd = NULL;
+
+ if (gf_readline_init_once (&tsd))
+ return -1;
+
+ if (ftruncate (fd, 0))
+ return -1;
+
+ tsd->rl_cnt = 0;
+ tsd->rl_bufptr = tsd->rl_buf;
+
+ return 0;
+}
diff --git a/xlators/features/changelog/lib/src/gf-changelog-helpers.h b/xlators/features/changelog/lib/src/gf-changelog-helpers.h
new file mode 100644
index 000000000..fa0edabf0
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog-helpers.h
@@ -0,0 +1,102 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GF_CHANGELOG_HELPERS_H
+#define _GF_CHANGELOG_HELPERS_H
+
+#include <unistd.h>
+#include <dirent.h>
+#include <limits.h>
+#include <pthread.h>
+
+#include <xlator.h>
+
+#define GF_CHANGELOG_TRACKER "tracker"
+
+#define GF_CHANGELOG_CURRENT_DIR ".current"
+#define GF_CHANGELOG_PROCESSED_DIR ".processed"
+#define GF_CHANGELOG_PROCESSING_DIR ".processing"
+#define GF_CHANGELOG_HISTORY_DIR ".history"
+
+#ifndef MAXLINE
+#define MAXLINE 4096
+#endif
+
+#define GF_CHANGELOG_FILL_BUFFER(ptr, ascii, off, len) do { \
+ memcpy (ascii + off, ptr, len); \
+ off += len; \
+ } while (0)
+
+typedef struct read_line {
+ int rl_cnt;
+ char *rl_bufptr;
+ char rl_buf[MAXLINE];
+} read_line_t;
+
+typedef struct gf_changelog {
+ xlator_t *this;
+
+ /* 'processing' directory stream */
+ DIR *gfc_dir;
+
+ /* fd to the tracker file */
+ int gfc_fd;
+
+ /* connection retries */
+ int gfc_connretries;
+
+ char gfc_sockpath[UNIX_PATH_MAX];
+
+ char gfc_brickpath[PATH_MAX];
+
+ /* socket for recieving notifications */
+ int gfc_sockfd;
+
+ char *gfc_working_dir;
+
+ /* RFC 3986 string encoding */
+ char rfc3986[256];
+
+ char gfc_current_dir[PATH_MAX];
+ char gfc_processed_dir[PATH_MAX];
+ char gfc_processing_dir[PATH_MAX];
+
+ pthread_t gfc_changelog_processor;
+
+ /* Holds gfc for History API */
+ struct gf_changelog *hist_gfc;
+} gf_changelog_t;
+
+int
+gf_changelog_notification_init (xlator_t *this, gf_changelog_t *gfc);
+
+void *
+gf_changelog_process (void *data);
+
+ssize_t
+gf_changelog_read_path (int fd, char *buffer, size_t bufsize);
+
+void
+gf_rfc3986_encode (unsigned char *s, char *enc, char *estr);
+
+size_t
+gf_changelog_write (int fd, char *buffer, size_t len);
+
+ssize_t
+gf_readline (int fd, void *vptr, size_t maxlen);
+
+int
+gf_ftruncate (int fd, off_t length);
+
+off_t
+gf_lseek (int fd, off_t offset, int whence);
+
+
+#endif
diff --git a/xlators/features/changelog/lib/src/gf-changelog-process.c b/xlators/features/changelog/lib/src/gf-changelog-process.c
new file mode 100644
index 000000000..df7204931
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog-process.c
@@ -0,0 +1,571 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <unistd.h>
+#include <pthread.h>
+
+#include "uuid.h"
+#include "globals.h"
+#include "glusterfs.h"
+
+#include "gf-changelog-helpers.h"
+
+/* from the changelog translator */
+#include "changelog-misc.h"
+
+extern int byebye;
+
+/**
+ * number of gfid records after fop number
+ */
+int nr_gfids[] = {
+ [GF_FOP_MKNOD] = 1,
+ [GF_FOP_MKDIR] = 1,
+ [GF_FOP_UNLINK] = 1,
+ [GF_FOP_RMDIR] = 1,
+ [GF_FOP_SYMLINK] = 1,
+ [GF_FOP_RENAME] = 2,
+ [GF_FOP_LINK] = 1,
+ [GF_FOP_CREATE] = 1,
+};
+
+static char *
+binary_to_ascii (uuid_t uuid)
+{
+ return uuid_utoa (uuid);
+}
+
+static char *
+conv_noop (char *ptr) { return ptr; }
+
+#define VERIFY_SEPARATOR(ptr, plen, perr) \
+ { \
+ if (*(ptr + plen) != '\0') { \
+ perr = 1; \
+ break; \
+ } \
+ }
+
+#define MOVER_MOVE(mover, nleft, bytes) \
+ { \
+ mover += bytes; \
+ nleft -= bytes; \
+ } \
+
+#define PARSE_GFID(mov, ptr, le, fn, perr) \
+ { \
+ VERIFY_SEPARATOR (mov, le, perr); \
+ ptr = fn (mov); \
+ if (!ptr) { \
+ perr = 1; \
+ break; \
+ } \
+ }
+
+#define FILL_AND_MOVE(pt, buf, of, mo, nl, le) \
+ { \
+ GF_CHANGELOG_FILL_BUFFER (pt, buf, of, strlen (pt)); \
+ MOVER_MOVE (mo, nl, le); \
+ }
+
+
+#define PARSE_GFID_MOVE(ptr, uuid, mover, nleft, perr) \
+ { \
+ memcpy (uuid, mover, sizeof (uuid_t)); \
+ ptr = binary_to_ascii (uuid); \
+ if (!ptr) { \
+ perr = 1; \
+ break; \
+ } \
+ MOVER_MOVE (mover, nleft, sizeof (uuid_t)); \
+ } \
+
+#define LINE_BUFSIZE 3*PATH_MAX /* enough buffer for extra chars too */
+
+/**
+ * using mmap() makes parsing easy. fgets() cannot be used here as
+ * the binary gfid could contain a line-feed (0x0A), in that case fgets()
+ * would read an incomplete line and parsing would fail. using POSIX fds
+ * would result is additional code to maintain state in case of partial
+ * reads of data (where multiple entries do not fit extirely in the buffer).
+ *
+ * mmap() gives the flexibility of pointing to an offset in the file
+ * without us worrying about reading it in memory (VM does that for us for
+ * free).
+ */
+
+static int
+gf_changelog_parse_binary (xlator_t *this,
+ gf_changelog_t *gfc, int from_fd, int to_fd,
+ size_t start_offset, struct stat *stbuf)
+
+{
+ int ret = -1;
+ off_t off = 0;
+ off_t nleft = 0;
+ uuid_t uuid = {0,};
+ char *ptr = NULL;
+ char *bname_start = NULL;
+ char *bname_end = NULL;
+ char *mover = NULL;
+ char *start = NULL;
+ char current_mover = ' ';
+ size_t blen = 0;
+ int parse_err = 0;
+ char ascii[LINE_BUFSIZE] = {0,};
+
+ nleft = stbuf->st_size;
+
+ start = (char *) mmap (NULL, nleft,
+ PROT_READ, MAP_PRIVATE, from_fd, 0);
+ if (!start) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "mmap() error (reason: %s)", strerror (errno));
+ goto out;
+ }
+
+ mover = start;
+
+ MOVER_MOVE (mover, nleft, start_offset);
+
+ while (nleft > 0) {
+
+ off = blen = 0;
+ ptr = bname_start = bname_end = NULL;
+
+ current_mover = *mover;
+
+ switch (current_mover) {
+ case 'D':
+ case 'M':
+ MOVER_MOVE (mover, nleft, 1);
+ PARSE_GFID_MOVE (ptr, uuid, mover, nleft, parse_err);
+
+ break;
+
+ case 'E':
+ MOVER_MOVE (mover, nleft, 1);
+ PARSE_GFID_MOVE (ptr, uuid, mover, nleft, parse_err);
+
+ bname_start = mover;
+ if ( (bname_end = strchr (mover, '\n')) == NULL ) {
+ parse_err = 1;
+ break;
+ }
+
+ blen = bname_end - bname_start;
+ MOVER_MOVE (mover, nleft, blen);
+
+ break;
+
+ default:
+ parse_err = 1;
+ }
+
+ if (parse_err)
+ break;
+
+ GF_CHANGELOG_FILL_BUFFER (&current_mover, ascii, off, 1);
+ GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1);
+ GF_CHANGELOG_FILL_BUFFER (ptr, ascii, off, strlen (ptr));
+ if (blen)
+ GF_CHANGELOG_FILL_BUFFER (bname_start,
+ ascii, off, blen);
+ GF_CHANGELOG_FILL_BUFFER ("\n", ascii, off, 1);
+
+ if (gf_changelog_write (to_fd, ascii, off) != off) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "processing binary changelog failed due to "
+ " error in writing ascii change (reason: %s)",
+ strerror (errno));
+ break;
+ }
+
+ MOVER_MOVE (mover, nleft, 1);
+ }
+
+ if ( (nleft == 0) && (!parse_err))
+ ret = 0;
+
+ if (munmap (start, stbuf->st_size))
+ gf_log (this->name, GF_LOG_ERROR,
+ "munmap() error (reason: %s)", strerror (errno));
+ out:
+ return ret;
+}
+
+/**
+ * ascii decoder:
+ * - separate out one entry from another
+ * - use fop name rather than fop number
+ */
+static int
+gf_changelog_parse_ascii (xlator_t *this,
+ gf_changelog_t *gfc, int from_fd, int to_fd,
+ size_t start_offset, struct stat *stbuf)
+{
+ int ng = 0;
+ int ret = -1;
+ int fop = 0;
+ int len = 0;
+ off_t off = 0;
+ off_t nleft = 0;
+ char *ptr = NULL;
+ char *eptr = NULL;
+ char *start = NULL;
+ char *mover = NULL;
+ int parse_err = 0;
+ char current_mover = ' ';
+ char ascii[LINE_BUFSIZE] = {0,};
+ const char *fopname = NULL;
+
+ nleft = stbuf->st_size;
+
+ start = (char *) mmap (NULL, nleft,
+ PROT_READ, MAP_PRIVATE, from_fd, 0);
+ if (!start) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "mmap() error (reason: %s)", strerror (errno));
+ goto out;
+ }
+
+ mover = start;
+
+ MOVER_MOVE (mover, nleft, start_offset);
+
+ while (nleft > 0) {
+ off = 0;
+ current_mover = *mover;
+
+ GF_CHANGELOG_FILL_BUFFER (&current_mover, ascii, off, 1);
+ GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1);
+
+ switch (current_mover) {
+ case 'D':
+ case 'M':
+ MOVER_MOVE (mover, nleft, 1);
+
+ /* target gfid */
+ PARSE_GFID (mover, ptr, UUID_CANONICAL_FORM_LEN,
+ conv_noop, parse_err);
+ FILL_AND_MOVE(ptr, ascii, off,
+ mover, nleft, UUID_CANONICAL_FORM_LEN);
+ break;
+
+ case 'E':
+ MOVER_MOVE (mover, nleft, 1);
+
+ /* target gfid */
+ PARSE_GFID (mover, ptr, UUID_CANONICAL_FORM_LEN,
+ conv_noop, parse_err);
+ FILL_AND_MOVE (ptr, ascii, off,
+ mover, nleft, UUID_CANONICAL_FORM_LEN);
+ FILL_AND_MOVE (" ", ascii, off,
+ mover, nleft, 1);
+
+ /* fop */
+ len = strlen (mover);
+ VERIFY_SEPARATOR (mover, len, parse_err);
+
+ fop = atoi (mover);
+ if ( (fopname = gf_fop_list[fop]) == NULL) {
+ parse_err = 1;
+ break;
+ }
+
+ MOVER_MOVE (mover, nleft, len);
+
+ len = strlen (fopname);
+ GF_CHANGELOG_FILL_BUFFER (fopname, ascii, off, len);
+
+ /* pargfid + bname */
+ ng = nr_gfids[fop];
+ while (ng-- > 0) {
+ MOVER_MOVE (mover, nleft, 1);
+ len = strlen (mover);
+ GF_CHANGELOG_FILL_BUFFER (" ", ascii, off, 1);
+
+ PARSE_GFID (mover, ptr, len,
+ conv_noop, parse_err);
+ eptr = calloc (3, strlen (ptr));
+ if (!eptr) {
+ parse_err = 1;
+ break;
+ }
+
+ gf_rfc3986_encode ((unsigned char *) ptr,
+ eptr, gfc->rfc3986);
+ FILL_AND_MOVE (eptr, ascii, off,
+ mover, nleft, len);
+ free (eptr);
+ }
+
+ break;
+ default:
+ parse_err = 1;
+ }
+
+ if (parse_err)
+ break;
+
+ GF_CHANGELOG_FILL_BUFFER ("\n", ascii, off, 1);
+
+ if (gf_changelog_write (to_fd, ascii, off) != off) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "processing ascii changelog failed due to "
+ " wrror in writing change (reason: %s)",
+ strerror (errno));
+ break;
+ }
+
+ MOVER_MOVE (mover, nleft, 1);
+
+ }
+
+ if ( (nleft == 0) && (!parse_err))
+ ret = 0;
+
+ if (munmap (start, stbuf->st_size))
+ gf_log (this->name, GF_LOG_ERROR,
+ "munmap() error (reason: %s)", strerror (errno));
+
+ out:
+ return ret;
+}
+
+#define COPY_BUFSIZE 8192
+static int
+gf_changelog_copy (xlator_t *this, int from_fd, int to_fd)
+{
+ ssize_t size = 0;
+ char buffer[COPY_BUFSIZE+1] = {0,};
+
+ while (1) {
+ size = read (from_fd, buffer, COPY_BUFSIZE);
+ if (size <= 0)
+ break;
+
+ if (gf_changelog_write (to_fd,
+ buffer, size) != size) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "error processing ascii changlog");
+ size = -1;
+ break;
+ }
+ }
+
+ return (size < 0 ? -1 : 0);
+}
+
+static int
+gf_changelog_decode (xlator_t *this, gf_changelog_t *gfc, int from_fd,
+ int to_fd, struct stat *stbuf, int *zerob)
+{
+ int ret = -1;
+ int encoding = -1;
+ size_t elen = 0;
+ char buffer[1024] = {0,};
+
+ CHANGELOG_GET_ENCODING (from_fd, buffer, 1024, encoding, elen);
+ if (encoding == -1) /* unknown encoding */
+ goto out;
+
+ if (!CHANGELOG_VALID_ENCODING (encoding))
+ goto out;
+
+ if (elen == stbuf->st_size) {
+ *zerob = 1;
+ goto out;
+ }
+
+ /**
+ * start processing after the header
+ */
+ lseek (from_fd, elen, SEEK_SET);
+
+ switch (encoding) {
+ case CHANGELOG_ENCODE_BINARY:
+ /**
+ * this ideally should have been a part of changelog-encoders.c
+ * (ie. part of the changelog translator).
+ */
+ ret = gf_changelog_parse_binary (this, gfc, from_fd,
+ to_fd, elen, stbuf);
+ break;
+
+ case CHANGELOG_ENCODE_ASCII:
+ ret = gf_changelog_parse_ascii (this, gfc, from_fd,
+ to_fd, elen, stbuf);
+ break;
+ default:
+ ret = gf_changelog_copy (this, from_fd, to_fd);
+ }
+
+ out:
+ return ret;
+}
+
+static int
+gf_changelog_consume (xlator_t *this, gf_changelog_t *gfc, char *from_path)
+{
+ int ret = -1;
+ int fd1 = 0;
+ int fd2 = 0;
+ int zerob = 0;
+ struct stat stbuf = {0,};
+ char dest[PATH_MAX] = {0,};
+ char to_path[PATH_MAX] = {0,};
+
+ ret = stat (from_path, &stbuf);
+ if (ret || !S_ISREG(stbuf.st_mode)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "stat failed on changelog file: %s", from_path);
+ goto out;
+ }
+
+ fd1 = open (from_path, O_RDONLY);
+ if (fd1 < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "cannot open changelog file: %s (reason: %s)",
+ from_path, strerror (errno));
+ goto out;
+ }
+
+ (void) snprintf (to_path, PATH_MAX, "%s%s",
+ gfc->gfc_current_dir, basename (from_path));
+ (void) snprintf (dest, PATH_MAX, "%s%s",
+ gfc->gfc_processing_dir, basename (from_path));
+
+ fd2 = open (to_path, O_CREAT | O_TRUNC | O_RDWR,
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+ if (fd2 < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "cannot create ascii changelog file %s (reason %s)",
+ to_path, strerror (errno));
+ goto close_fd;
+ } else {
+ ret = gf_changelog_decode (this, gfc, fd1,
+ fd2, &stbuf, &zerob);
+
+ close (fd2);
+
+ if (!ret) {
+ /* move it to processing on a successfull
+ decode */
+ ret = rename (to_path, dest);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "error moving %s to processing dir"
+ " (reason: %s)", to_path,
+ strerror (errno));
+ }
+
+ /* remove it from .current if it's an empty file */
+ if (zerob) {
+ ret = unlink (to_path);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not unlink %s (reason: %s",
+ to_path, strerror (errno));
+ }
+ }
+
+ close_fd:
+ close (fd1);
+
+ out:
+ return ret;
+}
+
+static char *
+gf_changelog_ext_change (xlator_t *this,
+ gf_changelog_t *gfc, char *path, size_t readlen)
+{
+ int alo = 0;
+ int ret = 0;
+ size_t len = 0;
+ char *buf = NULL;
+
+ buf = path;
+ while (len < readlen) {
+ if (*buf == '\0') {
+ alo = 1;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "processing changelog: %s", path);
+ ret = gf_changelog_consume (this, gfc, path);
+ }
+
+ if (ret)
+ break;
+
+ len++; buf++;
+ if (alo) {
+ alo = 0;
+ path = buf;
+ }
+ }
+
+ return (ret) ? NULL : path;
+}
+
+void *
+gf_changelog_process (void *data)
+{
+ ssize_t len = 0;
+ ssize_t offlen = 0;
+ xlator_t *this = NULL;
+ char *sbuf = NULL;
+ gf_changelog_t *gfc = NULL;
+ char from_path[PATH_MAX] = {0,};
+
+ gfc = (gf_changelog_t *) data;
+ this = gfc->this;
+
+ pthread_detach (pthread_self());
+
+ for (;;) {
+ len = gf_changelog_read_path (gfc->gfc_sockfd,
+ from_path + offlen,
+ PATH_MAX - offlen);
+ if (len < 0)
+ continue; /* ignore it for now */
+
+ if (len == 0) { /* close() from the changelog translator */
+ gf_log (this->name, GF_LOG_INFO, "close from changelog"
+ " notification translator.");
+
+ if (gfc->gfc_connretries != 1) {
+ if (!gf_changelog_notification_init(this, gfc))
+ continue;
+ }
+
+ byebye = 1;
+ break;
+ }
+
+ len += offlen;
+ sbuf = gf_changelog_ext_change (this, gfc, from_path, len);
+ if (!sbuf) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not extract changelog filename");
+ continue;
+ }
+
+ offlen = 0;
+ if (sbuf != (from_path + len)) {
+ offlen = from_path + len - sbuf;
+ memmove (from_path, sbuf, offlen);
+ }
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "byebye (%d) from processing thread...", byebye);
+ return NULL;
+}
diff --git a/xlators/features/changelog/lib/src/gf-changelog.c b/xlators/features/changelog/lib/src/gf-changelog.c
new file mode 100644
index 000000000..0827f2cac
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-changelog.c
@@ -0,0 +1,571 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <errno.h>
+#include <dirent.h>
+#include <stddef.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <string.h>
+
+#include "globals.h"
+#include "glusterfs.h"
+#include "logging.h"
+
+#include "gf-changelog-helpers.h"
+
+/* from the changelog translator */
+#include "changelog-misc.h"
+#include "changelog-mem-types.h"
+
+int byebye = 0;
+
+static void
+gf_changelog_cleanup (gf_changelog_t *gfc)
+{
+ /* socket */
+ if (gfc->gfc_sockfd != -1)
+ close (gfc->gfc_sockfd);
+ /* tracker fd */
+ if (gfc->gfc_fd != -1)
+ close (gfc->gfc_fd);
+ /* processing dir */
+ if (gfc->gfc_dir)
+ closedir (gfc->gfc_dir);
+
+ if (gfc->gfc_working_dir)
+ free (gfc->gfc_working_dir); /* allocated by realpath */
+}
+
+void
+__attribute__ ((constructor)) gf_changelog_ctor (void)
+{
+ glusterfs_ctx_t *ctx = NULL;
+
+ ctx = glusterfs_ctx_new ();
+ if (!ctx)
+ return;
+
+ if (glusterfs_globals_init (ctx)) {
+ free (ctx);
+ ctx = NULL;
+ return;
+ }
+
+ THIS->ctx = ctx;
+}
+
+void
+__attribute__ ((destructor)) gf_changelog_dtor (void)
+{
+ xlator_t *this = NULL;
+ glusterfs_ctx_t *ctx = NULL;
+ gf_changelog_t *gfc = NULL;
+
+ this = THIS;
+ if (!this)
+ return;
+
+ ctx = this->ctx;
+ gfc = this->private;
+
+ if (gfc) {
+ if (gfc->hist_gfc) {
+ gf_changelog_cleanup(gfc->hist_gfc);
+ GF_FREE (gfc->hist_gfc);
+ }
+ gf_changelog_cleanup (gfc);
+ GF_FREE (gfc);
+ }
+
+ if (ctx) {
+ pthread_mutex_destroy (&ctx->lock);
+ free (ctx);
+ ctx = NULL;
+ }
+}
+
+
+static int
+gf_changelog_open_dirs (gf_changelog_t *gfc)
+{
+ int ret = -1;
+ DIR *dir = NULL;
+ int tracker_fd = 0;
+ char tracker_path[PATH_MAX] = {0,};
+
+ (void) snprintf (gfc->gfc_current_dir, PATH_MAX,
+ "%s/"GF_CHANGELOG_CURRENT_DIR"/",
+ gfc->gfc_working_dir);
+ ret = mkdir_p (gfc->gfc_current_dir, 0600, _gf_false);
+ if (ret)
+ goto out;
+
+ (void) snprintf (gfc->gfc_processed_dir, PATH_MAX,
+ "%s/"GF_CHANGELOG_PROCESSED_DIR"/",
+ gfc->gfc_working_dir);
+ ret = mkdir_p (gfc->gfc_processed_dir, 0600, _gf_false);
+ if (ret)
+ goto out;
+
+ (void) snprintf (gfc->gfc_processing_dir, PATH_MAX,
+ "%s/"GF_CHANGELOG_PROCESSING_DIR"/",
+ gfc->gfc_working_dir);
+ ret = mkdir_p (gfc->gfc_processing_dir, 0600, _gf_false);
+ if (ret)
+ goto out;
+
+ dir = opendir (gfc->gfc_processing_dir);
+ if (!dir) {
+ gf_log ("", GF_LOG_ERROR,
+ "opendir() error [reason: %s]", strerror (errno));
+ goto out;
+ }
+
+ gfc->gfc_dir = dir;
+
+ (void) snprintf (tracker_path, PATH_MAX,
+ "%s/"GF_CHANGELOG_TRACKER, gfc->gfc_working_dir);
+
+ tracker_fd = open (tracker_path, O_CREAT | O_APPEND | O_RDWR,
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+ if (tracker_fd < 0) {
+ closedir (gfc->gfc_dir);
+ ret = -1;
+ goto out;
+ }
+
+ gfc->gfc_fd = tracker_fd;
+ ret = 0;
+ out:
+ return ret;
+}
+
+int
+gf_changelog_notification_init (xlator_t *this, gf_changelog_t *gfc)
+{
+ int ret = 0;
+ int len = 0;
+ int tries = 0;
+ int sockfd = 0;
+ struct sockaddr_un remote;
+
+ this = gfc->this;
+
+ if (gfc->gfc_sockfd != -1) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Reconnecting...");
+ close (gfc->gfc_sockfd);
+ }
+
+ sockfd = socket (AF_UNIX, SOCK_STREAM, 0);
+ if (sockfd < 0) {
+ ret = -1;
+ goto out;
+ }
+
+ CHANGELOG_MAKE_SOCKET_PATH (gfc->gfc_brickpath,
+ gfc->gfc_sockpath, UNIX_PATH_MAX);
+ gf_log (this->name, GF_LOG_INFO,
+ "connecting to changelog socket: %s (brick: %s)",
+ gfc->gfc_sockpath, gfc->gfc_brickpath);
+
+ remote.sun_family = AF_UNIX;
+ strcpy (remote.sun_path, gfc->gfc_sockpath);
+
+ len = strlen (remote.sun_path) + sizeof (remote.sun_family);
+
+ while (tries < gfc->gfc_connretries) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "connection attempt %d/%d...",
+ tries + 1, gfc->gfc_connretries);
+
+ /* initiate a connect */
+ if (connect (sockfd, (struct sockaddr *) &remote, len) == 0) {
+ gfc->gfc_sockfd = sockfd;
+ break;
+ }
+
+ tries++;
+ sleep (2);
+ }
+
+ if (tries == gfc->gfc_connretries) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not connect to changelog socket!"
+ " bailing out...");
+ close (sockfd);
+ ret = -1;
+ } else
+ gf_log (this->name, GF_LOG_INFO,
+ "connection successful");
+
+ out:
+ return ret;
+}
+
+int
+gf_changelog_done (char *file)
+{
+ int ret = -1;
+ char *buffer = NULL;
+ xlator_t *this = NULL;
+ gf_changelog_t *gfc = NULL;
+ char to_path[PATH_MAX] = {0,};
+
+ errno = EINVAL;
+
+ this = THIS;
+ if (!this)
+ goto out;
+
+ gfc = (gf_changelog_t *) this->private;
+ if (!gfc)
+ goto out;
+
+ if (!file || !strlen (file))
+ goto out;
+
+ /* make sure 'file' is inside ->gfc_working_dir */
+ buffer = realpath (file, NULL);
+ if (!buffer)
+ goto out;
+
+ if (strncmp (gfc->gfc_working_dir,
+ buffer, strlen (gfc->gfc_working_dir)))
+ goto out;
+
+ (void) snprintf (to_path, PATH_MAX, "%s%s",
+ gfc->gfc_processed_dir, basename (buffer));
+ gf_log (this->name, GF_LOG_DEBUG,
+ "moving %s to processed directory", file);
+ ret = rename (buffer, to_path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "cannot move %s to %s (reason: %s)",
+ file, to_path, strerror (errno));
+ goto out;
+ }
+
+ ret = 0;
+
+ out:
+ if (buffer)
+ free (buffer); /* allocated by realpath() */
+ return ret;
+}
+
+/**
+ * @API
+ * for a set of changelogs, start from the begining
+ */
+int
+gf_changelog_start_fresh ()
+{
+ xlator_t *this = NULL;
+ gf_changelog_t *gfc = NULL;
+
+ this = THIS;
+ if (!this)
+ goto out;
+
+ errno = EINVAL;
+
+ gfc = (gf_changelog_t *) this->private;
+ if (!gfc)
+ goto out;
+
+ if (gf_ftruncate (gfc->gfc_fd, 0))
+ goto out;
+
+ return 0;
+
+ out:
+ return -1;
+}
+
+/**
+ * @API
+ * return the next changelog file entry. zero means all chanelogs
+ * consumed.
+ */
+ssize_t
+gf_changelog_next_change (char *bufptr, size_t maxlen)
+{
+ ssize_t size = 0;
+ int tracker_fd = 0;
+ xlator_t *this = NULL;
+ gf_changelog_t *gfc = NULL;
+ char buffer[PATH_MAX] = {0,};
+
+ errno = EINVAL;
+
+ this = THIS;
+ if (!this)
+ goto out;
+
+ gfc = (gf_changelog_t *) this->private;
+ if (!gfc)
+ goto out;
+
+ tracker_fd = gfc->gfc_fd;
+
+ size = gf_readline (tracker_fd, buffer, maxlen);
+ if (size < 0)
+ goto out;
+ if (size == 0)
+ return 0;
+
+ memcpy (bufptr, buffer, size - 1);
+ *(buffer + size) = '\0';
+
+ return size;
+
+ out:
+ return -1;
+}
+
+/**
+ * @API
+ * gf_changelog_scan() - scan and generate a list of change entries
+ *
+ * calling this api multiple times (without calling gf_changlog_done())
+ * would result new changelogs(s) being refreshed in the tracker file.
+ * This call also acts as a cancellation point for the consumer.
+ */
+ssize_t
+gf_changelog_scan ()
+{
+ int ret = 0;
+ int tracker_fd = 0;
+ size_t len = 0;
+ size_t off = 0;
+ xlator_t *this = NULL;
+ size_t nr_entries = 0;
+ gf_changelog_t *gfc = NULL;
+ struct dirent *entryp = NULL;
+ struct dirent *result = NULL;
+ char buffer[PATH_MAX] = {0,};
+
+ this = THIS;
+ if (!this)
+ goto out;
+
+ gfc = (gf_changelog_t *) this->private;
+ if (!gfc)
+ goto out;
+
+ /**
+ * do we need to protect 'byebye' with locks? worst, the
+ * consumer would get notified during next scan().
+ */
+ if (byebye) {
+ errno = ECONNREFUSED;
+ goto out;
+ }
+
+ errno = EINVAL;
+
+ tracker_fd = gfc->gfc_fd;
+
+ if (gf_ftruncate (tracker_fd, 0))
+ goto out;
+
+ len = offsetof(struct dirent, d_name)
+ + pathconf(gfc->gfc_processing_dir, _PC_NAME_MAX) + 1;
+ entryp = GF_CALLOC (1, len,
+ gf_changelog_mt_libgfchangelog_dirent_t);
+ if (!entryp)
+ goto out;
+
+ rewinddir (gfc->gfc_dir);
+ while (1) {
+ ret = readdir_r (gfc->gfc_dir, entryp, &result);
+ if (ret || !result)
+ break;
+
+ if ( !strcmp (basename (entryp->d_name), ".")
+ || !strcmp (basename (entryp->d_name), "..") )
+ continue;
+
+ nr_entries++;
+
+ GF_CHANGELOG_FILL_BUFFER (gfc->gfc_processing_dir,
+ buffer, off,
+ strlen (gfc->gfc_processing_dir));
+ GF_CHANGELOG_FILL_BUFFER (entryp->d_name, buffer,
+ off, strlen (entryp->d_name));
+ GF_CHANGELOG_FILL_BUFFER ("\n", buffer, off, 1);
+
+ if (gf_changelog_write (tracker_fd, buffer, off) != off) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "error writing changelog filename"
+ " to tracker file");
+ break;
+ }
+ off = 0;
+ }
+
+ GF_FREE (entryp);
+
+ if (!result) {
+ if (gf_lseek (tracker_fd, 0, SEEK_SET) != -1)
+ return nr_entries;
+ }
+ out:
+ return -1;
+}
+
+/**
+ * @API
+ * gf_changelog_register() - register a client for updates.
+ */
+int
+gf_changelog_register (char *brick_path, char *scratch_dir,
+ char *log_file, int log_level, int max_reconnects)
+{
+ int i = 0;
+ int ret = -1;
+ int errn = 0;
+ xlator_t *this = NULL;
+ gf_changelog_t *gfc = NULL;
+ char hist_scratch_dir[PATH_MAX] = {0,};
+
+ this = THIS;
+ if (!this->ctx)
+ goto out;
+
+ errno = ENOMEM;
+
+ gfc = GF_CALLOC (1, sizeof (*gfc),
+ gf_changelog_mt_libgfchangelog_t);
+ if (!gfc)
+ goto out;
+
+ gfc->this = this;
+
+ gfc->gfc_dir = NULL;
+ gfc->gfc_fd = gfc->gfc_sockfd = -1;
+
+ gfc->gfc_working_dir = realpath (scratch_dir, NULL);
+ if (!gfc->gfc_working_dir) {
+ errn = errno;
+ goto cleanup;
+ }
+
+ /* Begin: Changes for History API */
+ gfc->hist_gfc = NULL;
+
+ gfc->hist_gfc = GF_CALLOC (1, sizeof (*gfc),
+ gf_changelog_mt_libgfchangelog_t);
+ if (!gfc->hist_gfc)
+ goto cleanup;
+
+ gfc->hist_gfc->gfc_dir = NULL;
+ gfc->hist_gfc->gfc_fd = gfc->hist_gfc->gfc_sockfd = -1;
+ gfc->hist_gfc->this = NULL;
+
+ (void) strncpy (hist_scratch_dir, scratch_dir, PATH_MAX);
+ (void) snprintf (hist_scratch_dir, PATH_MAX,
+ "%s/"GF_CHANGELOG_HISTORY_DIR"/",
+ gfc->gfc_working_dir);
+
+ ret = mkdir_p (hist_scratch_dir, 0600, _gf_false);
+ if (ret) {
+ errn = errno;
+ goto cleanup;
+ }
+
+ gfc->hist_gfc->gfc_working_dir = realpath (hist_scratch_dir, NULL);
+ if (!gfc->hist_gfc->gfc_working_dir) {
+ errn = errno;
+ goto cleanup;
+ }
+
+ ret = gf_changelog_open_dirs (gfc->hist_gfc);
+ if (ret) {
+ errn = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not create entries in history scratch dir");
+ goto cleanup;
+ }
+
+ (void) strncpy (gfc->hist_gfc->gfc_brickpath, brick_path, PATH_MAX);
+
+ for (i=0; i < 256; i++) {
+ gfc->hist_gfc->rfc3986[i] =
+ (isalnum(i) || i == '~' ||
+ i == '-' || i == '.' || i == '_') ? i : 0;
+ }
+ /* End: Changes for History API*/
+
+ ret = gf_changelog_open_dirs (gfc);
+ if (ret) {
+ errn = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not create entries in scratch dir");
+ goto cleanup;
+ }
+
+ /* passing ident as NULL means to use default ident for syslog */
+ if (gf_log_init (this->ctx, log_file, NULL))
+ goto cleanup;
+
+ gf_log_set_loglevel ((log_level == -1) ? GF_LOG_INFO :
+ log_level);
+
+ gfc->gfc_connretries = (max_reconnects <= 0) ? 1 : max_reconnects;
+ (void) strncpy (gfc->gfc_brickpath, brick_path, PATH_MAX);
+
+ ret = gf_changelog_notification_init (this, gfc);
+ if (ret) {
+ errn = errno;
+ goto cleanup;
+ }
+
+ ret = gf_thread_create (&gfc->gfc_changelog_processor,
+ NULL, gf_changelog_process, gfc);
+ if (ret) {
+ errn = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "error creating changelog processor thread"
+ " new changes won't be recorded!!!");
+ goto cleanup;
+ }
+
+ for (i=0; i < 256; i++) {
+ gfc->rfc3986[i] =
+ (isalnum(i) || i == '~' ||
+ i == '-' || i == '.' || i == '_') ? i : 0;
+ }
+
+ ret = 0;
+ this->private = gfc;
+
+ goto out;
+
+ cleanup:
+ if (gfc->hist_gfc) {
+ gf_changelog_cleanup (gfc->hist_gfc);
+ GF_FREE (gfc->hist_gfc);
+ }
+ gf_changelog_cleanup (gfc);
+ GF_FREE (gfc);
+ this->private = NULL;
+ errno = errn;
+
+ out:
+ return ret;
+}
diff --git a/xlators/features/changelog/lib/src/gf-history-changelog.c b/xlators/features/changelog/lib/src/gf-history-changelog.c
new file mode 100644
index 000000000..bfc4cd37d
--- /dev/null
+++ b/xlators/features/changelog/lib/src/gf-history-changelog.c
@@ -0,0 +1,274 @@
+#include <errno.h>
+#include <dirent.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <string.h>
+
+#include "globals.h"
+#include "glusterfs.h"
+#include "logging.h"
+
+#include "gf-changelog-helpers.h"
+
+/* from the changelog translator */
+#include "changelog-misc.h"
+#include "changelog-mem-types.h"
+
+/*@API
+ * gf_history_changelog_done:
+ * Move processed history changelog file from .processing
+ * to .processed
+ *
+ * ARGUMENTS:
+ * file(IN): path to processed history changelog file in
+ * .processing directory.
+ *
+ * RETURN VALUE:
+ * 0: On success.
+ * -1: On error.
+ */
+int
+gf_history_changelog_done (char *file)
+{
+ int ret = -1;
+ char *buffer = NULL;
+ xlator_t *this = NULL;
+ gf_changelog_t *gfc = NULL;
+ gf_changelog_t *hist_gfc = NULL;
+ char to_path[PATH_MAX] = {0,};
+
+ errno = EINVAL;
+
+ this = THIS;
+ if (!this)
+ goto out;
+
+ gfc = (gf_changelog_t *) this->private;
+ if (!gfc)
+ goto out;
+
+ hist_gfc = gfc->hist_gfc;
+ if (!hist_gfc)
+ goto out;
+
+ if (!file || !strlen (file))
+ goto out;
+
+ /* make sure 'file' is inside ->gfc_working_dir */
+ buffer = realpath (file, NULL);
+ if (!buffer)
+ goto out;
+
+ if (strncmp (hist_gfc->gfc_working_dir,
+ buffer, strlen (hist_gfc->gfc_working_dir)))
+ goto out;
+
+ (void) snprintf (to_path, PATH_MAX, "%s%s",
+ hist_gfc->gfc_processed_dir, basename (buffer));
+ gf_log (this->name, GF_LOG_DEBUG,
+ "moving %s to processed directory", file);
+ ret = rename (buffer, to_path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "cannot move %s to %s (reason: %s)",
+ file, to_path, strerror (errno));
+ goto out;
+ }
+
+ ret = 0;
+
+ out:
+ if (buffer)
+ free (buffer); /* allocated by realpath() */
+ return ret;
+}
+/**
+ * @API
+ * gf_history_changelog_start_fresh:
+ * For a set of changelogs, start from the begining.
+ * It will truncates the history tracker fd.
+ *
+ * RETURN VALUES:
+ * 0: On success.
+ * -1: On error.
+ */
+int
+gf_history_changelog_start_fresh ()
+{
+ xlator_t *this = NULL;
+ gf_changelog_t *gfc = NULL;
+ gf_changelog_t *hist_gfc = NULL;
+
+ this = THIS;
+ if (!this)
+ goto out;
+
+ errno = EINVAL;
+
+ gfc = (gf_changelog_t *) this->private;
+ if (!gfc)
+ goto out;
+
+ hist_gfc = gfc->hist_gfc;
+ if (!hist_gfc)
+ goto out;
+
+ if (gf_ftruncate (hist_gfc->gfc_fd, 0))
+ goto out;
+
+ return 0;
+
+ out:
+ return -1;
+}
+
+/*
+ * @API
+ * gf_history_changelog_next_change:
+ * Return the next history changelog file entry. Zero means all
+ * history chanelogs are consumed.
+ *
+ * ARGUMENTS:
+ * bufptr(OUT): Path to unprocessed history changelog file
+ * from tracker file.
+ * maxlen(IN): Usually PATH_MAX.
+ *
+ * RETURN VALUES:
+ * size: On success.
+ * -1 : On error.
+ */
+ssize_t
+gf_history_changelog_next_change (char *bufptr, size_t maxlen)
+{
+ ssize_t size = 0;
+ int tracker_fd = 0;
+ xlator_t *this = NULL;
+ gf_changelog_t *gfc = NULL;
+ gf_changelog_t *hist_gfc = NULL;
+ char buffer[PATH_MAX] = {0,};
+
+ errno = EINVAL;
+
+ this = THIS;
+ if (!this)
+ goto out;
+
+ gfc = (gf_changelog_t *) this->private;
+ if (!gfc)
+ goto out;
+
+ hist_gfc = gfc->hist_gfc;
+ if (!hist_gfc)
+ goto out;
+
+ tracker_fd = hist_gfc->gfc_fd;
+
+ size = gf_readline (tracker_fd, buffer, maxlen);
+ if (size < 0)
+ goto out;
+ if (size == 0)
+ return 0;
+
+ memcpy (bufptr, buffer, size - 1);
+ *(buffer + size) = '\0';
+
+ return size;
+
+ out:
+ return -1;
+}
+
+/*
+ * @API
+ * gf_history_changelog_scan:
+ * Scan and generate a list of change entries.
+ * Calling this api multiple times (without calling gf_changlog_done())
+ * would result new changelogs(s) being refreshed in the tracker file.
+ * This call also acts as a cancellation point for the consumer.
+ *
+ * RETURN VALUES:
+ * nr_entries: On success.
+ * -1 : On error.
+ */
+ssize_t
+gf_history_changelog_scan ()
+{
+ int ret = 0;
+ int tracker_fd = 0;
+ size_t len = 0;
+ size_t off = 0;
+ xlator_t *this = NULL;
+ size_t nr_entries = 0;
+ gf_changelog_t *gfc = NULL;
+ gf_changelog_t *hist_gfc = NULL;
+ struct dirent *entryp = NULL;
+ struct dirent *result = NULL;
+ char buffer[PATH_MAX] = {0,};
+
+ this = THIS;
+ if (!this)
+ goto out;
+
+ gfc = (gf_changelog_t *) this->private;
+ if (!gfc)
+ goto out;
+
+ hist_gfc = gfc->hist_gfc;
+ if (!hist_gfc)
+ goto out;
+
+ errno = EINVAL;
+
+ tracker_fd = hist_gfc->gfc_fd;
+
+ if (gf_ftruncate (tracker_fd, 0))
+ goto out;
+
+ len = offsetof(struct dirent, d_name)
+ + pathconf(hist_gfc->gfc_processing_dir, _PC_NAME_MAX) + 1;
+ entryp = GF_CALLOC (1, len,
+ gf_changelog_mt_libgfchangelog_dirent_t);
+ if (!entryp)
+ goto out;
+
+ rewinddir (hist_gfc->gfc_dir);
+ while (1) {
+ ret = readdir_r (hist_gfc->gfc_dir, entryp, &result);
+ if (ret || !result)
+ break;
+
+ if ( !strcmp (basename (entryp->d_name), ".")
+ || !strcmp (basename (entryp->d_name), "..") )
+ continue;
+
+ nr_entries++;
+
+ GF_CHANGELOG_FILL_BUFFER (hist_gfc->gfc_processing_dir,
+ buffer, off,
+ strlen (hist_gfc->gfc_processing_dir));
+ GF_CHANGELOG_FILL_BUFFER (entryp->d_name, buffer,
+ off, strlen (entryp->d_name));
+ GF_CHANGELOG_FILL_BUFFER ("\n", buffer, off, 1);
+
+ if (gf_changelog_write (tracker_fd, buffer, off) != off) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "error writing changelog filename"
+ " to tracker file");
+ break;
+ }
+ off = 0;
+ }
+
+ GF_FREE (entryp);
+
+ if (!result) {
+ if (gf_lseek (tracker_fd, 0, SEEK_SET) != -1)
+ return nr_entries;
+ }
+ out:
+ return -1;
+}
diff --git a/xlators/features/changelog/src/Makefile.am b/xlators/features/changelog/src/Makefile.am
new file mode 100644
index 000000000..525ce97dc
--- /dev/null
+++ b/xlators/features/changelog/src/Makefile.am
@@ -0,0 +1,21 @@
+xlator_LTLIBRARIES = changelog.la
+
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+noinst_HEADERS = changelog-helpers.h changelog-mem-types.h changelog-rt.h \
+ changelog-misc.h changelog-encoders.h changelog-notifier.h \
+ changelog-fops.h policy/changelog-policy.h
+
+changelog_la_LDFLAGS = -module -avoid-version
+
+changelog_la_SOURCES = changelog.c changelog-rt.c changelog-helpers.c \
+ changelog-encoders.c changelog-notifier.c changelog-default-fops.c \
+ policy/changelog-policy-default.c policy/changelog-policy-replication.c
+changelog_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -Ipolicy/ -fPIC -D_FILE_OFFSET_BITS=64 \
+ -D_GNU_SOURCE -D$(GF_HOST_OS) -shared -DDATADIR=\"$(localstatedir)\"
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/changelog/src/changelog-default-fops.c b/xlators/features/changelog/src/changelog-default-fops.c
new file mode 100644
index 000000000..59749905e
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-default-fops.c
@@ -0,0 +1,561 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "xlator.h"
+#include "defaults.h"
+#include "logging.h"
+
+#include "changelog-encoders.h"
+
+/** FOPS */
+
+/* default rmdir */
+int32_t
+changelog_default_rmdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 2);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+
+ co++;
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 2);
+
+ frame->local = local;
+ ret = 0;
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/* default unlink */
+int32_t
+changelog_default_unlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 2);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+
+ co++;
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 2);
+
+ frame->local = local;
+ ret = 0;
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/* default rename */
+int32_t
+changelog_default_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ /* 3 == fop + oldloc + newloc */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, oldloc->inode->gfid, 3);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+
+ co++;
+ CHANGELOG_FILL_ENTRY (co, oldloc->pargfid, oldloc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ co++;
+ CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 3);
+
+ frame->local = local;
+ ret = 0;
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/* default link */
+int32_t
+changelog_default_link (call_frame_t *frame,
+ xlator_t *this, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
+{
+ int ret = 1;
+ size_t xtra_len = 0;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, oldloc->gfid, 2);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+
+ co++;
+ CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 2);
+
+ frame->local = local;
+ ret = 0;
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/* default mknid */
+int32_t
+changelog_default_mkdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata)
+{
+ int ret = -1;
+ uuid_t gfid = {0,};
+ void *uuid_req = NULL;
+ size_t xtra_len = 0;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get gfid from dict");
+ goto out;
+ }
+ uuid_copy (gfid, uuid_req);
+
+ ret = -1;
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 2);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+
+ co++;
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 2);
+
+ frame->local = local;
+ ret = 0;
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/* default symlink */
+int32_t
+changelog_default_symlink (call_frame_t *frame, xlator_t *this,
+ const char *linkname, loc_t *loc,
+ mode_t umask, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ uuid_t gfid = {0,};
+ void *uuid_req = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get gfid from dict");
+ goto out;
+ }
+ uuid_copy (gfid, uuid_req);
+
+ ret = -1;
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 2);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+
+ co++;
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 2);
+
+ frame->local = local;
+ ret = 0;
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/* default mknod */
+int32_t
+changelog_default_mknod (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ mode_t mode, dev_t dev, mode_t umask, dict_t *xdata)
+{
+ int ret = -1;
+ uuid_t gfid = {0,};
+ void *uuid_req = NULL;
+ size_t xtra_len = 0;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get gfid from dict");
+ goto out;
+ }
+ uuid_copy (gfid, uuid_req);
+
+ ret = -1;
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 2);
+
+ co = changelog_get_usable_buffer (frame->local);
+ if (!co)
+ goto out;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+
+ co++;
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 2);
+
+ frame->local = local;
+ ret = 0;
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/* default create */
+int32_t
+changelog_default_create (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, mode_t mode,
+ mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ int ret = -1;
+ uuid_t gfid = {0,};
+ void *uuid_req = NULL;
+ changelog_opt_t *co = NULL;
+ size_t xtra_len = 0;
+ changelog_local_t *local = NULL;
+
+ ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get gfid from dict");
+ goto out;
+ }
+ uuid_copy (gfid, uuid_req);
+
+ /* init with two extra records */
+ ret = -1;
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 2);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+
+ co++;
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 2);
+
+ frame->local = local;
+ ret = 0;
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/* default fsetattr */
+int32_t
+changelog_default_fsetattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ fd->inode, fd->inode->gfid, 0);
+ return 0;
+}
+
+/* default setattr */
+int32_t
+changelog_default_setattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ loc->inode, loc->inode->gfid, 0);
+ return 0;
+}
+
+/* default fremovexattr */
+int32_t
+changelog_default_fremovexattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ fd->inode, fd->inode->gfid, 0);
+ return 0;
+}
+
+/* default removexattr */
+int32_t
+changelog_default_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ loc->inode, loc->inode->gfid, 0);
+ return 0;
+}
+
+/* default setxattr */
+int32_t
+changelog_default_setxattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ loc->inode, loc->inode->gfid, 0);
+ return 0;
+}
+
+/* default fsetxattr */
+int32_t
+changelog_default_fsetxattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ fd->inode, fd->inode->gfid, 0);
+ return 0;
+}
+
+/* default truncate */
+int32_t
+changelog_default_truncate (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ loc->inode, loc->inode->gfid, 0);
+ return 0;
+}
+
+/* default ftruncate */
+int32_t
+changelog_default_ftruncate (call_frame_t *frame,
+ xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ fd->inode, fd->inode->gfid, 0);
+ return 0;
+}
+
+/* default writev */
+int32_t
+changelog_default_writev (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
+{
+ CHANGELOG_INIT (this, frame->local,
+ fd->inode, fd->inode->gfid, 0);
+ return 0;
+}
+
+/** COPS */
+
+int
+changelog_default_cops_open (xlator_t *this,
+ changelog_priv_t *priv,
+ void *cpriv, char *name, gf_boolean_t last)
+{
+ changelog_log_data_t cld = {0,};
+ changelog_rollover_data_t *crd = NULL;
+ struct timeval tv = {0,};
+
+ crd = &cld.cld_roll;
+
+ cld.cld_type = CHANGELOG_TYPE_ROLLOVER;
+
+ if (gettimeofday (&tv, NULL))
+ return -1;
+
+ crd->crd_prealloc_size = 0; /* no preallocation */
+ crd->crd_finale = last;
+ crd->crd_use_suffix = _gf_true;
+ crd->crd_roll_key = (unsigned long) tv.tv_sec;
+
+ (void) strcpy (crd->crd_changelog_name, name);
+ (void) strcpy (crd->crd_changelog_oname, name);
+
+ /* inject a roll-over event */
+ return changelog_inject_single_event (this, priv, NULL, &cld);
+}
+
+int
+changelog_default_cops_rollover (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv,
+ char *name, gf_boolean_t last)
+{
+ return changelog_default_cops_open (this, priv, cpriv, name, last);
+}
+
+int
+changelog_default_cops_sync (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv)
+{
+ changelog_log_data_t cld = {0,};
+
+ cld.cld_type = CHANGELOG_TYPE_FSYNC;
+ return changelog_inject_single_event (this, priv, NULL, &cld);
+}
+
+/**
+ * write to the changelog: @changelog_update() implements inode version
+ * checking and all other stuffs...
+ */
+int
+changelog_default_cops_write (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv,
+ changelog_local_t *local, changelog_log_type type)
+{
+ changelog_update (this, priv, local, type);
+ return 0;
+}
+
+off_t
+changelog_default_cops_get_offset (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv,
+ changelog_local_t *local)
+{
+ return *(off_t *)cpriv;
+}
+
+void
+changelog_default_cops_set_offset (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv,
+ changelog_local_t *local, off_t bytes)
+{
+ *(off_t *)cpriv += bytes;
+}
+
+void
+changelog_default_cops_reset_offset (xlator_t *this, changelog_priv_t *priv,
+ void *cpriv, changelog_local_t *local)
+{
+ *(off_t *)cpriv = 0;
+}
+
+/**
+ * roll-over takes care of close and open
+ */
+int
+changelog_default_cops_close (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv)
+{
+ errno = ENOTSUP;
+ return -1;
+}
+
+int
+changelog_default_cops_read (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv, char *buffer)
+{
+ errno = ENOTSUP;
+ return -1;
+}
+
+/**
+ * no purging of changelogs
+ */
+int
+changelog_default_cops_unlink (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv, char *name)
+{
+ errno = ENOTSUP;
+ return -1;
+}
diff --git a/xlators/features/changelog/src/changelog-encoders.c b/xlators/features/changelog/src/changelog-encoders.c
new file mode 100644
index 000000000..ecd598e4d
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-encoders.c
@@ -0,0 +1,182 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "changelog-encoders.h"
+
+size_t
+entry_fn (void *data, char *buffer, gf_boolean_t encode)
+{
+ char *tmpbuf = NULL;
+ size_t bufsz = 0;
+ struct changelog_entry_fields *ce = NULL;
+
+ ce = (struct changelog_entry_fields *) data;
+
+ if (encode) {
+ tmpbuf = uuid_utoa (ce->cef_uuid);
+ CHANGELOG_FILL_BUFFER (buffer, bufsz, tmpbuf, strlen (tmpbuf));
+ } else {
+ CHANGELOG_FILL_BUFFER (buffer, bufsz,
+ ce->cef_uuid, sizeof (uuid_t));
+ }
+
+ CHANGELOG_FILL_BUFFER (buffer, bufsz, "/", 1);
+ CHANGELOG_FILL_BUFFER (buffer, bufsz,
+ ce->cef_bname, strlen (ce->cef_bname));
+ return bufsz;
+}
+
+size_t
+fop_fn (void *data, char *buffer, gf_boolean_t encode)
+{
+ char buf[10] = {0,};
+ size_t bufsz = 0;
+ glusterfs_fop_t fop = 0;
+
+ fop = *(glusterfs_fop_t *) data;
+
+ if (encode) {
+ (void) snprintf (buf, sizeof (buf), "%d", fop);
+ CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf));
+ } else
+ CHANGELOG_FILL_BUFFER (buffer, bufsz, &fop, sizeof (fop));
+
+ return bufsz;
+}
+
+void
+entry_free_fn (void *data)
+{
+ changelog_opt_t *co = data;
+
+ if (!co)
+ return;
+
+ GF_FREE (co->co_entry.cef_bname);
+}
+
+/**
+ * try to write all data in one shot
+ */
+
+static inline void
+changelog_encode_write_xtra (changelog_write_data_t *cwd,
+ char *buffer, size_t *off, gf_boolean_t encode)
+{
+ int i = 0;
+ size_t offset = 0;
+ void *data = NULL;
+ changelog_opt_t *co = NULL;
+
+ offset = *off;
+
+ co = (changelog_opt_t *) cwd->cwd_ptr;
+
+ for (; i < cwd->cwd_xtra_records; i++, co++) {
+ if (i)
+ CHANGELOG_FILL_BUFFER (buffer, offset, "\0", 1);
+
+ switch (co->co_type) {
+ case CHANGELOG_OPT_REC_FOP:
+ data = &co->co_fop;
+ break;
+ case CHANGELOG_OPT_REC_ENTRY:
+ data = &co->co_entry;
+ break;
+ case CHANGELOG_OPT_REC_NAME:
+ data = co->co_entry.cef_bname;
+ break;
+ case CHANGELOG_OPT_REC_ULL:
+ data = &co->co_number;
+ break;
+ case CHANGELOG_OPT_REC_UUID:
+ data = &co->co_uuid;
+ break;
+ case CHANGELOG_OPT_REC_INT32:
+ data = &co->co_int32;
+ break;
+ case CHANGELOG_OPT_REC_UINT32:
+ data = &co->co_uint32;
+ break;
+ }
+
+ if (co->co_convert)
+ offset += co->co_convert (data,
+ buffer + offset, encode);
+ else /* no coversion: write it out as it is */
+ CHANGELOG_FILL_BUFFER (buffer, offset,
+ data, co->co_len);
+ }
+
+ *off = offset;
+}
+
+int
+changelog_encode_ascii (xlator_t *this,
+ changelog_local_t *local, changelog_log_data_t *cld)
+{
+ size_t off = 0;
+ size_t gfid_len = 0;
+ char *gfid_str = NULL;
+ char *buffer = NULL;
+ changelog_priv_t *priv = NULL;
+ changelog_write_data_t *cwd = NULL;
+
+ priv = this->private;
+ cwd = &cld->cld_wdata;
+
+ gfid_str = uuid_utoa (cwd->cwd_gfid);
+ gfid_len = strlen (gfid_str);
+
+ /* extra bytes for decorations */
+ buffer = alloca (gfid_len + cwd->cwd_ptr_len + 100);
+ if (!priv->no_gfid_hdr)
+ CHANGELOG_STORE_ASCII (priv, buffer,
+ off, gfid_str, gfid_len, cld);
+
+ if (cwd->cwd_xtra_records) {
+ changelog_encode_write_xtra (cwd, buffer, &off, _gf_true);
+ CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1);
+ }
+
+ return changelog_write_change (this, priv,
+ local, buffer, off);
+}
+
+int
+changelog_encode_binary (xlator_t *this,
+ changelog_local_t *local, changelog_log_data_t *cld)
+{
+ size_t off = 0;
+ char *buffer = NULL;
+ changelog_priv_t *priv = NULL;
+ changelog_write_data_t *cwd = NULL;
+
+ priv = this->private;
+ cwd = &cld->cld_wdata;
+
+ /* extra bytes for decorations */
+ buffer = alloca (sizeof (uuid_t) + cwd->cwd_ptr_len + 100);
+ if (!priv->no_gfid_hdr)
+ CHANGELOG_STORE_BINARY (priv, buffer, off, cwd->cwd_gfid, cld);
+
+ if (cwd->cwd_xtra_records) {
+ changelog_encode_write_xtra (cwd, buffer, &off, _gf_false);
+ CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1);
+ }
+
+ return changelog_write_change (this, priv,
+ local, buffer, off);
+}
diff --git a/xlators/features/changelog/src/changelog-encoders.h b/xlators/features/changelog/src/changelog-encoders.h
new file mode 100644
index 000000000..2a96ba4dd
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-encoders.h
@@ -0,0 +1,48 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_ENCODERS_H
+#define _CHANGELOG_ENCODERS_H
+
+#include "xlator.h"
+#include "defaults.h"
+
+#include "changelog-helpers.h"
+
+#define CHANGELOG_STORE_ASCII(priv, buf, off, gfid, gfid_len, cld) do { \
+ CHANGELOG_FILL_BUFFER (buffer, off, \
+ priv->maps[cld->cld_type], 1); \
+ CHANGELOG_FILL_BUFFER (buffer, \
+ off, gfid, gfid_len); \
+ CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1); \
+ } while (0)
+
+#define CHANGELOG_STORE_BINARY(priv, buf, off, gfid, cld) do { \
+ CHANGELOG_FILL_BUFFER (buffer, off, \
+ priv->maps[cld->cld_type], 1); \
+ CHANGELOG_FILL_BUFFER (buffer, \
+ off, gfid, sizeof (uuid_t)); \
+ CHANGELOG_FILL_BUFFER (buffer, off, "\0", 1); \
+ } while (0)
+
+size_t
+entry_fn (void *data, char *buffer, gf_boolean_t encode);
+size_t
+fop_fn (void *data, char *buffer, gf_boolean_t encode);
+void
+entry_free_fn (void *data);
+int
+changelog_encode_binary (xlator_t *,
+ changelog_local_t *, changelog_log_data_t *);
+int
+changelog_encode_ascii (xlator_t *,
+ changelog_local_t *, changelog_log_data_t *);
+
+#endif /* _CHANGELOG_ENCODERS_H */
diff --git a/xlators/features/changelog/src/changelog-fops.h b/xlators/features/changelog/src/changelog-fops.h
new file mode 100644
index 000000000..597327be3
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-fops.h
@@ -0,0 +1,157 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_FOPS_H
+#define _CHANGELOG_FOPS_H
+
+/* FOPS */
+
+int32_t
+changelog_default_rmdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata);
+int32_t
+changelog_default_unlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata);
+int32_t
+changelog_default_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata);
+int32_t
+changelog_default_link (call_frame_t *frame,
+ xlator_t *this, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata);
+int32_t
+changelog_default_mkdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata);
+int32_t
+changelog_default_symlink (call_frame_t *frame, xlator_t *this,
+ const char *linkname, loc_t *loc,
+ mode_t umask, dict_t *xdata);
+int32_t
+changelog_default_mknod (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ mode_t mode, dev_t dev, mode_t umask, dict_t *xdata);
+int32_t
+changelog_default_create (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, mode_t mode,
+ mode_t umask, fd_t *fd, dict_t *xdata);
+int32_t
+changelog_default_fsetattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata);
+int32_t
+changelog_default_setattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata);
+int32_t
+changelog_default_fremovexattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata);
+int32_t
+changelog_default_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name, dict_t *xdata);
+int32_t
+changelog_default_setxattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata);
+int32_t
+changelog_default_fsetxattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata);
+int32_t
+changelog_default_truncate (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata);
+int32_t
+changelog_default_ftruncate (call_frame_t *frame,
+ xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata);
+int32_t
+changelog_default_writev (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata);
+
+/* COPS */
+int
+changelog_default_cops_open (xlator_t *, changelog_priv_t *,
+ void *, char*, gf_boolean_t);
+
+int
+changelog_default_cops_close (xlator_t *, changelog_priv_t *, void *);
+
+int
+changelog_default_cops_sync (xlator_t *this,
+ changelog_priv_t *priv, void *);
+
+int
+changelog_default_cops_rollover (xlator_t *,
+ changelog_priv_t *, void *,
+ char *, gf_boolean_t);
+int
+changelog_default_cops_write (xlator_t *,
+ changelog_priv_t *, void *,
+ changelog_local_t *, changelog_log_type);
+
+int
+changelog_default_cops_read (xlator_t *,
+ changelog_priv_t *, void *, char *);
+
+int
+changelog_default_cops_unlink (xlator_t *,
+ changelog_priv_t *, void *, char *);
+
+off_t
+changelog_default_cops_get_offset (xlator_t *,
+ changelog_priv_t *, void *,
+ changelog_local_t *);
+
+void
+changelog_default_cops_set_offset (xlator_t *,
+ changelog_priv_t *, void *,
+ changelog_local_t *, off_t );
+
+void
+changelog_default_cops_reset_offset (xlator_t *, changelog_priv_t *,
+ void *, changelog_local_t *);
+
+
+GF_UNUSED static struct xlator_fops changelog_default_fops = {
+ .mknod = changelog_default_mknod,
+ .mkdir = changelog_default_mkdir,
+ .create = changelog_default_create,
+ .symlink = changelog_default_symlink,
+ .writev = changelog_default_writev,
+ .truncate = changelog_default_truncate,
+ .ftruncate = changelog_default_ftruncate,
+ .link = changelog_default_link,
+ .rename = changelog_default_rename,
+ .unlink = changelog_default_unlink,
+ .rmdir = changelog_default_rmdir,
+ .setattr = changelog_default_setattr,
+ .fsetattr = changelog_default_fsetattr,
+ .setxattr = changelog_default_setxattr,
+ .fsetxattr = changelog_default_fsetxattr,
+ .removexattr = changelog_default_removexattr,
+ .fremovexattr = changelog_default_fremovexattr,
+};
+
+GF_UNUSED static struct changelog_ops changelog_default_cops = {
+ .open = changelog_default_cops_open,
+ .sync = changelog_default_cops_sync,
+ .read = changelog_default_cops_read,
+ .close = changelog_default_cops_close,
+ .write = changelog_default_cops_write,
+ .unlink = changelog_default_cops_unlink,
+ .rollover = changelog_default_cops_rollover,
+ .get_offset = changelog_default_cops_get_offset,
+ .set_offset = changelog_default_cops_set_offset,
+ .reset_offset = changelog_default_cops_reset_offset,
+};
+
+#endif /* _CHANGELOG_FOPS_H */
diff --git a/xlators/features/changelog/src/changelog-helpers.c b/xlators/features/changelog/src/changelog-helpers.c
new file mode 100644
index 000000000..ad4fe4013
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-helpers.c
@@ -0,0 +1,719 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "defaults.h"
+#include "logging.h"
+#include "iobuf.h"
+
+#include "changelog-helpers.h"
+#include "changelog-mem-types.h"
+
+#include <pthread.h>
+
+void
+changelog_thread_cleanup (xlator_t *this, pthread_t thr_id)
+{
+ int ret = 0;
+ void *retval = NULL;
+
+ /* send a cancel request to the thread */
+ ret = pthread_cancel (thr_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not cancel thread (reason: %s)",
+ strerror (errno));
+ goto out;
+ }
+
+ ret = pthread_join (thr_id, &retval);
+ if (ret || (retval != PTHREAD_CANCELED)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "cancel request not adhered as expected"
+ " (reason: %s)", strerror (errno));
+ }
+
+ out:
+ return;
+}
+
+inline void *
+changelog_get_usable_buffer (changelog_local_t *local)
+{
+ changelog_write_data_t *cwd = &local->cld.cld_wdata;
+
+ if (!cwd->cwd_iobuf)
+ return NULL;
+
+ return cwd->cwd_ptr;
+}
+
+inline void
+changelog_set_usable_record_and_length (changelog_local_t *local,
+ size_t len, int xr)
+{
+ changelog_write_data_t *cwd = &local->cld.cld_wdata;
+
+ cwd->cwd_ptr_len = len;
+ cwd->cwd_xtra_records = xr;
+}
+
+void
+changelog_local_cleanup (xlator_t *xl, changelog_local_t *local)
+{
+ int i = 0;
+ changelog_opt_t *co = NULL;
+ changelog_write_data_t *cwd = NULL;
+
+ if (!local)
+ return;
+
+ cwd = &local->cld.cld_wdata;
+
+ /* cleanup dynamic allocation for extra records */
+ if (cwd->cwd_xtra_records) {
+ co = (changelog_opt_t *) cwd->cwd_ptr;
+ for (; i < cwd->cwd_xtra_records; i++, co++)
+ if (co->co_free)
+ co->co_free (co);
+ }
+
+ CHANGELOG_IOBUF_UNREF (cwd->cwd_iobuf);
+
+ if (local->inode)
+ inode_unref (local->inode);
+
+ mem_put (local);
+}
+
+inline int
+changelog_write (int fd, char *buffer, size_t len)
+{
+ ssize_t size = 0;
+ size_t writen = 0;
+
+ while (writen < len) {
+ size = write (fd,
+ buffer + writen, len - writen);
+ if (size <= 0)
+ break;
+
+ writen += size;
+ }
+
+ return (writen != len);
+}
+
+static int
+changelog_rollover_changelog (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_rollover_data_t *crd)
+{
+ int ret = -1;
+ int notify = 0;
+ char *bname = NULL;
+ char ofile[PATH_MAX] = {0,};
+ char nfile[PATH_MAX] = {0,};
+
+ if (priv->changelog_fd != -1) {
+ close (priv->changelog_fd);
+ priv->changelog_fd = -1;
+ }
+
+ /**
+ * no rolling-over of changelogs, policy implementer choose
+ * to do the heavy-lifting of having distinct changelog name.
+ *
+ * NOTE: This implies libgfchangelog would not be notified
+ (well, we could, but lets not do that now...)
+ */
+ if (!crd->crd_use_suffix)
+ return 0;
+
+ (void) snprintf (ofile, PATH_MAX,
+ "%s/%s", priv->changelog_dir,
+ crd->crd_changelog_oname);
+ (void) snprintf (nfile, PATH_MAX, "%s/%s.%lu",
+ priv->changelog_dir,
+ crd->crd_changelog_name, crd->crd_roll_key);
+
+ ret = rename (ofile, nfile);
+ if (!ret)
+ notify = 1;
+
+ if (ret && (errno == ENOENT)) {
+ ret = 0;
+ }
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "error renaming %s -> %s (reason %s)",
+ ofile, nfile, strerror (errno));
+ }
+
+ if (notify) {
+ bname = basename (nfile);
+ gf_log (this->name, GF_LOG_DEBUG, "notifying: %s", bname);
+ ret = changelog_write (priv->wfd, bname, strlen (bname) + 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to send file name to notify thread"
+ " (reason: %s)", strerror (errno));
+ }
+ }
+
+ return ret;
+}
+
+int
+changelog_open (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local, changelog_rollover_data_t *crd)
+{
+ int fd = 0;
+ int ret = -1;
+ int flags = 0;
+ char buffer[1024] = {0,};
+ char changelog_path[PATH_MAX] = {0,};
+
+ (void) snprintf (changelog_path, PATH_MAX,
+ "%s/%s", priv->changelog_dir,
+ crd->crd_changelog_name);
+
+ flags |= (O_CREAT | O_RDWR);
+ if (priv->fsync_interval == 0)
+ flags |= O_SYNC;
+
+ fd = open (changelog_path, flags,
+ S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+ if (fd < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "unable to open/create changelog file %s"
+ " (reason: %s). change-logging will be"
+ " inactive", changelog_path, strerror (errno));
+ goto out;
+ }
+
+ priv->changelog_fd = fd;
+ CHANGELOG_INVOKE_CFOP (this, priv, reset_offset, local);
+
+ /* preallocate if required */
+ if (crd->crd_prealloc_size > 0) {
+ ret = posix_fallocate (priv->changelog_fd,
+ 0, crd->crd_prealloc_size);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to preallocate %llu bytes",
+ (unsigned long long) crd->crd_prealloc_size);
+ }
+ }
+
+ (void) snprintf (buffer, 1024, CHANGELOG_HEADER,
+ CHANGELOG_VERSION_MAJOR,
+ CHANGELOG_VERSION_MINOR,
+ priv->encode_mode);
+ ret = changelog_write_change (this, priv,
+ local, buffer, strlen (buffer));
+ if (ret) {
+ close (priv->changelog_fd);
+ priv->changelog_fd = -1;
+ goto out;
+ }
+
+ ret = 0;
+
+ out:
+ return ret;
+}
+
+static int
+changelog_start_next_change (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local,
+ changelog_log_data_t *cld)
+{
+ int ret = 0;
+ changelog_rollover_data_t *crd = &cld->cld_roll;
+
+ ret = changelog_rollover_changelog (this, priv, crd);
+
+ if (!ret && !crd->crd_finale)
+ ret = changelog_open (this, priv, local, crd);
+ return ret;
+}
+
+/**
+ * return the length of entry
+ */
+inline size_t
+changelog_entry_length ()
+{
+ return sizeof (changelog_log_data_t);
+}
+
+int
+changelog_write_change (xlator_t *this, changelog_priv_t *priv,
+ changelog_local_t *local, char *buffer, size_t len)
+{
+ int ret = -1;
+ off_t offset = 0;
+ ssize_t size = 0;
+ size_t writen = 0;
+
+ offset = CHANGELOG_INVOKE_CFOP (this, priv, get_offset, local);
+
+ while (writen < len) {
+ size = pwrite (priv->changelog_fd,
+ buffer + writen, len - writen, offset + writen);
+ if (size <= 0)
+ break;
+
+ writen += size;
+ }
+
+ if (writen == len) {
+ ret = 0;
+ CHANGELOG_INVOKE_CFOP (this, priv, set_offset, local, writen);
+ }
+
+ return ret;
+}
+
+inline int
+changelog_handle_change (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local, changelog_log_data_t *cld)
+{
+ int ret = 0;
+
+ if (CHANGELOG_TYPE_IS_ROLLOVER (cld->cld_type)) {
+ ret = changelog_start_next_change (this, priv, local, cld);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Problem rolling over changelog(s)");
+ goto out;
+ }
+
+ /**
+ * case when there is reconfigure done (disabling changelog) and there
+ * are still fops that have updates in prgress.
+ */
+ if (priv->changelog_fd == -1)
+ return 0;
+
+ if (CHANGELOG_TYPE_IS_FSYNC (cld->cld_type)) {
+ ret = fsync (priv->changelog_fd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "fsync failed (reason: %s)",
+ strerror (errno));
+ }
+ goto out;
+ }
+
+ ret = priv->ce->encode (this, local, cld);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "error writing changelog to disk");
+ }
+
+ out:
+ return ret;
+}
+
+static inline void
+changelog_local_init_defaults (changelog_local_t *local,
+ uuid_t gfid, struct iobuf *iobuf)
+{
+ changelog_write_data_t *cwd = &(local->cld.cld_wdata);
+
+ uuid_copy (cwd->cwd_gfid, gfid);
+ cwd->cwd_iobuf = iobuf;
+ cwd->cwd_xtra_records = 0; /* set by the caller */
+}
+
+changelog_local_t *
+changelog_local_init (xlator_t *this, inode_t *inode,
+ uuid_t gfid, int xtra_records,
+ gf_boolean_t update_flag)
+{
+ changelog_local_t *local = NULL;
+ struct iobuf *iobuf = NULL;
+
+ /**
+ * Relax the presence of inode if @update_flag is true.
+ * The caller (implmentation of the fop) needs to be careful to
+ * not blindly use local->inode.
+ */
+ if (!update_flag && !inode) {
+ gf_log_callingfn (this->name, GF_LOG_WARNING,
+ "inode needed for version checking !!!");
+ goto out;
+ }
+
+ if (xtra_records) {
+ iobuf = iobuf_get2 (this->ctx->iobuf_pool,
+ xtra_records * CHANGELOG_OPT_RECORD_LEN);
+ if (!iobuf)
+ goto out;
+ }
+
+ local = mem_get0 (this->local_pool);
+ if (!local) {
+ CHANGELOG_IOBUF_UNREF (iobuf);
+ goto out;
+ }
+
+ local->update_no_check = update_flag;
+
+ (void) changelog_local_init_defaults (local, gfid, iobuf);
+
+ if (inode)
+ local->inode = inode_ref (inode);
+
+ out:
+ return local;
+}
+
+int
+changelog_forget (xlator_t *this, inode_t *inode)
+{
+ uint64_t ctx_addr = 0;
+ changelog_inode_ctx_t *ctx = NULL;
+
+ inode_ctx_del (inode, this, &ctx_addr);
+ if (!ctx_addr)
+ return 0;
+
+ ctx = (changelog_inode_ctx_t *) (long) ctx_addr;
+ GF_FREE (ctx);
+
+ return 0;
+}
+
+int
+changelog_inject_single_event (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local,
+ changelog_log_data_t *cld)
+{
+ return priv->cd.dispatchfn (this, priv,
+ priv->cd.cd_data, local, cld);
+}
+
+/**
+ * TODO: these threads have many thing in common (wake up after
+ * a certain time etc..). move them into separate routine.
+ */
+void *
+changelog_rollover (void *data)
+{
+ int ret = 0;
+ char *cname = NULL;
+ xlator_t *this = NULL;
+ struct timeval tv = {0,};
+ changelog_time_slice_t *slice = NULL;
+ changelog_priv_t *priv = data;
+
+ this = priv->cr.this;
+ slice = &priv->slice;
+
+ while (1) {
+ tv.tv_sec = priv->rollover_time;
+ tv.tv_usec = 0;
+
+ ret = select (0, NULL, NULL, NULL, &tv);
+ if (ret)
+ continue;
+
+ LOCK (&priv->lock);
+ {
+ cname = CHANGELOG_FNAME_FROM_POLICY (priv->cp);
+ ret = CHANGELOG_INVOKE_CFOP (this, priv, rollover,
+ cname, _gf_false);
+ if (!ret)
+ SLICE_VERSION_UPDATE (slice);
+ }
+ UNLOCK (&priv->lock);
+ }
+
+ return NULL;
+}
+
+void *
+changelog_fsync_thread (void *data)
+{
+ int ret = 0;
+ xlator_t *this = NULL;
+ struct timeval tv = {0,};
+ changelog_priv_t *priv = data;
+
+ this = priv->cf.this;
+
+ while (1) {
+ tv.tv_sec = priv->fsync_interval;
+ tv.tv_usec = 0;
+
+ ret = select (0, NULL, NULL, NULL, &tv);
+ if (ret)
+ continue;
+
+ ret = CHANGELOG_INVOKE_CFOP (this, priv, sync);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to inject fsync event");
+ }
+
+ return NULL;
+}
+
+/* macros for inode/changelog version checks */
+
+#define INODE_VERSION_UPDATE(priv, inode, iver, slice, type) do { \
+ LOCK (&inode->lock); \
+ { \
+ LOCK (&priv->lock); \
+ { \
+ *iver = slice->changelog_version[type]; \
+ } \
+ UNLOCK (&priv->lock); \
+ } \
+ UNLOCK (&inode->lock); \
+ } while (0)
+
+#define INODE_VERSION_EQUALS_SLICE(priv, ver, slice, type, upd) do { \
+ LOCK (&priv->lock); \
+ { \
+ upd = (ver == slice->changelog_version[type]) \
+ ? _gf_false : _gf_true; \
+ } \
+ UNLOCK (&priv->lock); \
+ } while (0)
+
+static int
+__changelog_inode_ctx_set (xlator_t *this,
+ inode_t *inode, changelog_inode_ctx_t *ctx)
+{
+ uint64_t ctx_addr = (uint64_t) ctx;
+ return __inode_ctx_set (inode, this, &ctx_addr);
+}
+
+/**
+ * one shot routine to get the address and the value of a inode version
+ * for a particular type.
+ */
+static changelog_inode_ctx_t *
+__changelog_inode_ctx_get (xlator_t *this,
+ inode_t *inode, unsigned long **iver,
+ unsigned long *version, changelog_log_type type)
+{
+ int ret = 0;
+ uint64_t ctx_addr = 0;
+ changelog_inode_ctx_t *ctx = NULL;
+
+ ret = __inode_ctx_get (inode, this, &ctx_addr);
+ if (ret < 0)
+ ctx_addr = 0;
+ if (ctx_addr != 0) {
+ ctx = (changelog_inode_ctx_t *) (long)ctx_addr;
+ goto out;
+ }
+
+ ctx = GF_CALLOC (1, sizeof (*ctx), gf_changelog_mt_inode_ctx_t);
+ if (!ctx)
+ goto out;
+
+ ret = __changelog_inode_ctx_set (this, inode, ctx);
+ if (ret) {
+ GF_FREE (ctx);
+ ctx = NULL;
+ }
+
+ out:
+ if (ctx && iver && version) {
+ *iver = CHANGELOG_INODE_VERSION_TYPE (ctx, type);
+ *version = **iver;
+ }
+
+ return ctx;
+}
+
+static changelog_inode_ctx_t *
+changelog_inode_ctx_get (xlator_t *this,
+ inode_t *inode, unsigned long **iver,
+ unsigned long *version, changelog_log_type type)
+{
+ changelog_inode_ctx_t *ctx = NULL;
+
+ LOCK (&inode->lock);
+ {
+ ctx = __changelog_inode_ctx_get (this,
+ inode, iver, version, type);
+ }
+ UNLOCK (&inode->lock);
+
+ return ctx;
+}
+
+/**
+ * This is the main update routine. Locking has been made granular so as to
+ * maximize parallelism of fops - I'll try to explain it below using execution
+ * timelines.
+ *
+ * Basically, the contention is between multiple execution threads of this
+ * routine and the roll-over thread. So, instead of having a big lock, we hold
+ * granular locks: inode->lock and priv->lock. Now I'll explain what happens
+ * when there is an update and a roll-over at just about the same time.
+ * NOTE:
+ * - the dispatcher itself synchronizes updates via it's own lock
+ * - the slice version in incremented by the roll-over thread
+ *
+ * Case 1: When the rollover thread wins before the inode version can be
+ * compared with the slice version.
+ *
+ * [updater] | [rollover]
+ * |
+ * | <SLICE: 1, 1, 1>
+ * <changelog_update> |
+ * <changelog_inode_ctx_get> |
+ * <CTX: 1, 1, 1> |
+ * | <dispatch-rollover-event>
+ * | LOCK (&priv->lock)
+ * | <SLICE_VERSION_UPDATE>
+ * | <SLICE: 2, 2, 2>
+ * | UNLOCK (&priv->lock)
+ * |
+ * LOCK (&priv->lock) |
+ * <INODE_VERSION_EQUALS_SLICE> |
+ * I: 1 <-> S: 2 |
+ * update: true |
+ * UNLOCK (&priv->lock) |
+ * |
+ * <if update == true> |
+ * <dispath-update-event> |
+ * <INODE_VERSION_UPDATE> |
+ * LOCK (&inode->lock) |
+ * LOCK (&priv->lock) |
+ * <CTX: 2, 1, 1> |
+ * UNLOCK (&priv->lock) |
+ * UNLOCK (&inode->lock) |
+ *
+ * Therefore, the change gets recorded in the next change (no lost change). If
+ * the slice version was ahead of the inode version (say I:1, S: 2), then
+ * anyway the comparison would result in a update (I: 1, S: 3).
+ *
+ * If the rollover time is too less, then there is another contention when the
+ * updater tries to bring up inode version to the slice version (this is also
+ * the case when the roll-over thread wakes up during INODE_VERSION_UPDATE.
+ *
+ * <CTX: 1, 1, 1> | <SLICE: 2, 2, 2>
+ * |
+ * |
+ * <dispath-update-event> |
+ * <INODE_VERSION_UPDATE> |
+ * LOCK (&inode->lock) |
+ * LOCK (&priv->lock) |
+ * <CTX: 2, 1, 1> |
+ * UNLOCK (&priv->lock) |
+ * UNLOCK (&inode->lock) |
+ * | <dispatch-rollover-event>
+ * | LOCK (&priv->lock)
+ * | <SLICE_VERSION_UPDATE>
+ * | <SLICE: 3, 3, 3>
+ * | UNLOCK (&priv->lock)
+ *
+ *
+ * Case 2: When the fop thread wins
+ *
+ * [updater] | [rollover]
+ * |
+ * | <SLICE: 1, 1, 1>
+ * <changelog_update> |
+ * <changelog_inode_ctx_get> |
+ * <CTX: 0, 0, 0> |
+ * |
+ * LOCK (&priv->lock) |
+ * <INODE_VERSION_EQUALS_SLICE> |
+ * I: 0 <-> S: 1 |
+ * update: true |
+ * UNLOCK (&priv->lock) |
+ * | <dispatch-rollover-event>
+ * | LOCK (&priv->lock)
+ * | <SLICE_VERSION_UPDATE>
+ * | <SLICE: 2, 2, 2>
+ * | UNLOCK (&priv->lock)
+ * <if update == true> |
+ * <dispath-update-event> |
+ * <INODE_VERSION_UPDATE> |
+ * LOCK (&inode->lock) |
+ * LOCK (&priv->lock) |
+ * <CTX: 2, 0, 0> |
+ * UNLOCK (&priv->lock) |
+ * UNLOCK (&inode->lock) |
+ *
+ * Here again, if the inode version was equal to the slice version (I: 1, S: 1)
+ * then there is no need to record an update (as the equality of the two version
+ * signifies an update was recorded in the current time slice).
+ */
+inline void
+changelog_update (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local,
+ changelog_log_type type)
+{
+ int ret = 0;
+ unsigned long *iver = NULL;
+ unsigned long version = 0;
+ inode_t *inode = NULL;
+ changelog_time_slice_t *slice = NULL;
+ changelog_inode_ctx_t *ctx = NULL;
+ changelog_log_data_t *cld_0 = NULL;
+ gf_boolean_t need_upd = _gf_true;
+
+ slice = &priv->slice;
+
+ /**
+ * for fops that do not require inode version checking
+ */
+ if (local->update_no_check)
+ goto update;
+
+ inode = local->inode;
+
+ ctx = changelog_inode_ctx_get (this,
+ inode, &iver, &version, type);
+ if (!ctx)
+ goto update;
+
+ INODE_VERSION_EQUALS_SLICE (priv, version, slice, type, need_upd);
+
+ update:
+ if (need_upd) {
+ cld_0 = &local->cld;
+ cld_0->cld_type = type;
+
+ ret = priv->cd.dispatchfn (this, priv,
+ priv->cd.cd_data, local, cld_0);
+
+ /**
+ * update after the dispatcher has successfully done
+ * it's job.
+ */
+ if (!local->update_no_check && iver && !ret)
+ INODE_VERSION_UPDATE (priv, inode, iver, slice, type);
+ }
+
+ return;
+}
diff --git a/xlators/features/changelog/src/changelog-helpers.h b/xlators/features/changelog/src/changelog-helpers.h
new file mode 100644
index 000000000..e4e2dfc96
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-helpers.h
@@ -0,0 +1,578 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_HELPERS_H
+#define _CHANGELOG_HELPERS_H
+
+#include "locking.h"
+#include "timer.h"
+#include "pthread.h"
+#include "iobuf.h"
+
+#include "changelog-misc.h"
+
+/**
+ * structures representing the changelog entries
+ */
+typedef struct changelog_write_data {
+ /**
+ * sincd gfid is _always_ a necessity, it's not a part
+ * of the iobuf. by doing this we do not add any overhead
+ * for data and metadata related fops.
+ */
+ uuid_t cwd_gfid;
+
+ /**
+ * iobufs are used for optionals records: pargfid, path,
+ * write offsets etc.. It's the fop implementers job
+ * to allocate (iobuf_get() in the fop) and get unref'ed
+ * in the callback (CHANGELOG_STACK_UNWIND).
+ */
+ struct iobuf *cwd_iobuf;
+
+ /**
+ * after allocation you can point this to the length of
+ * usable data, but make sure it does not exceed the
+ * the size of the requested iobuf.
+ */
+ size_t cwd_iobuf_len;
+ #define cwd_ptr cwd_iobuf->ptr
+ #define cwd_ptr_len cwd_iobuf_len
+
+ /**
+ * number of optional records
+ */
+ int cwd_xtra_records;
+} changelog_write_data_t;
+
+typedef struct changelog_rollover_data {
+ /**
+ * need a changelog reopen?
+ */
+ gf_boolean_t crd_finale;
+
+ /**
+ * changelog file name to be opened after a rollover
+ */
+ char crd_changelog_name[PATH_MAX];
+
+ /**
+ * changelog file name before rollover
+ */
+ char crd_changelog_oname[PATH_MAX];
+
+ /**
+ * use @crd_roll_key as suffix during roll-over
+ */
+ gf_boolean_t crd_use_suffix;
+
+ /**
+ * suffix used when rolling a changelog
+ */
+ unsigned long crd_roll_key;
+
+ /**
+ * preallocation? if yes, how much?
+ */
+ off_t crd_prealloc_size;
+} changelog_rollover_data_t;
+
+/**
+ * the changelog entry: structure representing the type of entry
+ * and a union encapsulating the above declared structures.
+ */
+typedef struct changelog_log_data {
+ /**
+ * type of the log data entry
+ */
+ changelog_log_type cld_type;
+
+ /**
+ * union for the type of changelog operations. @fsync() does
+ * not have a corresponding entry in this union as it just
+ * performs and @fsync() on ->changelog_fd.
+ */
+ union {
+ changelog_write_data_t cld_wdata;
+ changelog_rollover_data_t cld_roll;
+ };
+} changelog_log_data_t;
+
+typedef struct changelog_local changelog_local_t;
+
+/**
+ * holder for dispatch function and private data
+ */
+
+typedef struct changelog_priv changelog_priv_t;
+
+typedef struct changelog_dispatcher {
+ void *cd_data;
+ int (*dispatchfn) (xlator_t *,
+ changelog_priv_t *, void *,
+ changelog_local_t *, changelog_log_data_t *);
+} changelog_dispatcher_t;
+
+struct changelog_bootstrap {
+ changelog_mode_t mode;
+ int (*ctor) (xlator_t *, changelog_dispatcher_t *, gf_boolean_t);
+ int (*dtor) (xlator_t *, changelog_dispatcher_t *);
+};
+
+struct changelog_encoder {
+ changelog_encoder_t encoder;
+ int (*encode) (xlator_t *,
+ changelog_local_t *, changelog_log_data_t *);
+};
+
+struct changelog_ops {
+ /* changelog open */
+ int (*open) (xlator_t *, changelog_priv_t *,
+ void *, char *, gf_boolean_t);
+
+ /* changelog close */
+ int (*close) (xlator_t *, changelog_priv_t *, void *);
+
+ /* changelog rollover */
+ int (*rollover) (xlator_t *,
+ changelog_priv_t *,
+ void *, char *, gf_boolean_t);
+
+ int (*sync) (xlator_t *, changelog_priv_t *, void *);
+
+ /* changelog write */
+ int (*write) (xlator_t *,
+ changelog_priv_t *, void *,
+ changelog_local_t *, changelog_log_type);
+
+ /* changelog read */
+ int (*read) (xlator_t *,
+ changelog_priv_t *, void *, char *);
+
+ int (*unlink) (xlator_t *,
+ changelog_priv_t *, void *, char *);
+
+ /* {get|set} offset */
+ off_t (*get_offset) (xlator_t *this,
+ changelog_priv_t *, void *, changelog_local_t *);
+
+ void (*set_offset) (xlator_t *this,
+ changelog_priv_t *, void *,
+ changelog_local_t *, off_t);
+
+ void (*reset_offset) (xlator_t *this, changelog_priv_t *,
+ void *, changelog_local_t *);
+};
+
+/**
+ * This structure is _filled_ by the policy init (@init_policy) routine.
+ * Default @fops and @cops are passed to the init routine, which can
+ * choose to override the file operation or changelog operation behaviour.
+ * Just by _replacing_ the function pointers, a policy can change it's
+ * file and changelog operation behaviour. Kind of inheritance...
+ */
+struct changelog_logpolicy {
+ /* current changelog name */
+ char changelog_name[PATH_MAX];
+
+ /* private data */
+ void *cpriv;
+
+ /* file ops for the policy */
+ struct xlator_fops *fops;
+
+ /* changelog operations for the policy */
+ struct changelog_ops *cops;
+
+ /* current active policy */
+ changelog_log_policy_t policy;
+
+ int (*init_policy) (xlator_t *,
+ changelog_priv_t *priv,
+ struct changelog_logpolicy *);
+ int (*fini_policy) (xlator_t *, struct changelog_logpolicy *);
+};
+
+#define CHANGELOG_FNAME_FROM_POLICY(c) c->changelog_name
+
+#define CHANGELOG_INVOKE_FOP(priv,fop,...) priv->cp->fops->fop (__VA_ARGS__)
+
+#define CHANGELOG_INVOKE_CFOP(this,priv,fop,...) \
+ priv->cp->cops->fop (this, priv, priv->cp->cpriv, ##__VA_ARGS__)
+
+/* xlator private */
+
+typedef struct changelog_time_slice {
+ /**
+ * just in case we need nanosecond granularity some day.
+ * field is unused as of now (maybe we'd need it later).
+ */
+ struct timeval tv_start;
+
+ /**
+ * version of changelog file, incremented each time changes
+ * rollover.
+ */
+ unsigned long changelog_version[CHANGELOG_MAX_TYPE];
+} changelog_time_slice_t;
+
+typedef struct changelog_rollover {
+ /* rollover thread */
+ pthread_t rollover_th;
+
+ xlator_t *this;
+} changelog_rollover_t;
+
+typedef struct changelog_fsync {
+ /* fsync() thread */
+ pthread_t fsync_th;
+
+ xlator_t *this;
+} changelog_fsync_t;
+
+# define CHANGELOG_MAX_CLIENTS 5
+typedef struct changelog_notify {
+ /* reader end of the pipe */
+ int rfd;
+
+ /* notifier thread */
+ pthread_t notify_th;
+
+ /* unique socket path */
+ char sockpath[UNIX_PATH_MAX];
+
+ int socket_fd;
+
+ /**
+ * simple array of accept()'ed fds. Not scalable at all
+ * for large number of clients, but it's okay as we have
+ * a ahrd limit in this version (@CHANGELOG_MAX_CLIENTS).
+ */
+ int client_fd[CHANGELOG_MAX_CLIENTS];
+
+ xlator_t *this;
+} changelog_notify_t;
+
+struct changelog_priv {
+ gf_boolean_t active;
+
+ /**
+ * write the record header?
+ */
+ gf_boolean_t no_gfid_hdr;
+
+ gf_boolean_t lockless_update;
+
+ /* to generate unique socket file per brick */
+ char *changelog_brick;
+
+ /* logging directory */
+ char *changelog_dir;
+
+ /* one file for all changelog types */
+ int changelog_fd;
+
+ gf_lock_t lock;
+
+ /* writen end of the pipe */
+ int wfd;
+
+ /* rollover time */
+ int32_t rollover_time;
+
+ /* fsync() interval */
+ int32_t fsync_interval;
+
+ /* changelog type maps */
+ const char *maps[CHANGELOG_MAX_TYPE];
+
+ /* time slicer */
+ changelog_time_slice_t slice;
+
+ /* context of the updater */
+ changelog_dispatcher_t cd;
+
+ /* context of the rollover thread */
+ changelog_rollover_t cr;
+
+ /* context of fsync thread */
+ changelog_fsync_t cf;
+
+ /* context of the notifier thread */
+ changelog_notify_t cn;
+
+ /* operation mode */
+ changelog_mode_t op_mode;
+
+ /* bootstrap routine for 'current' logger */
+ struct changelog_bootstrap *cb;
+
+ /* encoder mode */
+ changelog_encoder_t encode_mode;
+
+ /* encoder */
+ struct changelog_encoder *ce;
+
+ /* logging policy */
+ changelog_log_policy_t policy;
+
+ /* policy logger */
+ struct changelog_logpolicy *cp;
+
+ /* current NSR term */
+ uint32_t term;
+};
+
+struct changelog_local {
+ inode_t *inode;
+
+ /**
+ * fops that do not need inode version checks
+ */
+ gf_boolean_t update_no_check;
+
+ /**
+ * the log data entry
+ */
+ changelog_log_data_t cld;
+
+ /**
+ * number of bytes written: used for continuation
+ */
+ off_t nr_bytes;
+
+ /**
+ * temporary scratch pads
+ */
+ union {
+ void *ptr;
+ unsigned long val;
+ } lu;
+};
+
+/* inode version is stored in inode ctx */
+typedef struct changelog_inode_ctx {
+ unsigned long iversion[CHANGELOG_MAX_TYPE];
+} changelog_inode_ctx_t;
+
+#define CHANGELOG_INODE_VERSION_TYPE(ctx, type) &(ctx->iversion[type])
+
+/**
+ * Optional Records:
+ * fops that need to save additional information request a array of
+ * @changelog_opt_t struct. The array is allocated via @iobufs.
+ */
+typedef enum {
+ CHANGELOG_OPT_REC_FOP,
+ CHANGELOG_OPT_REC_ULL,
+ CHANGELOG_OPT_REC_UUID,
+ CHANGELOG_OPT_REC_NAME,
+ CHANGELOG_OPT_REC_ENTRY,
+ CHANGELOG_OPT_REC_INT32,
+ CHANGELOG_OPT_REC_UINT32,
+} changelog_optional_rec_type_t;
+
+struct changelog_entry_fields {
+ uuid_t cef_uuid;
+ char *cef_bname;
+};
+
+typedef struct {
+ /**
+ * @co_covert can be used to do post-processing of the record before
+ * it's persisted to the CHANGELOG. If this is NULL, then the record
+ * is persisted as per it's in memory format.
+ */
+ size_t (*co_convert) (void *data, char *buffer, gf_boolean_t encode);
+
+ /* release routines */
+ void (*co_free) (void *data);
+
+ /* type of the field */
+ changelog_optional_rec_type_t co_type;
+
+ /**
+ * sizeof of the 'valid' field in the union. This field is not used if
+ * @co_convert is specified.
+ */
+ size_t co_len;
+
+ union {
+ uuid_t co_uuid;
+ glusterfs_fop_t co_fop;
+ int co_int32;
+ unsigned int co_uint32;
+ unsigned long long co_number;
+ struct changelog_entry_fields co_entry;
+ };
+} changelog_opt_t;
+
+#define CHANGELOG_OPT_RECORD_LEN sizeof (changelog_opt_t)
+
+/**
+ * helpers routines
+ */
+
+void
+changelog_thread_cleanup (xlator_t *this, pthread_t thr_id);
+
+void *
+changelog_get_usable_buffer (changelog_local_t *local);
+
+void
+changelog_set_usable_record_and_length (changelog_local_t *local,
+ size_t len, int xr);
+void
+changelog_local_cleanup (xlator_t *xl, changelog_local_t *local);
+changelog_local_t *
+changelog_local_init (xlator_t *this, inode_t *inode, uuid_t gfid,
+ int xtra_records, gf_boolean_t update_flag);
+int
+changelog_inject_single_event (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local,
+ changelog_log_data_t *cld);
+size_t
+changelog_entry_length ();
+int
+changelog_write (int fd, char *buffer, size_t len);
+int
+changelog_write_change (xlator_t *this, changelog_priv_t *priv,
+ changelog_local_t *local, char *buffer, size_t len);
+inline int
+changelog_handle_change (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local, changelog_log_data_t *cld);
+inline void
+changelog_update (xlator_t *this,
+ changelog_priv_t *priv,
+ changelog_local_t *local,
+ changelog_log_type type);
+void *
+changelog_rollover (void *data);
+void *
+changelog_fsync_thread (void *data);
+int
+changelog_forget (xlator_t *this, inode_t *inode);
+
+/* macros */
+
+#define CHANGELOG_STACK_UNWIND(fop, frame, params ...) do { \
+ changelog_local_t *__local = NULL; \
+ xlator_t *__xl = NULL; \
+ if (frame) { \
+ __local = frame->local; \
+ __xl = frame->this; \
+ frame->local = NULL; \
+ } \
+ STACK_UNWIND_STRICT (fop, frame, params); \
+ changelog_local_cleanup (__xl, __local); \
+ } while (0)
+
+#define CHANGELOG_IOBUF_REF(iobuf) do { \
+ if (iobuf) \
+ iobuf_ref (iobuf); \
+ } while (0)
+
+#define CHANGELOG_IOBUF_UNREF(iobuf) do { \
+ if (iobuf) \
+ iobuf_unref (iobuf); \
+ } while (0)
+
+#define CHANGELOG_FILL_BUFFER(buffer, off, val, len) do { \
+ memcpy (buffer + off, val, len); \
+ off += len; \
+ } while (0)
+
+#define SLICE_VERSION_UPDATE(slice) do { \
+ int i = 0; \
+ for (; i < CHANGELOG_MAX_TYPE; i++) { \
+ slice->changelog_version[i]++; \
+ } \
+ } while (0)
+
+#define CHANGELOG_FILL_INT32(co, number, converter, xlen) do { \
+ co->co_convert = converter; \
+ co->co_free = NULL; \
+ co->co_type = CHANGELOG_OPT_REC_INT32; \
+ co->co_int32 = number; \
+ xlen += sizeof (int); \
+ } while (0)
+
+#define CHANGELOG_FILL_UINT32(co, number, converter, xlen) do { \
+ co->co_convert = converter; \
+ co->co_free = NULL; \
+ co->co_type = CHANGELOG_OPT_REC_UINT32; \
+ co->co_uint32 = number; \
+ xlen += sizeof (unsigned int); \
+ } while (0)
+
+#define CHANGELOG_FILL_FOP_NUMBER(co, fop, converter, xlen) do { \
+ co->co_convert = converter; \
+ co->co_free = NULL; \
+ co->co_type = CHANGELOG_OPT_REC_FOP; \
+ co->co_fop = fop; \
+ xlen += sizeof (fop); \
+ } while (0)
+
+#define CHANGELOG_FILL_NAME(co, name, freefn, xlen, label) \
+ do { \
+ co->co_convert = NULL; \
+ co->co_free = freefn; \
+ co->co_type = CHANGELOG_OPT_REC_NAME; \
+ co->co_entry.cef_bname = gf_strdup(name); \
+ if (!co->co_entry.cef_bname) \
+ goto label; \
+ co->co_len = strlen (name); \
+ xlen += co->co_len; \
+ } while(0) \
+
+#define CHANGELOG_FILL_ENTRY(co, pargfid, bname, \
+ converter, freefn, xlen, label) \
+ do { \
+ co->co_convert = converter; \
+ co->co_free = freefn; \
+ co->co_type = CHANGELOG_OPT_REC_ENTRY; \
+ uuid_copy (co->co_entry.cef_uuid, pargfid); \
+ co->co_entry.cef_bname = gf_strdup(bname); \
+ if (!co->co_entry.cef_bname) \
+ goto label; \
+ xlen += (UUID_CANONICAL_FORM_LEN + strlen (bname)); \
+ } while (0)
+
+#define CHANGELOG_INIT(this, local, inode, gfid, xrec) \
+ local = changelog_local_init (this, inode, gfid, xrec, _gf_false)
+
+#define CHANGELOG_INIT_NOCHECK(this, local, inode, gfid, xrec) \
+ local = changelog_local_init (this, inode, gfid, xrec, _gf_true)
+
+#define CHANGELOG_NOT_ACTIVE_THEN_GOTO(frame, priv, label) do { \
+ if (!priv->active) \
+ goto label; \
+ /* ignore rebalance process's activity. */ \
+ if (frame->root->pid == GF_CLIENT_PID_DEFRAG) \
+ goto label; \
+ } while (0)
+
+/* ignore internal fops */
+#define CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO(dict, label) do { \
+ if (dict && dict_get (dict, GLUSTERFS_INTERNAL_FOP_KEY)) \
+ goto label; \
+ } while (0)
+
+#define CHANGELOG_COND_GOTO(priv, cond, label) do { \
+ if (!priv->active || cond) \
+ goto label; \
+ } while (0)
+
+int
+changelog_open (xlator_t *this, changelog_priv_t *priv, changelog_local_t *local, changelog_rollover_data_t *crd);
+
+#endif /* _CHANGELOG_HELPERS_H */
diff --git a/xlators/features/changelog/src/changelog-mem-types.h b/xlators/features/changelog/src/changelog-mem-types.h
new file mode 100644
index 000000000..a65bbb4f2
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-mem-types.h
@@ -0,0 +1,30 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_MEM_TYPES_H
+#define _CHANGELOG_MEM_TYPES_H
+
+#include "mem-types.h"
+
+enum gf_changelog_mem_types {
+ gf_changelog_mt_priv_t = gf_common_mt_end + 1,
+ gf_changelog_mt_str_t = gf_common_mt_end + 2,
+ gf_changelog_mt_batch_t = gf_common_mt_end + 3,
+ gf_changelog_mt_rt_t = gf_common_mt_end + 4,
+ gf_changelog_mt_inode_ctx_t = gf_common_mt_end + 5,
+ gf_changelog_mt_fop_policy_t = gf_common_mt_end + 6,
+ gf_changelog_mt_libgfchangelog_t = gf_common_mt_end + 7,
+ gf_changelog_mt_libgfchangelog_rl_t = gf_common_mt_end + 8,
+ gf_changelog_mt_libgfchangelog_dirent_t = gf_common_mt_end + 9,
+ gf_changelog_mt_changelog_buffer_t = gf_common_mt_end + 10,
+ gf_changelog_mt_end
+};
+
+#endif
diff --git a/xlators/features/changelog/src/changelog-misc.h b/xlators/features/changelog/src/changelog-misc.h
new file mode 100644
index 000000000..58bd3279d
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-misc.h
@@ -0,0 +1,107 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_MISC_H
+#define _CHANGELOG_MISC_H
+
+#include "glusterfs.h"
+#include "common-utils.h"
+
+#define CHANGELOG_MAX_TYPE 3
+#define CHANGELOG_FILE_NAME "CHANGELOG"
+
+#define CHANGELOG_VERSION_MAJOR 1
+#define CHANGELOG_VERSION_MINOR 0
+
+#define CHANGELOG_UNIX_SOCK DEFAULT_VAR_RUN_DIRECTORY"/changelog-%s.sock"
+
+/**
+ * header starts with the version and the format of the changelog.
+ * 'version' not much of a use now.
+ */
+#define CHANGELOG_HEADER \
+ "GlusterFS Changelog | version: v%d.%d | encoding : %d\n"
+
+#define CHANGELOG_MAKE_SOCKET_PATH(brick_path, sockpath, len) do { \
+ char md5_sum[MD5_DIGEST_LENGTH*2+1] = {0,}; \
+ md5_wrapper((unsigned char *) brick_path, \
+ strlen(brick_path), \
+ md5_sum); \
+ (void) snprintf (sockpath, len, \
+ CHANGELOG_UNIX_SOCK, md5_sum); \
+ } while (0)
+
+/**
+ * ... used by libgfchangelog.
+ */
+#define CHANGELOG_GET_ENCODING(fd, buffer, len, enc, enc_len) do { \
+ FILE *fp; \
+ int fd_dup, maj, min; \
+ \
+ enc = -1; \
+ fd_dup = dup (fd); \
+ \
+ if (fd_dup != -1) { \
+ fp = fdopen (fd_dup, "r"); \
+ if (fp) { \
+ if (fgets (buffer, len, fp)) { \
+ elen = strlen (buffer); \
+ sscanf (buffer, \
+ CHANGELOG_HEADER, \
+ &maj, &min, &enc); \
+ } \
+ fclose (fp); \
+ } else { \
+ close (fd_dup); \
+ } \
+ } \
+ } while (0)
+
+/**
+ * everything after @CHANGELOG_TYPE_ENTRY are internal types
+ * (ie. none of the fops trigger this type of event), hence
+ * CHANGELOG_MAX_TYPE = 3
+ */
+typedef enum {
+ CHANGELOG_TYPE_DATA = 0,
+ CHANGELOG_TYPE_METADATA,
+ CHANGELOG_TYPE_ENTRY,
+ CHANGELOG_TYPE_ROLLOVER,
+ CHANGELOG_TYPE_FSYNC,
+} changelog_log_type;
+
+/* operation modes - RT for now */
+typedef enum {
+ CHANGELOG_MODE_RT = 0,
+} changelog_mode_t;
+
+/* encoder types */
+
+typedef enum {
+ CHANGELOG_ENCODE_MIN = 0,
+ CHANGELOG_ENCODE_BINARY,
+ CHANGELOG_ENCODE_ASCII,
+ CHANGELOG_ENCODE_MAX,
+} changelog_encoder_t;
+
+/* logging policies */
+typedef enum {
+ CHANGELOG_LOG_POLICY_DEFAULT = 0,
+ CHANGELOG_LOG_POLICY_REPLICATE,
+} changelog_log_policy_t;
+
+#define CHANGELOG_VALID_ENCODING(enc) \
+ (enc > CHANGELOG_ENCODE_MIN && enc < CHANGELOG_ENCODE_MAX)
+
+#define CHANGELOG_TYPE_IS_ENTRY(type) (type == CHANGELOG_TYPE_ENTRY)
+#define CHANGELOG_TYPE_IS_ROLLOVER(type) (type == CHANGELOG_TYPE_ROLLOVER)
+#define CHANGELOG_TYPE_IS_FSYNC(type) (type == CHANGELOG_TYPE_FSYNC)
+
+#endif /* _CHANGELOG_MISC_H */
diff --git a/xlators/features/changelog/src/changelog-notifier.c b/xlators/features/changelog/src/changelog-notifier.c
new file mode 100644
index 000000000..5f3d063a8
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-notifier.c
@@ -0,0 +1,314 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "changelog-notifier.h"
+
+#include <pthread.h>
+
+inline static void
+changelog_notify_clear_fd (changelog_notify_t *cn, int i)
+{
+ cn->client_fd[i] = -1;
+}
+
+inline static void
+changelog_notify_save_fd (changelog_notify_t *cn, int i, int fd)
+{
+ cn->client_fd[i] = fd;
+}
+
+static int
+changelog_notify_insert_fd (xlator_t *this, changelog_notify_t *cn, int fd)
+{
+ int i = 0;
+ int ret = 0;
+
+ for (; i < CHANGELOG_MAX_CLIENTS; i++) {
+ if (cn->client_fd[i] == -1)
+ break;
+ }
+
+ if (i == CHANGELOG_MAX_CLIENTS) {
+ /**
+ * this case should not be hit as listen() would limit
+ * the number of completely established connections.
+ */
+ gf_log (this->name, GF_LOG_WARNING,
+ "hit max client limit (%d)", CHANGELOG_MAX_CLIENTS);
+ ret = -1;
+ }
+ else
+ changelog_notify_save_fd (cn, i, fd);
+
+ return ret;
+}
+
+static void
+changelog_notify_fill_rset (changelog_notify_t *cn, fd_set *rset, int *maxfd)
+{
+ int i = 0;
+
+ FD_ZERO (rset);
+
+ FD_SET (cn->socket_fd, rset);
+ *maxfd = cn->socket_fd;
+
+ FD_SET (cn->rfd, rset);
+ *maxfd = max (*maxfd, cn->rfd);
+
+ for (; i < CHANGELOG_MAX_CLIENTS; i++) {
+ if (cn->client_fd[i] != -1) {
+ FD_SET (cn->client_fd[i], rset);
+ *maxfd = max (*maxfd, cn->client_fd[i]);
+ }
+ }
+
+ *maxfd = *maxfd + 1;
+}
+
+static int
+changelog_notify_client (changelog_notify_t *cn, char *path, ssize_t len)
+{
+ int i = 0;
+ int ret = 0;
+
+ for (; i < CHANGELOG_MAX_CLIENTS; i++) {
+ if (cn->client_fd[i] == -1)
+ continue;
+
+ if (changelog_write (cn->client_fd[i],
+ path, len)) {
+ ret = -1;
+
+ close (cn->client_fd[i]);
+ changelog_notify_clear_fd (cn, i);
+ }
+ }
+
+ return ret;
+}
+
+static void
+changelog_notifier_init (changelog_notify_t *cn)
+{
+ int i = 0;
+
+ cn->socket_fd = -1;
+
+ for (; i < CHANGELOG_MAX_CLIENTS; i++) {
+ changelog_notify_clear_fd (cn, i);
+ }
+}
+
+static void
+changelog_close_client_conn (changelog_notify_t *cn)
+{
+ int i = 0;
+
+ for (; i < CHANGELOG_MAX_CLIENTS; i++) {
+ if (cn->client_fd[i] == -1)
+ continue;
+
+ close (cn->client_fd[i]);
+ changelog_notify_clear_fd (cn, i);
+ }
+}
+
+static void
+changelog_notifier_cleanup (void *arg)
+{
+ changelog_notify_t *cn = NULL;
+
+ cn = (changelog_notify_t *) arg;
+
+ changelog_close_client_conn (cn);
+
+ if (cn->socket_fd != -1)
+ close (cn->socket_fd);
+
+ if (cn->rfd)
+ close (cn->rfd);
+
+ if (unlink (cn->sockpath))
+ gf_log ("", GF_LOG_WARNING,
+ "could not unlink changelog socket file"
+ " %s (reason: %s", cn->sockpath, strerror (errno));
+}
+
+void *
+changelog_notifier (void *data)
+{
+ int i = 0;
+ int fd = 0;
+ int max_fd = 0;
+ int len = 0;
+ ssize_t readlen = 0;
+ xlator_t *this = NULL;
+ changelog_priv_t *priv = NULL;
+ changelog_notify_t *cn = NULL;
+ struct sockaddr_un local = {0,};
+ char path[PATH_MAX] = {0,};
+ char abspath[PATH_MAX] = {0,};
+
+ char buffer;
+ fd_set rset;
+
+ priv = (changelog_priv_t *) data;
+
+ cn = &priv->cn;
+ this = cn->this;
+
+ pthread_cleanup_push (changelog_notifier_cleanup, cn);
+
+ changelog_notifier_init (cn);
+
+ cn->socket_fd = socket (AF_UNIX, SOCK_STREAM, 0);
+ if (cn->socket_fd < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "changelog socket error (reason: %s)",
+ strerror (errno));
+ goto out;
+ }
+
+ CHANGELOG_MAKE_SOCKET_PATH (priv->changelog_brick,
+ cn->sockpath, UNIX_PATH_MAX);
+ if (unlink (cn->sockpath) < 0) {
+ if (errno != ENOENT) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not unlink changelog socket file (%s)"
+ " (reason: %s)",
+ CHANGELOG_UNIX_SOCK, strerror (errno));
+ goto cleanup;
+ }
+ }
+
+ local.sun_family = AF_UNIX;
+ strcpy (local.sun_path, cn->sockpath);
+
+ len = strlen (local.sun_path) + sizeof (local.sun_family);
+
+ /* bind to the unix domain socket */
+ if (bind (cn->socket_fd, (struct sockaddr *) &local, len) < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not bind to changelog socket (reason: %s)",
+ strerror (errno));
+ goto cleanup;
+ }
+
+ /* listen for incoming connections */
+ if (listen (cn->socket_fd, CHANGELOG_MAX_CLIENTS) < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "listen() error on changelog socket (reason: %s)",
+ strerror (errno));
+ goto cleanup;
+ }
+
+ /**
+ * simple select() on all to-be-read file descriptors. This method
+ * though old school works pretty well when you have a handfull of
+ * fd's to be watched (clients).
+ *
+ * Future TODO: move this to epoll based notification facility if
+ * number of clients increase.
+ */
+ for (;;) {
+ changelog_notify_fill_rset (cn, &rset, &max_fd);
+
+ if (select (max_fd, &rset, NULL, NULL, NULL) < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "select() returned -1 (reason: %s)",
+ strerror (errno));
+ sleep (2);
+ continue;
+ }
+
+ if (FD_ISSET (cn->socket_fd, &rset)) {
+ fd = accept (cn->socket_fd, NULL, NULL);
+ if (fd < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "accept error on changelog socket"
+ " (reason: %s)", strerror (errno));
+ } else if (changelog_notify_insert_fd (this, cn, fd)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "hit max client limit");
+ }
+ }
+
+ if (FD_ISSET (cn->rfd, &rset)) {
+ /**
+ * read changelog filename and notify all connected
+ * clients.
+ */
+ readlen = 0;
+ while (readlen < PATH_MAX) {
+ len = read (cn->rfd, &path[readlen++], 1);
+ if (len == -1) {
+ break;
+ }
+
+ if (len == 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "rollover thread sent EOF"
+ " on pipe - possibly a crash.");
+ /* be blunt and close all connections */
+ pthread_exit(NULL);
+ }
+
+ if (path[readlen - 1] == '\0')
+ break;
+ }
+
+ /* should we close all client connections here too? */
+ if (len < 0 || readlen == PATH_MAX) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not get pathname from rollover"
+ " thread or pathname too long");
+ goto process_rest;
+ }
+
+ (void) snprintf (abspath, PATH_MAX,
+ "%s/%s", priv->changelog_dir, path);
+ if (changelog_notify_client (cn, abspath,
+ strlen (abspath) + 1))
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not notify some clients with new"
+ " changelogs");
+ }
+
+ process_rest:
+ for (i = 0; i < CHANGELOG_MAX_CLIENTS; i++) {
+ if ( (fd = cn->client_fd[i]) == -1 )
+ continue;
+
+ if (FD_ISSET (fd, &rset)) {
+ /**
+ * the only data we accept from the client is a
+ * disconnect. Anything else is treated as bogus
+ * and is silently discarded (also warned!!!).
+ */
+ if ( (readlen = read (fd, &buffer, 1)) <= 0 ) {
+ close (fd);
+ changelog_notify_clear_fd (cn, i);
+ } else {
+ /* silently discard data and log */
+ gf_log (this->name, GF_LOG_WARNING,
+ "misbehaving changelog client");
+ }
+ }
+ }
+
+ }
+
+ cleanup:;
+ pthread_cleanup_pop (1);
+
+ out:
+ return NULL;
+}
diff --git a/xlators/features/changelog/src/changelog-notifier.h b/xlators/features/changelog/src/changelog-notifier.h
new file mode 100644
index 000000000..55e728356
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-notifier.h
@@ -0,0 +1,19 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_NOTIFIER_H
+#define _CHANGELOG_NOTIFIER_H
+
+#include "changelog-helpers.h"
+
+void *
+changelog_notifier (void *data);
+
+#endif
diff --git a/xlators/features/changelog/src/changelog-rt.c b/xlators/features/changelog/src/changelog-rt.c
new file mode 100644
index 000000000..91d47e059
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-rt.c
@@ -0,0 +1,83 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "defaults.h"
+#include "logging.h"
+
+#include "changelog-rt.h"
+#include "changelog-mem-types.h"
+
+int
+changelog_rt_init (xlator_t *this,
+ changelog_dispatcher_t *cd, gf_boolean_t lockless_update)
+{
+ changelog_rt_t *crt = NULL;
+
+ crt = GF_CALLOC (1, sizeof (*crt),
+ gf_changelog_mt_rt_t);
+ if (!crt)
+ return -1;
+
+ /* TBD: don't init (and destroy) if lock-less update */
+ LOCK_INIT (&crt->lock);
+
+ cd->cd_data = crt;
+ cd->dispatchfn = lockless_update ?
+ &changelog_rt_enqueue_lockless : &changelog_rt_enqueue;
+
+ return 0;
+}
+
+int
+changelog_rt_fini (xlator_t *this, changelog_dispatcher_t *cd)
+{
+ changelog_rt_t *crt = NULL;
+
+ crt = cd->cd_data;
+
+ LOCK_DESTROY (&crt->lock);
+ GF_FREE (crt);
+
+ return 0;
+}
+
+int
+changelog_rt_enqueue (xlator_t *this,
+ changelog_priv_t *priv, void *cbatch,
+ changelog_local_t *local, changelog_log_data_t *cld_0)
+{
+ int ret = 0;
+ changelog_rt_t *crt = NULL;
+
+ crt = (changelog_rt_t *) cbatch;
+
+ LOCK (&crt->lock);
+ {
+ ret = changelog_handle_change (this, priv, local, cld_0);
+ }
+ UNLOCK (&crt->lock);
+
+ return ret;
+}
+
+int
+changelog_rt_enqueue_lockless (xlator_t *this,
+ changelog_priv_t *priv, void *cbatch,
+ changelog_local_t *local,
+ changelog_log_data_t *cld_0)
+{
+ return changelog_handle_change (this, priv, local, cld_0);
+}
diff --git a/xlators/features/changelog/src/changelog-rt.h b/xlators/features/changelog/src/changelog-rt.h
new file mode 100644
index 000000000..634b7473b
--- /dev/null
+++ b/xlators/features/changelog/src/changelog-rt.h
@@ -0,0 +1,40 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_RT_H
+#define _CHANGELOG_RT_H
+
+#include "locking.h"
+#include "timer.h"
+#include "pthread.h"
+
+#include "changelog-helpers.h"
+
+/* unused as of now - may be you would need it later */
+typedef struct changelog_rt {
+ gf_lock_t lock;
+} changelog_rt_t;
+
+int
+changelog_rt_init (xlator_t *this,
+ changelog_dispatcher_t *cd, gf_boolean_t lockless_update);
+int
+changelog_rt_fini (xlator_t *this, changelog_dispatcher_t *cd);
+int
+changelog_rt_enqueue (xlator_t *this,
+ changelog_priv_t *priv, void *cbatch,
+ changelog_local_t *local, changelog_log_data_t *cld_0);
+int
+changelog_rt_enqueue_lockless (xlator_t *this,
+ changelog_priv_t *priv, void *cbatch,
+ changelog_local_t *local,
+ changelog_log_data_t *cld_0);
+
+#endif /* _CHANGELOG_RT_H */
diff --git a/xlators/features/changelog/src/changelog.c b/xlators/features/changelog/src/changelog.c
new file mode 100644
index 000000000..6d4b502de
--- /dev/null
+++ b/xlators/features/changelog/src/changelog.c
@@ -0,0 +1,1389 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "defaults.h"
+#include "logging.h"
+#include "iobuf.h"
+#include <pthread.h>
+
+#include "changelog-rt.h"
+#include "changelog-notifier.h"
+#include "changelog-encoders.h"
+#include "changelog-mem-types.h"
+
+#include "changelog-fops.h"
+#include "changelog-policy.h"
+
+static struct changelog_bootstrap
+cb_bootstrap[] = {
+ {
+ .mode = CHANGELOG_MODE_RT,
+ .ctor = changelog_rt_init,
+ .dtor = changelog_rt_fini,
+ },
+};
+
+static struct changelog_encoder
+cb_encoder[] = {
+ [CHANGELOG_ENCODE_BINARY] =
+ {
+ .encoder = CHANGELOG_ENCODE_BINARY,
+ .encode = changelog_encode_binary,
+ },
+ [CHANGELOG_ENCODE_ASCII] =
+ {
+ .encoder = CHANGELOG_ENCODE_ASCII,
+ .encode = changelog_encode_ascii,
+ },
+};
+
+static struct changelog_logpolicy
+cb_policy[] = {
+ [CHANGELOG_LOG_POLICY_DEFAULT] =
+ {
+ .fops = NULL,
+ .cops = NULL,
+ .policy = CHANGELOG_LOG_POLICY_DEFAULT,
+ .init_policy = changelog_default_policy_init,
+ .fini_policy = changelog_default_policy_fini,
+ },
+ [CHANGELOG_LOG_POLICY_REPLICATE] =
+ {
+ .fops = NULL,
+ .cops = NULL,
+ .policy = CHANGELOG_LOG_POLICY_REPLICATE,
+ .init_policy = changelog_replication_policy_init,
+ .fini_policy = changelog_replication_policy_fini,
+ },
+};
+
+/* Entry operations - TYPE III */
+
+/* {{{ */
+
+/* rmdir */
+
+int32_t
+changelog_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (rmdir, frame, op_ret, op_errno,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+changelog_rmdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, rmdir, frame, this, loc, xflags, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_rmdir_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->rmdir,
+ loc, xflags, xdata);
+ return 0;
+}
+
+/* unlink */
+
+int32_t
+changelog_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (unlink, frame, op_ret, op_errno,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+changelog_unlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+ CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO (xdata, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, unlink, frame, this, loc, xflags, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_unlink_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->unlink,
+ loc, xflags, xdata);
+ return 0;
+}
+
+/* rename */
+
+int32_t
+changelog_rename_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *buf, struct iatt *preoldparent,
+ struct iatt *postoldparent, struct iatt *prenewparent,
+ struct iatt *postnewparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (rename, frame, op_ret, op_errno,
+ buf, preoldparent, postoldparent,
+ prenewparent, postnewparent, xdata);
+ return 0;
+}
+
+
+int32_t
+changelog_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, rename, frame, this, oldloc, newloc, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_rename_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->rename,
+ oldloc, newloc, xdata);
+ return 0;
+}
+
+/* link */
+
+int32_t
+changelog_link_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (link, frame, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+changelog_link (call_frame_t *frame,
+ xlator_t *this, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+ CHANGELOG_IF_INTERNAL_FOP_THEN_GOTO (xdata, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, link, frame, this, oldloc, newloc, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_link_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->link,
+ oldloc, newloc, xdata);
+ return 0;
+}
+
+/* mkdir */
+
+int32_t
+changelog_mkdir_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (mkdir, frame, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+changelog_mkdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, mkdir, frame, this,
+ loc, mode, umask, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_mkdir_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->mkdir,
+ loc, mode, umask, xdata);
+ return 0;
+}
+
+/* symlink */
+
+int32_t
+changelog_symlink_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (symlink, frame, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+changelog_symlink (call_frame_t *frame, xlator_t *this,
+ const char *linkname, loc_t *loc,
+ mode_t umask, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, symlink, frame, this,
+ linkname, loc, umask, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_symlink_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->symlink,
+ linkname, loc, umask, xdata);
+ return 0;
+}
+
+/* mknod */
+
+int32_t
+changelog_mknod_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (mknod, frame, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+changelog_mknod (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ mode_t mode, dev_t dev, mode_t umask, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, mknod, frame, this,
+ loc, mode, dev, umask, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_mknod_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->mknod,
+ loc, mode, dev, umask, xdata);
+ return 0;
+}
+
+/* creat */
+
+int32_t
+changelog_create_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ fd_t *fd, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_ENTRY);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (create, frame,
+ op_ret, op_errno, fd, inode,
+ buf, preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+changelog_create (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, mode_t mode,
+ mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, create, frame, this, loc,
+ flags, mode, umask, fd, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_create_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->create,
+ loc, flags, mode, umask, fd, xdata);
+ return 0;
+}
+
+/* }}} */
+
+
+/* Metadata modification fops - TYPE II */
+
+/* {{{ */
+
+/* {f}setattr */
+
+int32_t
+changelog_fsetattr_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *preop_stbuf,
+ struct iatt *postop_stbuf, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_METADATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (fsetattr, frame, op_ret, op_errno,
+ preop_stbuf, postop_stbuf, xdata);
+
+ return 0;
+}
+
+int32_t
+changelog_fsetattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, fsetattr,
+ frame, this, fd, stbuf, valid, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_fsetattr_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetattr,
+ fd, stbuf, valid, xdata);
+ return 0;
+
+
+}
+
+int32_t
+changelog_setattr_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *preop_stbuf,
+ struct iatt *postop_stbuf, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_METADATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (setattr, frame, op_ret, op_errno,
+ preop_stbuf, postop_stbuf, xdata);
+
+ return 0;
+}
+
+int32_t
+changelog_setattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, setattr,
+ frame, this, loc, stbuf, valid, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_setattr_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->setattr,
+ loc, stbuf, valid, xdata);
+ return 0;
+}
+
+/* {f}removexattr */
+
+int32_t
+changelog_fremovexattr_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_METADATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+int32_t
+changelog_fremovexattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, fremovexattr,
+ frame, this, fd, name, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_fremovexattr_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->fremovexattr,
+ fd, name, xdata);
+ return 0;
+}
+
+int32_t
+changelog_removexattr_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_METADATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+int32_t
+changelog_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, removexattr, frame, this, loc, name, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_removexattr_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->removexattr,
+ loc, name, xdata);
+ return 0;
+}
+
+/* {f}setxattr */
+
+int32_t
+changelog_setxattr_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_METADATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+int32_t
+changelog_setxattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, setxattr,
+ frame, this, loc, dict, flags, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_setxattr_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->setxattr,
+ loc, dict, flags, xdata);
+ return 0;
+}
+
+int32_t
+changelog_fsetxattr_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv,
+ write, local, CHANGELOG_TYPE_METADATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+int32_t
+changelog_fsetxattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, fsetxattr,
+ frame, this, fd, dict, flags, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_fsetxattr_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetxattr,
+ fd, dict, flags, xdata);
+ return 0;
+}
+
+/* }}} */
+
+
+/* Data modification fops - TYPE I */
+
+/* {{{ */
+
+/* {f}truncate() */
+
+int32_t
+changelog_truncate_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv, write, local, CHANGELOG_TYPE_DATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (truncate, frame,
+ op_ret, op_errno, prebuf, postbuf, xdata);
+ return 0;
+}
+
+int32_t
+changelog_truncate (call_frame_t *frame,
+ xlator_t *this, loc_t *loc, off_t offset, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, truncate, frame, this, loc, offset, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_truncate_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->truncate,
+ loc, offset, xdata);
+ return 0;
+}
+
+int32_t
+changelog_ftruncate_cbk (call_frame_t *frame,
+ void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret < 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv, write, local, CHANGELOG_TYPE_DATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (ftruncate, frame,
+ op_ret, op_errno, prebuf, postbuf, xdata);
+ return 0;
+}
+
+int32_t
+changelog_ftruncate (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, off_t offset, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, ftruncate, frame, this, fd, offset, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_ftruncate_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->ftruncate,
+ fd, offset, xdata);
+ return 0;
+}
+
+/* writev() */
+
+int32_t
+changelog_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf,
+ dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ CHANGELOG_COND_GOTO (priv, ((op_ret <= 0) || !local), unwind);
+
+ CHANGELOG_INVOKE_CFOP (this, priv, write, local, CHANGELOG_TYPE_DATA);
+
+ unwind:
+ CHANGELOG_STACK_UNWIND (writev, frame,
+ op_ret, op_errno, prebuf, postbuf, xdata);
+ return 0;
+}
+
+int32_t
+changelog_writev (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
+{
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+
+ CHANGELOG_NOT_ACTIVE_THEN_GOTO (frame, priv, wind);
+
+ CHANGELOG_INVOKE_FOP (priv, writev, frame, this, fd,
+ vector, count, offset, flags, iobref, xdata);
+
+ wind:
+ STACK_WIND (frame, changelog_writev_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->writev, fd, vector,
+ count, offset, flags, iobref, xdata);
+ return 0;
+}
+
+/* }}} */
+
+/**
+ * The
+ * - @init ()
+ * - @fini ()
+ * - @reconfigure ()
+ * ... and helper routines
+ */
+
+/**
+ * needed if there are more operation modes in the future.
+ */
+static void
+changelog_assign_opmode (changelog_priv_t *priv, char *mode)
+{
+ if ( strncmp (mode, "realtime", 8) == 0 ) {
+ priv->op_mode = CHANGELOG_MODE_RT;
+ }
+}
+
+static void
+changelog_assign_encoding (changelog_priv_t *priv, char *enc)
+{
+ if ( strncmp (enc, "binary", 6) == 0 ) {
+ priv->encode_mode = CHANGELOG_ENCODE_BINARY;
+ } else if ( strncmp (enc, "ascii", 5) == 0 ) {
+ priv->encode_mode = CHANGELOG_ENCODE_ASCII;
+ }
+}
+
+static void
+changelog_assign_policy (changelog_priv_t *priv, char *pol)
+{
+ if ( strncmp (pol, "default", 7) == 0 )
+ priv->policy = CHANGELOG_LOG_POLICY_DEFAULT;
+ else if ( strncmp (pol, "replication", 11) == 0 )
+ priv->policy = CHANGELOG_LOG_POLICY_REPLICATE;
+}
+
+/* cleanup any helper threads that are running */
+static void
+changelog_cleanup_helper_threads (xlator_t *this, changelog_priv_t *priv)
+{
+ if (priv->cr.rollover_th) {
+ changelog_thread_cleanup (this, priv->cr.rollover_th);
+ priv->cr.rollover_th = 0;
+ }
+
+ if (priv->cf.fsync_th) {
+ changelog_thread_cleanup (this, priv->cf.fsync_th);
+ priv->cf.fsync_th = 0;
+ }
+}
+
+/* spawn helper thread; cleaning up in case of errors */
+static int
+changelog_spawn_helper_threads (xlator_t *this, changelog_priv_t *priv)
+{
+ int ret = 0;
+
+ priv->cr.this = this;
+ if (priv->rollover_time) {
+ ret = pthread_create (&priv->cr.rollover_th,
+ NULL, changelog_rollover, priv);
+ if (ret)
+ goto out;
+ }
+
+ if (priv->fsync_interval) {
+ priv->cf.this = this;
+ ret = pthread_create (&priv->cf.fsync_th,
+ NULL, changelog_fsync_thread, priv);
+ }
+
+ if (ret)
+ changelog_cleanup_helper_threads (this, priv);
+
+ out:
+ return ret;
+}
+
+/* cleanup the notifier thread */
+static int
+changelog_cleanup_notifier (xlator_t *this, changelog_priv_t *priv)
+{
+ int ret = 0;
+
+ if (priv->cn.notify_th) {
+ changelog_thread_cleanup (this, priv->cn.notify_th);
+ priv->cn.notify_th = 0;
+
+ ret = close (priv->wfd);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "error closing writer end of notifier pipe"
+ " (reason: %s)", strerror (errno));
+ }
+
+ return ret;
+}
+
+/* spawn the notifier thread - nop if already running */
+static int
+changelog_spawn_notifier (xlator_t *this, changelog_priv_t *priv)
+{
+ int ret = 0;
+ int flags = 0;
+ int pipe_fd[2] = {0, 0};
+
+ if (priv->cn.notify_th)
+ goto out; /* notifier thread already running */
+
+ ret = pipe (pipe_fd);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Cannot create pipe (reason: %s)", strerror (errno));
+ goto out;
+ }
+
+ /* writer is non-blocking */
+ flags = fcntl (pipe_fd[1], F_GETFL);
+ flags |= O_NONBLOCK;
+
+ ret = fcntl (pipe_fd[1], F_SETFL, flags);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set O_NONBLOCK flag");
+ goto out;
+ }
+
+ priv->wfd = pipe_fd[1];
+
+ priv->cn.this = this;
+ priv->cn.rfd = pipe_fd[0];
+
+ ret = pthread_create (&priv->cn.notify_th,
+ NULL, changelog_notifier, priv);
+
+ out:
+ return ret;
+}
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init (this, gf_changelog_mt_end + 1);
+
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_WARNING, "Memory accounting"
+ " init failed");
+ return ret;
+ }
+
+ return ret;
+}
+
+static int
+changelog_init (xlator_t *this, changelog_priv_t *priv)
+{
+ int i = 0;
+ int ret = -1;
+ char *cname = NULL;
+ struct timeval tv = {0,};
+
+ ret = gettimeofday (&tv, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "gettimeofday() failure");
+ goto out;
+ }
+
+ priv->slice.tv_start = tv;
+
+ priv->maps[CHANGELOG_TYPE_DATA] = "D ";
+ priv->maps[CHANGELOG_TYPE_METADATA] = "M ";
+ priv->maps[CHANGELOG_TYPE_ENTRY] = "E ";
+
+ for (; i < CHANGELOG_MAX_TYPE; i++) {
+ /* start with version 1 */
+ priv->slice.changelog_version[i] = 1;
+ }
+
+ if (!priv->active)
+ return ret;
+
+ /* spawn the notifier thread */
+ ret = changelog_spawn_notifier (this, priv);
+ if (ret)
+ goto out;
+
+ cname = CHANGELOG_FNAME_FROM_POLICY (priv->cp);
+
+ LOCK (&priv->lock);
+ {
+ ret = CHANGELOG_INVOKE_CFOP (this, priv,
+ open, cname, _gf_false);
+ }
+ UNLOCK (&priv->lock);
+
+ if (ret)
+ goto out;
+
+ /* ... and finally spawn the helpers threads */
+ ret = changelog_spawn_helper_threads (this, priv);
+
+ out:
+ return ret;
+}
+
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+ int ret = 0;
+ char *tmp = NULL;
+ char *cname = NULL;
+ changelog_priv_t *priv = NULL;
+ gf_boolean_t active_earlier = _gf_true;
+ gf_boolean_t active_now = _gf_true;
+ changelog_time_slice_t *slice = NULL;
+
+ priv = this->private;
+ if (!priv)
+ goto out;
+
+ ret = -1;
+ active_earlier = priv->active;
+
+ /* first stop the rollover and the fsync thread */
+ changelog_cleanup_helper_threads (this, priv);
+
+ GF_OPTION_RECONF ("changelog-dir", tmp, options, str, out);
+ if (!tmp) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "\"changelog-dir\" option is not set");
+ goto out;
+ }
+
+ GF_FREE (priv->changelog_dir);
+ priv->changelog_dir = gf_strdup (tmp);
+ if (!priv->changelog_dir)
+ goto out;
+
+ ret = mkdir_p (priv->changelog_dir, 0600, _gf_true);
+ if (ret)
+ goto out;
+
+ GF_OPTION_RECONF ("changelog", active_now, options, bool, out);
+
+ /**
+ * changelog_handle_change() handles changes that could possibly
+ * have been submit changes before changelog deactivation.
+ */
+ if (!active_now)
+ priv->active = _gf_false;
+
+ GF_OPTION_RECONF ("op-mode", tmp, options, str, out);
+ changelog_assign_opmode (priv, tmp);
+
+ tmp = NULL;
+
+ GF_OPTION_RECONF ("encoding", tmp, options, str, out);
+ changelog_assign_encoding (priv, tmp);
+
+ GF_OPTION_RECONF ("rollover-time",
+ priv->rollover_time, options, int32, out);
+ GF_OPTION_RECONF ("fsync-interval",
+ priv->fsync_interval, options, int32, out);
+
+ if (active_now || active_earlier) {
+ slice = &priv->slice;
+ cname = CHANGELOG_FNAME_FROM_POLICY (priv->cp);
+
+ LOCK (&priv->lock);
+ {
+ ret = CHANGELOG_INVOKE_CFOP (this, priv, rollover,
+ cname, !active_now);
+ if (!ret && active_now)
+ SLICE_VERSION_UPDATE (slice);
+ }
+ UNLOCK (&priv->lock);
+
+ if (ret)
+ goto out;
+
+ if (active_now) {
+ ret = changelog_spawn_notifier (this, priv);
+ if (!ret)
+ ret = changelog_spawn_helper_threads (this,
+ priv);
+ } else
+ ret = changelog_cleanup_notifier (this, priv);
+ }
+
+ out:
+ if (ret) {
+ ret = changelog_cleanup_notifier (this, priv);
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "changelog reconfigured");
+ if (active_now)
+ priv->active = _gf_true;
+ }
+
+ return ret;
+}
+
+int32_t
+init (xlator_t *this)
+{
+ int ret = -1;
+ char *tmp = NULL;
+ changelog_priv_t *priv = NULL;
+
+ GF_VALIDATE_OR_GOTO ("changelog", this, out);
+
+ if (!this->children || this->children->next) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "translator needs a single subvolume");
+ goto out;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "dangling volume. please check volfile");
+ goto out;
+ }
+
+ priv = GF_CALLOC (1, sizeof (*priv), gf_changelog_mt_priv_t);
+ if (!priv)
+ goto out;
+
+ this->local_pool = mem_pool_new (changelog_local_t, 64);
+ if (!this->local_pool) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to create local memory pool");
+ goto out;
+ }
+
+ LOCK_INIT (&priv->lock);
+
+ GF_OPTION_INIT ("changelog-brick", tmp, str, out);
+ if (!tmp) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "\"changelog-brick\" option is not set");
+ goto out;
+ }
+
+ priv->changelog_brick = gf_strdup (tmp);
+ if (!priv->changelog_brick)
+ goto out;
+ tmp = NULL;
+
+ GF_OPTION_INIT ("changelog-dir", tmp, str, out);
+ if (!tmp) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "\"changelog-dir\" option is not set");
+ goto out;
+ }
+
+ priv->changelog_dir = gf_strdup (tmp);
+ if (!priv->changelog_dir)
+ goto out;
+ tmp = NULL;
+
+ /**
+ * create the directory even if change-logging would be inactive
+ * so that consumers can _look_ into it (finding nothing...)
+ */
+ ret = mkdir_p (priv->changelog_dir, 0600, _gf_true);
+ if (ret)
+ goto out;
+
+ GF_OPTION_INIT ("changelog", priv->active, bool, out);
+
+ GF_OPTION_INIT ("op-mode", tmp, str, out);
+ changelog_assign_opmode (priv, tmp);
+
+ tmp = NULL;
+
+ GF_OPTION_INIT ("encoding", tmp, str, out);
+ changelog_assign_encoding (priv, tmp);
+
+ tmp = NULL;
+
+ GF_OPTION_INIT ("policy", tmp, str, out);
+ changelog_assign_policy (priv, tmp);
+
+ GF_OPTION_INIT ("fsync-interval", priv->fsync_interval, int32, out);
+
+ GF_ASSERT (cb_encoder[priv->encode_mode].encoder == priv->encode_mode);
+ priv->ce = &cb_encoder[priv->encode_mode];
+
+ GF_ASSERT (cb_bootstrap[priv->op_mode].mode == priv->op_mode);
+ priv->cb = &cb_bootstrap[priv->op_mode];
+
+ GF_ASSERT (cb_policy[priv->policy].policy == priv->policy);
+ priv->cp = &cb_policy[priv->policy];
+
+
+ /* ... init logging policy */
+ ret = priv->cp->init_policy (this, priv, priv->cp);
+ if (ret)
+ goto out;
+
+ /* ... now bootstrap the logger */
+ ret = priv->cb->ctor (this, &priv->cd, priv->lockless_update);
+ if (ret)
+ goto out;
+
+ /* override the value if set */
+ if (dict_get (this->options, "rollover-time")) {
+ ret = dict_get_int32 (this->options,
+ "rollover-time", &priv->rollover_time);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Cannot get value for \"rollover-time\"");
+ goto out;
+ }
+ }
+
+ priv->changelog_fd = -1;
+ ret = changelog_init (this, priv);
+ if (ret)
+ goto out;
+
+ gf_log (this->name, GF_LOG_DEBUG, "changelog translator loaded");
+
+ out:
+ if (ret) {
+ if (this->local_pool)
+ mem_pool_destroy (this->local_pool);
+ if (priv->cb) {
+ ret = priv->cb->dtor (this, &priv->cd);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "error in cleanup during init()");
+ }
+ GF_FREE (priv->changelog_brick);
+ GF_FREE (priv->changelog_dir);
+ GF_FREE (priv);
+ this->private = NULL;
+ } else
+ this->private = priv;
+
+ return ret;
+}
+
+void
+fini (xlator_t *this)
+{
+ int ret = -1;
+ changelog_priv_t *priv = NULL;
+
+ priv = this->private;
+
+ if (priv) {
+ ret = priv->cb->dtor (this, &priv->cd);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "error in fini");
+ mem_pool_destroy (this->local_pool);
+ GF_FREE (priv->changelog_brick);
+ GF_FREE (priv->changelog_dir);
+ GF_FREE (priv);
+ }
+
+ this->private = NULL;
+
+ return;
+}
+
+struct xlator_fops fops = {
+ .mknod = changelog_mknod,
+ .mkdir = changelog_mkdir,
+ .create = changelog_create,
+ .symlink = changelog_symlink,
+ .writev = changelog_writev,
+ .truncate = changelog_truncate,
+ .ftruncate = changelog_ftruncate,
+ .link = changelog_link,
+ .rename = changelog_rename,
+ .unlink = changelog_unlink,
+ .rmdir = changelog_rmdir,
+ .setattr = changelog_setattr,
+ .fsetattr = changelog_fsetattr,
+ .setxattr = changelog_setxattr,
+ .fsetxattr = changelog_fsetxattr,
+ .removexattr = changelog_removexattr,
+ .fremovexattr = changelog_fremovexattr,
+};
+
+struct xlator_cbks cbks = {
+ .forget = changelog_forget,
+};
+
+struct volume_options options[] = {
+ {.key = {"changelog"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "enable/disable change-logging"
+ },
+ {.key = {"changelog-brick"},
+ .type = GF_OPTION_TYPE_PATH,
+ .description = "brick path to generate unique socket file name."
+ " should be the export directory of the volume strictly."
+ },
+ {.key = {"changelog-dir"},
+ .type = GF_OPTION_TYPE_PATH,
+ .description = "directory for the changelog files"
+ },
+ {.key = {"op-mode"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "realtime",
+ .value = {"realtime"},
+ .description = "operation mode - futuristic operation modes"
+ },
+ {.key = {"encoding"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "ascii",
+ .value = {"binary", "ascii"},
+ .description = "encoding type for changelogs"
+ },
+ {.key = {"rollover-time"},
+ .description = "time to switch to a new changelog file (in seconds)"
+ },
+ {.key = {"fsync-interval"},
+ .type = GF_OPTION_TYPE_TIME,
+ .default_value = "5",
+ .description = "do not open CHANGELOG file with O_SYNC mode."
+ " instead perform fsync() at specified intervals"
+ },
+ {.key = {"policy"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "replication",
+ .value = {"default", "replication"},
+ .description = "Logging policies"
+ },
+ {.key = {NULL}
+ },
+};
diff --git a/xlators/features/changelog/src/policy/changelog-policy-default.c b/xlators/features/changelog/src/policy/changelog-policy-default.c
new file mode 100644
index 000000000..eaa3d107f
--- /dev/null
+++ b/xlators/features/changelog/src/policy/changelog-policy-default.c
@@ -0,0 +1,45 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "changelog-policy.h"
+#include "changelog-fops.h"
+
+int
+changelog_default_policy_init (xlator_t *this,
+ changelog_priv_t *priv,
+ struct changelog_logpolicy *cp)
+{
+ priv->rollover_time = 15;
+
+ priv->no_gfid_hdr = _gf_false;
+ priv->lockless_update = _gf_false;
+
+ cp->cpriv = GF_CALLOC (1, sizeof (off_t),
+ gf_changelog_mt_fop_policy_t);
+ if (!cp->cpriv)
+ return -1;
+
+ (void) memset (cp->changelog_name, '\0', PATH_MAX);
+ (void) memcpy (cp->changelog_name,
+ CHANGELOG_FILE_NAME, strlen (CHANGELOG_FILE_NAME));
+
+ cp->fops = &changelog_default_fops; /* default logging policy */
+ cp->cops = &changelog_default_cops; /* default changelog operations */
+
+ return 0;
+}
+
+int
+changelog_default_policy_fini (xlator_t *this,
+ struct changelog_logpolicy *cp)
+{
+ GF_FREE (cp->cpriv);
+ return 0;
+}
diff --git a/xlators/features/changelog/src/policy/changelog-policy-replication.c b/xlators/features/changelog/src/policy/changelog-policy-replication.c
new file mode 100644
index 000000000..29c049716
--- /dev/null
+++ b/xlators/features/changelog/src/policy/changelog-policy-replication.c
@@ -0,0 +1,1374 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "changelog-policy.h"
+#include "changelog-encoders.h"
+#include "changelog-fops.h"
+
+#define JOURNAL_NAME "TERM"
+
+#define JOURNAL_SECTOR_SIZE 128
+
+#define PRE_OP_MARK 0x5F4552505FULL /* _PRE_ */
+#define POST_OP_MARK 0x5F54534F505FULL /* _POST_ */
+
+/* similar to fop_fn, but... */
+size_t
+int32_fn (void *data, char *buffer, gf_boolean_t encode)
+{
+ size_t bufsz = 0;
+ int nr = 0;
+ char buf[20] = {0,};
+
+ nr = *(int *) data;
+
+ if (encode) {
+ (void) snprintf (buf, sizeof (buf), "%d", nr);
+ CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf));
+ } else
+ CHANGELOG_FILL_BUFFER (buffer, bufsz,
+ &nr, sizeof (int));
+
+ return bufsz;
+}
+
+
+size_t
+uint32_fn (void *data, char *buffer, gf_boolean_t encode)
+{
+ size_t bufsz = 0;
+ unsigned int nr = 0;
+ char buf[20] = {0,};
+
+ nr = *(unsigned int *) data;
+
+ if (encode) {
+ (void) snprintf (buf, sizeof (buf), "%u", nr);
+ CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf));
+ } else
+ CHANGELOG_FILL_BUFFER (buffer, bufsz,
+ &nr, sizeof (unsigned int));
+
+ return bufsz;
+}
+
+size_t
+number_fn (void *data, char *buffer, gf_boolean_t encode)
+{
+ char buf[1024] = {0,};
+ size_t bufsz = 0;
+ unsigned long long nr = 0;
+
+ nr = *(unsigned long long *) data;
+
+ if (encode) {
+ (void) snprintf (buf, sizeof (buf), "%llu", nr);
+ CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf));
+ } else
+ CHANGELOG_FILL_BUFFER (buffer, bufsz,
+ &nr, sizeof (unsigned long long));
+
+ return bufsz;
+}
+
+size_t
+uuid_fn (void *data, char *buffer, gf_boolean_t encode)
+{
+ char buf[1024] = {0,};
+ uuid_t uuid = {0,};
+ size_t bufsz = 0;
+
+ memcpy (uuid, (uuid_t *) data, sizeof (uuid_t));
+
+ if (encode) {
+ char *tmpbuf = uuid_utoa (uuid);
+ (void) snprintf (buf, sizeof (buf), "%s", tmpbuf);
+ CHANGELOG_FILL_BUFFER (buffer, bufsz, buf, strlen (buf));
+ } else
+ CHANGELOG_FILL_BUFFER (buffer, bufsz, uuid, sizeof (uuid_t));
+
+ return bufsz;
+}
+
+#define CHANGELOG_FILL_USIGNLL(co, number, converter, xlen) do { \
+ co->co_convert = converter; \
+ co->co_free = NULL; \
+ co->co_type = CHANGELOG_OPT_REC_ULL; \
+ co->co_number = (unsigned long long) number; \
+ xlen += sizeof (unsigned long long); \
+ if (!co->co_convert) \
+ co->co_len = sizeof (unsigned long long); \
+ } while (0)
+
+#define CHANGELOG_FILL_UUID(co, uuid, converter, xlen) do { \
+ co->co_convert = converter; \
+ co->co_free = NULL; \
+ co->co_type = CHANGELOG_OPT_REC_UUID; \
+ uuid_copy (co->co_uuid, uuid); \
+ xlen += sizeof (uuid_t); \
+ } while (0)
+
+
+/* TBD: move declarations here and nsr.c into a common place */
+#define NSR_TERM_XATTR "trusted.nsr.term"
+#define NSR_INDEX_XATTR "trusted.nsr.index"
+#define RECON_TERM_XATTR "trusted.nsr.recon-term"
+#define RECON_INDEX_XATTR "trusted.nsr.recon-index"
+
+static gf_boolean_t
+changelog_fix_term(xlator_t *this,
+ changelog_local_t *local,
+ dict_t *xdata)
+{
+ int32_t old_term, new_term;
+ uint32_t index;
+ changelog_priv_t *priv = this->private;
+ int ret = 0;
+ char nfile[PATH_MAX] = {0,};
+ int32_t recon_term, recon_index;
+ changelog_rollover_data_t crd;
+
+ // If coming via the regular IO path, we should get the dict "nsr-term"
+ // If coming via reconciliation, we should get the dicts "nsr-recon-term"
+ // that indicates the term and "nsr-recon-index" for the index
+ if ((dict_get_int32(xdata,NSR_TERM_XATTR,&new_term) == 0) &&
+ (dict_get_uint32(xdata, NSR_INDEX_XATTR, &index) == 0)) {
+ old_term = priv->term;
+
+ if (old_term != new_term) {
+ GF_ASSERT(new_term > old_term);
+ LOCK (&priv->lock);
+ priv->term = new_term;
+ (void) snprintf (nfile, PATH_MAX, "%s.%d",
+ JOURNAL_NAME, priv->term);
+ ret = CHANGELOG_INVOKE_CFOP(this, priv, rollover,
+ nfile, _gf_false);
+ UNLOCK (&priv->lock);
+ if (ret != 0)
+ return _gf_false;
+ }
+ local->nr_bytes = 0;
+ local->lu.val = index;
+ } else if ((dict_get_int32(xdata, RECON_TERM_XATTR, &recon_term) == 0) &&
+ (dict_get_int32(xdata, RECON_INDEX_XATTR, &recon_index) == 0)) {
+
+ old_term = priv->term;
+
+ if (old_term != recon_term) {
+ LOCK (&priv->lock);
+ {
+ priv->term = recon_term;
+ (void) snprintf (crd.crd_changelog_name,
+ PATH_MAX, "%s.%d",
+ JOURNAL_NAME, priv->term);
+ crd.crd_prealloc_size = 1<<29;
+ ret = changelog_open(this, priv, local, &crd);
+ }
+ UNLOCK (&priv->lock);
+ if (ret != 0)
+ return _gf_false;
+ }
+
+ local->nr_bytes = 0;
+ local->lu.val = recon_index;
+ } else {
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
+/**
+ * Replication policy records journal entries in the FOP path. This is
+ * quite different that the default policy (used by geo-replication),
+ * which journals records in the callback path on a successfull posix
+ * operation. Additionally, each record starts with a PRE-OP marker and
+ * the index number generated by the leader.
+ * (c.f. nsr_$NAME$() ~/xlator/cluster/nsr/nsr-server/src/all-templates.c)
+ *
+ * POST-OPs are marked asynchronously and not during in the callback path
+ * Marking it in the callback path is incorrect as the actual FOP may not
+ * have been synchronized to the disk. Therefore, POST op marking is done
+ * after a successful file system sync, which is trigerred periodically
+ * by NSR server component. To keep journal updates strictly sequential,
+ * POST-OPs are separate record in the journal.
+ */
+
+/**
+ * Override File Operations
+ *
+ * NOTE: Since journal updates are done in the FOP path, there is no
+ * actual use of @local in cbk. Therefore, @local could have been
+ * declared statically for each FOP (which would remove the overhead
+ * of allocating (here) and deallocating (in cbk)). Let's not do that
+ * and keep it this way for now. Will worry about it later (for code
+ * reasonability and performance).
+ */
+
+int32_t
+changelog_replication_rmdir (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID + UID + GID + Entry */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 7);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, loc->inode->gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->gid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 7);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+int32_t
+changelog_replication_unlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflags, dict_t *xdata)
+{
+ return changelog_replication_rmdir (frame, this, loc, xflags, xdata);
+}
+
+int32_t
+changelog_replication_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID + UID + GID + OLDLOC + NEWLOC */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, oldloc->inode->gfid, 8);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, oldloc->inode->gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->gid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_ENTRY (co, oldloc->pargfid, oldloc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+ co++;
+
+ CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 8);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+int32_t
+changelog_replication_link (call_frame_t *frame,
+ xlator_t *this, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID + UID + GID + Entry */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, oldloc->gfid, 7);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, oldloc->gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->gid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_ENTRY (co, newloc->pargfid, newloc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 7);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+int32_t
+changelog_replication_mkdir (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ mode_t mode, mode_t umask, dict_t *xdata)
+{
+ int ret = -1;
+ uuid_t gfid = {0,};
+ void *uuid_req = NULL;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get gfid from dict");
+ goto out;
+ }
+ uuid_copy (gfid, uuid_req);
+
+ ret = -1;
+
+ /* <PRE> + IDX + FOP + GFID + MODE + UID + GID + Entry */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 8);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, mode | S_IFDIR, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->gid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 8);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+int32_t
+changelog_replication_symlink (call_frame_t *frame, xlator_t *this,
+ const char *linkname, loc_t *loc,
+ mode_t umask, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ uuid_t gfid = {0,};
+ void *uuid_req = NULL;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get gfid from dict");
+ goto out;
+ }
+ uuid_copy (gfid, uuid_req);
+
+ ret = -1;
+
+ /* <PRE> + IDX + FOP + GFID + LINKNAME + UID + GID + Entry */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 8);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_NAME (co, linkname, entry_free_fn, xtra_len, out);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->gid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 8);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+int32_t
+changelog_replication_mknod (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ mode_t mode, dev_t dev,
+ mode_t umask, dict_t *xdata)
+{
+ int ret = -1;
+ uuid_t gfid = {0,};
+ void *uuid_req = NULL;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get gfid from dict");
+ goto out;
+ }
+ uuid_copy (gfid, uuid_req);
+
+ ret = -1;
+
+ /* <PRE> + IDX + FOP + GFID + MODE + UID + GID + DEV + Entry */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 9);
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term (this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, mode, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->gid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_USIGNLL (co, dev, number_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 9);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+int32_t
+changelog_replication_create (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int32_t flags, mode_t mode,
+ mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ int ret = -1;
+ uuid_t gfid = {0,};
+ void *uuid_req = NULL;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ ret = dict_get_ptr (xdata, "gfid-req", &uuid_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "failed to get gfid from dict");
+ goto out;
+ }
+ uuid_copy (gfid, uuid_req);
+
+ ret = -1;
+
+ /* <PRE> + IDX + FOP + GFID + MODE + UID + GID + ENTRY */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, 8);
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term (this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, mode, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, frame->root->gid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_ENTRY (co, loc->pargfid, loc->name,
+ entry_fn, entry_free_fn, xtra_len, out);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 8);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_ENTRY);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+static int
+_changelog_setattr_fill_common (call_frame_t *frame, xlator_t *this,
+ int32_t attr, struct iatt *stbuf,
+ uuid_t gfid, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ int used_count = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ used_count = 7;
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, gfid, used_count);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ /**
+ * - <PRE>
+ * - IDX
+ * - FOP
+ * - GFID
+ * - Valid flag
+ * GF_SET_ATTR_MODE [chmod]
+ * ->ia_prot
+ * ->ia_type
+ * GF_SET_ATTR_UID | GF_SET_ATTR_GID [chown]
+ * ->ia_uid
+ * ->ia_gid
+ * GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME [utimes]
+ * ->ia_atime
+ * ->ia_mtime
+ */
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, attr, uint32_fn, xtra_len);
+ co++;
+
+ if (attr & GF_SET_ATTR_MODE) {
+ mode_t mode = 0;
+
+ /* ->ia_prot & ->ia_type stored as a consolidated value */
+ used_count--;
+ mode = st_mode_from_ia (stbuf->ia_prot, stbuf->ia_type);
+
+ CHANGELOG_FILL_UINT32 (co, mode, uint32_fn, xtra_len);
+
+ } else if (attr & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)) {
+ uid_t uid = -1;
+ gid_t gid = -1;
+
+ if (attr & GF_SET_ATTR_UID)
+ uid = stbuf->ia_uid;
+ if (attr & GF_SET_ATTR_GID)
+ gid = stbuf->ia_gid;
+
+ /* ->ia_uid & ->ia_gid */
+ CHANGELOG_FILL_INT32 (co, uid, int32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_INT32 (co, gid, int32_fn, xtra_len);
+
+ } else if (attr & (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME)) {
+
+ /* ->ia_atime & ->ia_mtime, need usecs? */
+ CHANGELOG_FILL_UINT32 (co,
+ stbuf->ia_atime, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co,
+ stbuf->ia_mtime, uint32_fn, xtra_len);
+ }
+
+ changelog_set_usable_record_and_length (local, xtra_len, used_count);
+
+ ret = 0;
+ frame->local = local;
+
+ changelog_update (this, priv, frame->local, CHANGELOG_TYPE_METADATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+
+ return ret;
+}
+
+int32_t
+changelog_replication_fsetattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd,
+ struct iatt *stbuf, int32_t valid,
+ dict_t *xdata)
+{
+ return _changelog_setattr_fill_common (frame, this, valid,
+ stbuf, fd->inode->gfid, xdata);
+}
+
+int32_t
+changelog_replication_setattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid, dict_t *xdata)
+{
+ return _changelog_setattr_fill_common (frame, this, valid,
+ stbuf, loc->inode->gfid, xdata);
+}
+
+int32_t
+changelog_replication_fremovexattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_opt_t *co = NULL;
+ changelog_priv_t *priv = NULL;
+ changelog_local_t *local = NULL;
+ int32_t xattr_op;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, fd->inode->gfid, 4);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ if (dict_get_int32(xdata, "recon-xattr-opcode", &xattr_op) == 0)
+ CHANGELOG_FILL_FOP_NUMBER (co, (glusterfs_fop_t)xattr_op,
+ fop_fn, xtra_len);
+ else
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn,
+ xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, fd->inode->gfid, uuid_fn, xtra_len);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 4);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+int32_t
+changelog_replication_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+ int32_t xattr_op;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 4);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ if (dict_get_int32(xdata, "recon-xattr-opcode", &xattr_op) == 0)
+ CHANGELOG_FILL_FOP_NUMBER (co, (glusterfs_fop_t)xattr_op,
+ fop_fn, xtra_len);
+ else
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn,
+ xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, loc->inode->gfid, uuid_fn, xtra_len);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 4);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+int32_t
+changelog_replication_setxattr (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+ int32_t xattr_op;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 4);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ if (dict_get_int32(xdata, "recon-xattr-opcode", &xattr_op) == 0)
+ CHANGELOG_FILL_FOP_NUMBER (co, (glusterfs_fop_t)xattr_op,
+ fop_fn, xtra_len);
+ else
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn,
+ xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, loc->inode->gfid, uuid_fn, xtra_len);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 4);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+int32_t
+changelog_replication_fsetxattr (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+ int32_t xattr_op;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, fd->inode->gfid, 4);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ if (dict_get_int32(xdata, "recon-xattr-opcode", &xattr_op) == 0)
+ CHANGELOG_FILL_FOP_NUMBER (co, (glusterfs_fop_t)xattr_op,
+ fop_fn, xtra_len);
+ else
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn,
+ xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, fd->inode->gfid, uuid_fn, xtra_len);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 4);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_METADATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+int32_t
+changelog_replication_truncate (call_frame_t *frame,
+ xlator_t *this, loc_t *loc,
+ off_t offset, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID + Offset */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, loc->inode->gfid, 5);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, loc->inode->gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_USIGNLL (co, offset, number_fn, xtra_len);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 5);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_DATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+int32_t
+changelog_replication_ftruncate (call_frame_t *frame,
+ xlator_t *this, fd_t *fd,
+ off_t offset, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID + Offset */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, fd->inode->gfid, 5);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, fd->inode->gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_USIGNLL (co, offset, number_fn, xtra_len);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 5);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_DATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+int32_t
+changelog_replication_writev (call_frame_t *frame,
+ xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
+{
+ int ret = -1;
+ size_t xtra_len = 0;
+ changelog_priv_t *priv = NULL;
+ changelog_opt_t *co = NULL;
+ changelog_local_t *local = NULL;
+
+ priv = this->private;
+
+ /* <PRE> + IDX + FOP + GFID + Offset + Length */
+ CHANGELOG_INIT_NOCHECK (this, local, NULL, fd->inode->gfid, 6);
+ if (!local)
+ goto out;
+
+ co = changelog_get_usable_buffer (local);
+ if (!co)
+ goto out;
+
+ if (changelog_fix_term(this, local, xdata) == _gf_false)
+ goto out;
+
+ CHANGELOG_FILL_USIGNLL (co, PRE_OP_MARK, NULL, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UINT32 (co, local->lu.val, uint32_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_FOP_NUMBER (co, frame->root->op, fop_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_UUID (co, fd->inode->gfid, uuid_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_USIGNLL (co, offset, number_fn, xtra_len);
+ co++;
+
+ CHANGELOG_FILL_USIGNLL (co, iov_length (vector, count),
+ number_fn, xtra_len);
+
+ changelog_set_usable_record_and_length (local, xtra_len, 6);
+
+ frame->local = local;
+ ret = 0;
+
+ changelog_update (this, priv, local, CHANGELOG_TYPE_DATA);
+
+ out:
+ if (ret)
+ changelog_local_cleanup (this, local);
+ return ret;
+}
+
+/* overriden COPS */
+int
+changelog_replication_cops_open (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv,
+ char *name, gf_boolean_t last)
+{
+ changelog_local_t local = {0,};
+ changelog_log_data_t cld = {0,};
+ changelog_rollover_data_t *crd = NULL;
+
+ crd = &cld.cld_roll;
+
+ cld.cld_type = CHANGELOG_TYPE_ROLLOVER;
+
+ crd->crd_finale = last;
+ crd->crd_use_suffix = _gf_false;
+ crd->crd_prealloc_size = 1<<29; /* preallocate 512 MB */
+
+
+ (void) strcpy (crd->crd_changelog_name, name);
+
+ local.lu.val = 0;
+ local.nr_bytes = 0;
+
+ return changelog_inject_single_event (this, priv, &local, &cld);
+}
+
+/**
+ * NO-OP changelog write (from changelog.c). Records are journaled
+ * in the FOP path.
+ */
+int
+changelog_replication_cops_write (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv,
+ changelog_local_t *local,
+ changelog_log_type type)
+{
+ return 0;
+}
+
+/**
+ * no implicit rollover
+ */
+int
+changelog_replication_cops_rollover (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv,
+ char *name, gf_boolean_t last)
+{
+ return changelog_replication_cops_open (this, priv, cpriv, name, last);
+}
+
+off_t
+changelog_replication_cops_get_offset (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv,
+ changelog_local_t *local)
+{
+ if (!local)
+ return 0;
+
+ return (local->lu.val * JOURNAL_SECTOR_SIZE) + local->nr_bytes;
+}
+
+void
+changelog_replication_cops_set_offset (xlator_t *this,
+ changelog_priv_t *priv, void *cpriv,
+ changelog_local_t *local, off_t bytes)
+{
+ local->nr_bytes += bytes;
+}
+
+void
+changelog_replication_cops_reset_offset (xlator_t *this, changelog_priv_t *priv,
+ void *cpriv, changelog_local_t *local)
+{
+ return;
+}
+
+int
+changelog_replication_policy_init (xlator_t *this,
+ changelog_priv_t *priv,
+ struct changelog_logpolicy *cp)
+{
+ struct xlator_fops *r_fops = NULL;
+ struct changelog_ops *r_cops = NULL;
+
+ r_fops = GF_CALLOC (1, sizeof (struct xlator_fops),
+ gf_changelog_mt_fop_policy_t);
+ if (!r_fops)
+ return -1;
+
+ r_cops = GF_CALLOC (1, sizeof (struct changelog_ops),
+ gf_changelog_mt_fop_policy_t);
+ if (!r_cops) {
+ GF_FREE (r_fops);
+ return -1;
+ }
+
+ cp->cpriv = GF_CALLOC (1, sizeof (off_t),
+ gf_changelog_mt_fop_policy_t);
+ if (!cp->cpriv) {
+ GF_FREE (r_fops);
+ GF_FREE (r_cops);
+ return -1;
+ }
+
+ /* no roll-over, one big fat journal per term */
+ priv->rollover_time = 0;
+
+ /* fsync() is internally trigerred by NSR */
+ priv->fsync_interval = 0;
+
+ /* no record header: extra data (via iobufs) are always persisted */
+ priv->no_gfid_hdr = _gf_true;
+
+ priv->lockless_update = _gf_false;
+
+ memcpy (r_fops, &changelog_default_fops, sizeof (struct xlator_fops));
+ memcpy (r_cops, &changelog_default_cops, sizeof (struct changelog_ops));
+
+ priv->term = 0;
+ (void) memset (cp->changelog_name, '\0', PATH_MAX);
+ memcpy(cp->changelog_name, JOURNAL_NAME, strlen(JOURNAL_NAME));
+#if 0
+ (void) snprintf (cp->changelog_name, PATH_MAX,
+ JOURNAL_NAME, priv->term);
+#endif
+
+ /* overload all fops */
+ r_fops->writev = changelog_replication_writev;
+ r_fops->ftruncate = changelog_replication_ftruncate;
+ r_fops->truncate = changelog_replication_truncate;
+ r_fops->fsetxattr = changelog_replication_fsetxattr;
+ r_fops->setxattr = changelog_replication_setxattr;
+ r_fops->removexattr = changelog_replication_removexattr;
+ r_fops->fremovexattr = changelog_replication_fremovexattr;
+ r_fops->setattr = changelog_replication_setattr;
+ r_fops->fsetattr = changelog_replication_fsetattr;
+ r_fops->create = changelog_replication_create;
+ r_fops->mknod = changelog_replication_mknod;
+ r_fops->symlink = changelog_replication_symlink;
+ r_fops->mkdir = changelog_replication_mkdir;
+ r_fops->link = changelog_replication_link;
+ r_fops->rename = changelog_replication_rename;
+ r_fops->unlink = changelog_replication_unlink;
+ r_fops->rmdir = changelog_replication_rmdir;
+
+ /* overload cops */
+ r_cops->open = changelog_replication_cops_open;
+ r_cops->write = changelog_replication_cops_write;
+ r_cops->rollover = changelog_replication_cops_rollover;
+ r_cops->get_offset = changelog_replication_cops_get_offset;
+ r_cops->set_offset = changelog_replication_cops_set_offset;
+ r_cops->reset_offset = changelog_replication_cops_reset_offset;
+
+
+ cp->fops = r_fops;
+ cp->cops = r_cops;
+
+ return 0;
+}
+
+int
+changelog_replication_policy_fini (xlator_t *this,
+ struct changelog_logpolicy *cp)
+{
+ GF_FREE (cp->fops);
+ GF_FREE (cp->cops);
+ GF_FREE (cp->cpriv);
+ return 0;
+}
diff --git a/xlators/features/changelog/src/policy/changelog-policy.h b/xlators/features/changelog/src/policy/changelog-policy.h
new file mode 100644
index 000000000..73fdc1a98
--- /dev/null
+++ b/xlators/features/changelog/src/policy/changelog-policy.h
@@ -0,0 +1,41 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CHANGELOG_POLICY_H
+#define _CHANGELOG_POLICY_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "defaults.h"
+#include "logging.h"
+
+#include "changelog-mem-types.h"
+#include "changelog-helpers.h"
+
+int
+changelog_default_policy_init (xlator_t *this,
+ changelog_priv_t *priv,
+ struct changelog_logpolicy *);
+int
+changelog_default_policy_fini (xlator_t *this,
+ struct changelog_logpolicy *);
+int
+changelog_replication_policy_init (xlator_t *this,
+ changelog_priv_t *priv,
+ struct changelog_logpolicy *cp);
+int
+changelog_replication_policy_fini (xlator_t *this,
+ struct changelog_logpolicy *cp);
+
+#endif /* _CHANGELOG_POLICY_H */
diff --git a/xlators/features/compress/Makefile.am b/xlators/features/compress/Makefile.am
new file mode 100644
index 000000000..a985f42a8
--- /dev/null
+++ b/xlators/features/compress/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/compress/src/Makefile.am b/xlators/features/compress/src/Makefile.am
new file mode 100644
index 000000000..263b21b78
--- /dev/null
+++ b/xlators/features/compress/src/Makefile.am
@@ -0,0 +1,17 @@
+xlator_LTLIBRARIES = cdc.la
+
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+noinst_HEADERS = cdc.h cdc-mem-types.h
+
+cdc_la_LDFLAGS = -module -avoid-version $(LIBZ_LIBS)
+
+cdc_la_SOURCES = cdc.c cdc-helper.c
+cdc_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) \
+-shared $(LIBZ_CFLAGS)
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/compress/src/cdc-helper.c b/xlators/features/compress/src/cdc-helper.c
new file mode 100644
index 000000000..54432ff45
--- /dev/null
+++ b/xlators/features/compress/src/cdc-helper.c
@@ -0,0 +1,547 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+
+#include "cdc.h"
+#include "cdc-mem-types.h"
+
+#ifdef HAVE_LIB_Z
+#include "zlib.h"
+#endif
+
+#ifdef HAVE_LIB_Z
+/* gzip header looks something like this
+ * (RFC 1950)
+ *
+ * +---+---+---+---+---+---+---+---+---+---+
+ * |ID1|ID2|CM |FLG| MTIME |XFL|OS |
+ * +---+---+---+---+---+---+---+---+---+---+
+ *
+ * Data is usually sent without this header i.e
+ * Data sent = <compressed-data> + trailer(8)
+ * The trailer contains the checksum.
+ *
+ * gzip_header is added only during debugging.
+ * Refer to the function cdc_dump_iovec_to_disk
+ */
+static const char gzip_header[10] =
+ {
+ '\037', '\213', Z_DEFLATED, 0,
+ 0, 0, 0, 0,
+ 0, GF_CDC_OS_ID
+ };
+
+static int32_t
+cdc_next_iovec (xlator_t *this, cdc_info_t *ci)
+{
+ int ret = -1;
+
+ ci->ncount++;
+ /* check for iovec overflow -- should not happen */
+ if (ci->ncount == MAX_IOVEC) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Zlib output buffer overflow"
+ " ->ncount (%d) | ->MAX_IOVEC (%d)",
+ ci->ncount, MAX_IOVEC);
+ goto out;
+ }
+
+ ret = 0;
+
+ out:
+ return ret;
+}
+
+static void
+cdc_put_long (unsigned char *string, unsigned long x)
+{
+ string[0] = (unsigned char) (x & 0xff);
+ string[1] = (unsigned char) ((x & 0xff00) >> 8);
+ string[2] = (unsigned char) ((x & 0xff0000) >> 16);
+ string[3] = (unsigned char) ((x & 0xff000000) >> 24);
+}
+
+static unsigned long
+cdc_get_long (unsigned char *buf)
+{
+ return ((unsigned long) buf[0])
+ | (((unsigned long) buf[1]) << 8)
+ | (((unsigned long) buf[2]) << 16)
+ | (((unsigned long) buf[3]) << 24);
+}
+
+static int32_t
+cdc_init_gzip_trailer (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci)
+{
+ int ret = -1;
+ char *buf = NULL;
+
+ ret = cdc_next_iovec (this, ci);
+ if (ret)
+ goto out;
+
+ buf = CURR_VEC(ci).iov_base =
+ (char *) GF_CALLOC (1, GF_CDC_VALIDATION_SIZE,
+ gf_cdc_mt_gzip_trailer_t);
+
+ if (!CURR_VEC(ci).iov_base)
+ goto out;
+
+ CURR_VEC(ci).iov_len = GF_CDC_VALIDATION_SIZE;
+
+ cdc_put_long ((unsigned char *)&buf[0], ci->crc);
+ cdc_put_long ((unsigned char *)&buf[4], ci->stream.total_in);
+
+ ret = 0;
+
+ out:
+ return ret;
+}
+
+static int32_t
+cdc_alloc_iobuf_and_init_vec (xlator_t *this,
+ cdc_priv_t *priv, cdc_info_t *ci,
+ int size)
+{
+ int ret = -1;
+ int alloc_len = 0;
+ struct iobuf *iobuf = NULL;
+
+ ret = cdc_next_iovec (this, ci);
+ if (ret)
+ goto out;
+
+ alloc_len = size ? size : ci->buffer_size;
+
+ iobuf = iobuf_get2 (this->ctx->iobuf_pool, alloc_len);
+ if (!iobuf)
+ goto out;
+
+ ret = iobref_add (ci->iobref, iobuf);
+ if (ret)
+ goto out;
+
+ /* Initialize this iovec */
+ CURR_VEC(ci).iov_base = iobuf->ptr;
+ CURR_VEC(ci).iov_len = alloc_len;
+
+ ret = 0;
+
+ out:
+ return ret;
+}
+
+static void
+cdc_init_zlib_output_stream (cdc_priv_t *priv, cdc_info_t *ci, int size)
+{
+ ci->stream.next_out = (unsigned char *) CURR_VEC(ci).iov_base;
+ ci->stream.avail_out = size ? size : ci->buffer_size;
+}
+
+/* This routine is for testing and debugging only.
+ * Data written = header(10) + <compressed-data> + trailer(8)
+ * So each gzip dump file is at least 18 bytes in size.
+ */
+void
+cdc_dump_iovec_to_disk (xlator_t *this, cdc_info_t *ci, const char *file)
+{
+ int i = 0;
+ int fd = 0;
+ size_t writen = 0;
+ size_t total_writen = 0;
+
+ fd = open (file, O_WRONLY|O_CREAT|O_TRUNC, 0777 );
+ if (fd < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Cannot open file: %s", file);
+ return;
+ }
+
+ writen = write (fd, (char *) gzip_header, 10);
+ total_writen += writen;
+ for (i = 0; i < ci->ncount; i++) {
+ writen = write (fd, (char *) ci->vec[i].iov_base, ci->vec[i].iov_len);
+ total_writen += writen;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "dump'd %zu bytes to %s", total_writen, GF_CDC_DEBUG_DUMP_FILE );
+
+ close (fd);
+}
+
+static int32_t
+cdc_flush_libz_buffer (cdc_priv_t *priv, xlator_t *this, cdc_info_t *ci,
+ int (*libz_func)(z_streamp, int),
+ int flush)
+{
+ int32_t ret = Z_OK;
+ int done = 0;
+ unsigned int deflate_len = 0;
+
+ for (;;) {
+ deflate_len = ci->buffer_size - ci->stream.avail_out;
+
+ if (deflate_len != 0) {
+ CURR_VEC(ci).iov_len = deflate_len;
+
+ ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0);
+ if (ret) {
+ ret = Z_MEM_ERROR;
+ break;
+ }
+
+ /* Re-position Zlib output buffer */
+ cdc_init_zlib_output_stream (priv, ci, 0);
+ }
+
+ if (done) {
+ ci->ncount--;
+ break;
+ }
+
+ ret = libz_func (&ci->stream, flush);
+
+ if (ret == Z_BUF_ERROR) {
+ ret = Z_OK;
+ ci->ncount--;
+ break;
+ }
+
+ done = (ci->stream.avail_out != 0 || ret == Z_STREAM_END);
+
+ if (ret != Z_OK && ret != Z_STREAM_END)
+ break;
+ }
+
+ return ret;
+}
+
+static int32_t
+do_cdc_compress (struct iovec *vec, xlator_t *this, cdc_priv_t *priv,
+ cdc_info_t *ci)
+{
+ int ret = -1;
+
+ /* Initialize defalte */
+ ret = deflateInit2 (&ci->stream, priv->cdc_level, Z_DEFLATED,
+ priv->window_size, priv->mem_level,
+ Z_DEFAULT_STRATEGY);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "unable to init Zlib (retval: %d)", ret);
+ goto out;
+ }
+
+ ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0);
+ if (ret)
+ goto out;
+
+ /* setup output buffer */
+ cdc_init_zlib_output_stream (priv, ci, 0);
+
+ /* setup input buffer */
+ ci->stream.next_in = (unsigned char *) vec->iov_base;
+ ci->stream.avail_in = vec->iov_len;
+
+ ci->crc = crc32 (ci->crc, (const Bytef *) vec->iov_base, vec->iov_len);
+
+ gf_log (this->name, GF_LOG_DEBUG, "crc=%lu len=%d buffer_size=%d",
+ ci->crc, ci->stream.avail_in, ci->buffer_size);
+
+ /* compress !! */
+ while (ci->stream.avail_in != 0) {
+ if (ci->stream.avail_out == 0) {
+
+ CURR_VEC(ci).iov_len = ci->buffer_size;
+
+ ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0);
+ if (ret)
+ break;
+
+ /* Re-position Zlib output buffer */
+ cdc_init_zlib_output_stream (priv, ci, 0);
+ }
+
+ ret = deflate (&ci->stream, Z_NO_FLUSH);
+ if (ret != Z_OK)
+ break;
+ }
+
+ out:
+ return ret;
+}
+
+int32_t
+cdc_compress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci,
+ dict_t **xdata)
+{
+ int ret = -1;
+ int i = 0;
+
+ ci->iobref = iobref_new ();
+ if (!ci->iobref)
+ goto out;
+
+ if (!*xdata) {
+ *xdata = dict_new ();
+ if (!*xdata) {
+ gf_log (this->name, GF_LOG_ERROR, "Cannot allocate xdata"
+ " dict");
+ goto out;
+ }
+ }
+
+ /* data */
+ for (i = 0; i < ci->count; i++) {
+ ret = do_cdc_compress (&ci->vector[i], this, priv, ci);
+ if (ret != Z_OK)
+ goto deflate_cleanup_out;
+ }
+
+ /* flush zlib buffer */
+ ret = cdc_flush_libz_buffer (priv, this, ci, deflate, Z_FINISH);
+ if (!(ret == Z_OK || ret == Z_STREAM_END)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Compression Error: ret (%d)", ret);
+ ret = -1;
+ goto deflate_cleanup_out;
+ }
+
+ /* trailer */
+ ret = cdc_init_gzip_trailer (this, priv, ci);
+ if (ret)
+ goto deflate_cleanup_out;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Compressed %ld to %ld bytes",
+ ci->stream.total_in, ci->stream.total_out);
+
+ ci->nbytes = ci->stream.total_out + GF_CDC_VALIDATION_SIZE;
+
+ /* set deflated canary value for identification */
+ ret = dict_set_int32 (*xdata, GF_CDC_DEFLATE_CANARY_VAL, 1);
+ if (ret) {
+ /* Send uncompressed data if we can't _tell_ the client
+ * that deflated data is on it's way. So, we just log
+ * the faliure and continue as usual.
+ */
+ gf_log (this->name, GF_LOG_ERROR,
+ "Data deflated, but could not set canary"
+ " value in dict for identification");
+ }
+
+ /* This is to be used in testing */
+ if ( priv->debug ) {
+ cdc_dump_iovec_to_disk (this, ci, GF_CDC_DEBUG_DUMP_FILE );
+ }
+
+ deflate_cleanup_out:
+ (void) deflateEnd(&ci->stream);
+
+ out:
+ return ret;
+}
+
+
+/* deflate content is checked by the presence of a canary
+ * value in the dict as the key
+ */
+static int32_t
+cdc_check_content_for_deflate (dict_t *xdata)
+{
+ return dict_get (xdata, GF_CDC_DEFLATE_CANARY_VAL) ? -1 : 0;
+}
+
+static unsigned long
+cdc_extract_crc (char *trailer)
+{
+ return cdc_get_long ((unsigned char *) &trailer[0]);
+}
+
+static unsigned long
+cdc_extract_size (char *trailer)
+{
+ return cdc_get_long ((unsigned char *) &trailer[4]);
+}
+
+static int32_t
+cdc_validate_inflate (cdc_info_t *ci, unsigned long crc,
+ unsigned long len)
+{
+ return !((crc == ci->crc)
+ /* inflated length is hidden inside
+ * Zlib stream struct */
+ && (len == ci->stream.total_out));
+}
+
+static int32_t
+do_cdc_decompress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci)
+{
+ int ret = -1;
+ int i = 0;
+ int len = 0;
+ char *inflte = NULL;
+ char *trailer = NULL;
+ struct iovec vec = {0,};
+ unsigned long computed_crc = 0;
+ unsigned long computed_len = 0;
+
+ ret = inflateInit2 (&ci->stream, priv->window_size);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Zlib: Unable to initialize inflate");
+ goto out;
+ }
+
+ vec = THIS_VEC(ci, 0);
+
+ trailer = (char *) (((char *) vec.iov_base) + vec.iov_len
+ - GF_CDC_VALIDATION_SIZE);
+
+ /* CRC of uncompressed data */
+ computed_crc = cdc_extract_crc (trailer);
+
+ /* size of uncomrpessed data */
+ computed_len = cdc_extract_size (trailer);
+
+ gf_log (this->name, GF_LOG_DEBUG, "crc=%lu len=%lu buffer_size=%d",
+ computed_crc, computed_len, ci->buffer_size);
+
+ inflte = vec.iov_base ;
+ len = vec.iov_len - GF_CDC_VALIDATION_SIZE;
+
+ /* allocate buffer of the original length of the data */
+ ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0);
+ if (ret)
+ goto out;
+
+ /* setup output buffer */
+ cdc_init_zlib_output_stream (priv, ci, 0);
+
+ /* setup input buffer */
+ ci->stream.next_in = (unsigned char *) inflte;
+ ci->stream.avail_in = len;
+
+ while (ci->stream.avail_in != 0) {
+ if (ci->stream.avail_out == 0) {
+ CURR_VEC(ci).iov_len = ci->buffer_size;
+
+ ret = cdc_alloc_iobuf_and_init_vec (this, priv, ci, 0);
+ if (ret)
+ break;
+
+ /* Re-position Zlib output buffer */
+ cdc_init_zlib_output_stream (priv, ci, 0);
+ }
+
+ ret = inflate (&ci->stream, Z_NO_FLUSH);
+ if (ret == Z_STREAM_ERROR)
+ break;
+ }
+
+ /* flush zlib buffer */
+ ret = cdc_flush_libz_buffer (priv, this, ci, inflate, Z_SYNC_FLUSH);
+ if (!(ret == Z_OK || ret == Z_STREAM_END)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Decompression Error: ret (%d)", ret);
+ ret = -1;
+ goto out;
+ }
+
+ /* compute CRC of the uncompresses data to check for
+ * correctness */
+
+ for (i = 0; i < ci->ncount; i++) {
+ ci->crc = crc32 (ci->crc,
+ (const Bytef *) ci->vec[i].iov_base,
+ ci->vec[i].iov_len);
+ }
+
+ /* validate inflated data */
+ ret = cdc_validate_inflate (ci, computed_crc, computed_len);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Checksum or length mismatched in inflated data");
+ }
+
+ out:
+ return ret;
+}
+
+int32_t
+cdc_decompress (xlator_t *this, cdc_priv_t *priv, cdc_info_t *ci,
+ dict_t *xdata)
+{
+ int32_t ret = -1;
+
+ /* check for deflate content */
+ if (!cdc_check_content_for_deflate (xdata)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Content not deflated, passing through ...");
+ goto passthrough_out;
+ }
+
+ ci->iobref = iobref_new ();
+ if (!ci->iobref)
+ goto passthrough_out;
+
+ /* do we need to do this? can we assume that one iovec
+ * will hold per request data everytime?
+ *
+ * server/client protocol seems to deal with a single
+ * iovec even if op_ret > 1M. So, it looks ok to
+ * assume that a single iovec will contain all the
+ * data (This saves us a lot from finding the trailer
+ * and the data since it could have been split-up onto
+ * two adjacent iovec's.
+ *
+ * But, in case this translator is loaded above quick-read
+ * for some reason, then it's entirely possible that we get
+ * multiple iovec's...
+ *
+ * This case (handled below) is not tested. (by loading the
+ * xlator below quick-read)
+ */
+
+ /* @@ I_HOPE_THIS_IS_NEVER_HIT */
+ if (ci->count > 1) {
+ gf_log (this->name, GF_LOG_WARNING, "unable to handle"
+ " multiple iovecs (%d in number)", ci->count);
+ goto inflate_cleanup_out;
+ /* TODO: coallate all iovecs in one */
+ }
+
+ ret = do_cdc_decompress (this, priv, ci);
+ if (ret)
+ goto inflate_cleanup_out;
+
+ ci->nbytes = ci->stream.total_out;
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Inflated %ld to %ld bytes",
+ ci->stream.total_in, ci->stream.total_out);
+
+ inflate_cleanup_out:
+ (void) inflateEnd (&ci->stream);
+
+ passthrough_out:
+ return ret;
+}
+
+#endif
diff --git a/xlators/features/compress/src/cdc-mem-types.h b/xlators/features/compress/src/cdc-mem-types.h
new file mode 100644
index 000000000..ead2c70ba
--- /dev/null
+++ b/xlators/features/compress/src/cdc-mem-types.h
@@ -0,0 +1,23 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CDC_MEM_TYPES_H
+#define __CDC_MEM_TYPES_H
+
+#include "mem-types.h"
+
+enum gf_cdc_mem_types {
+ gf_cdc_mt_priv_t = gf_common_mt_end + 1,
+ gf_cdc_mt_vec_t = gf_common_mt_end + 2,
+ gf_cdc_mt_gzip_trailer_t = gf_common_mt_end + 3,
+ gf_cdc_mt_end = gf_common_mt_end + 4,
+};
+
+#endif
diff --git a/xlators/features/compress/src/cdc.c b/xlators/features/compress/src/cdc.c
new file mode 100644
index 000000000..67fc52505
--- /dev/null
+++ b/xlators/features/compress/src/cdc.c
@@ -0,0 +1,361 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <sys/uio.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "defaults.h"
+#include "logging.h"
+
+#include "cdc.h"
+#include "cdc-mem-types.h"
+
+static void
+cdc_cleanup_iobref (cdc_info_t *ci)
+{
+ assert(ci->iobref != NULL);
+ iobref_clear (ci->iobref);
+}
+
+int32_t
+cdc_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iovec *vector, int32_t count,
+ struct iatt *stbuf, struct iobref *iobref,
+ dict_t *xdata)
+{
+ int ret = -1;
+ cdc_priv_t *priv = NULL;
+ cdc_info_t ci = {0,};
+
+ GF_VALIDATE_OR_GOTO ("cdc", this, default_out);
+ GF_VALIDATE_OR_GOTO (this->name, frame, default_out);
+
+ priv = this->private;
+
+ if (op_ret <= 0)
+ goto default_out;
+
+ if ( (priv->min_file_size != 0)
+ && (op_ret < priv->min_file_size) )
+ goto default_out;
+
+ ci.count = count;
+ ci.ibytes = op_ret;
+ ci.vector = vector;
+ ci.buf = NULL;
+ ci.iobref = NULL;
+ ci.ncount = 0;
+ ci.crc = 0;
+ ci.buffer_size = GF_CDC_DEF_BUFFERSIZE;
+
+/* A readv compresses on the server side and decompresses on the client side
+ */
+ if (priv->op_mode == GF_CDC_MODE_SERVER) {
+ ret = cdc_compress (this, priv, &ci, &xdata);
+ } else if (priv->op_mode == GF_CDC_MODE_CLIENT) {
+ ret = cdc_decompress (this, priv, &ci, xdata);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Invalid operation mode (%d)", priv->op_mode);
+ }
+
+ if (ret)
+ goto default_out;
+
+ STACK_UNWIND_STRICT (readv, frame, ci.nbytes, op_errno,
+ ci.vec, ci.ncount, stbuf, iobref,
+ xdata);
+ cdc_cleanup_iobref (&ci);
+ return 0;
+
+ default_out:
+ STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno,
+ vector, count, stbuf, iobref, xdata);
+ return 0;
+}
+
+int32_t
+cdc_readv (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, size_t size, off_t offset, uint32_t flags,
+ dict_t *xdata)
+{
+ fop_readv_cbk_t cbk = NULL;
+
+#ifdef HAVE_LIB_Z
+ cbk = cdc_readv_cbk;
+#else
+ cbk = default_readv_cbk;
+#endif
+ STACK_WIND (frame, cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readv,
+ fd, size, offset, flags, xdata);
+ return 0;
+}
+
+int32_t
+cdc_writev_cbk (call_frame_t *frame,
+ void *cookie,
+ xlator_t *this,
+ int32_t op_ret,
+ int32_t op_errno,
+ struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+ return 0;
+}
+
+int32_t
+cdc_writev (call_frame_t *frame,
+ xlator_t *this,
+ fd_t *fd,
+ struct iovec *vector,
+ int32_t count,
+ off_t offset,
+ uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
+{
+ int ret = -1;
+ cdc_priv_t *priv = NULL;
+ cdc_info_t ci = {0,};
+ size_t isize = 0;
+
+ GF_VALIDATE_OR_GOTO ("cdc", this, default_out);
+ GF_VALIDATE_OR_GOTO (this->name, frame, default_out);
+
+ priv = this->private;
+
+ isize = iov_length(vector, count);
+
+ if (isize <= 0)
+ goto default_out;
+
+ if ( (priv->min_file_size != 0) && (isize < priv->min_file_size) )
+ goto default_out;
+
+ ci.count = count;
+ ci.ibytes = isize;
+ ci.vector = vector;
+ ci.buf = NULL;
+ ci.iobref = NULL;
+ ci.ncount = 0;
+ ci.crc = 0;
+ ci.buffer_size = GF_CDC_DEF_BUFFERSIZE;
+
+/* A writev compresses on the client side and decompresses on the server side
+ */
+ if (priv->op_mode == GF_CDC_MODE_CLIENT) {
+ ret = cdc_compress (this, priv, &ci, &xdata);
+ } else if (priv->op_mode == GF_CDC_MODE_SERVER) {
+ ret = cdc_decompress (this, priv, &ci, xdata);
+ } else {
+ gf_log (this->name, GF_LOG_ERROR, "Invalid operation mode (%d) ", priv->op_mode);
+ }
+
+ if (ret)
+ goto default_out;
+
+ STACK_WIND (frame,
+ cdc_writev_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->writev,
+ fd, ci.vec, ci.ncount, offset, flags,
+ iobref, xdata);
+
+ cdc_cleanup_iobref (&ci);
+ return 0;
+
+ default_out:
+ STACK_WIND (frame,
+ cdc_writev_cbk,
+ FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->writev,
+ fd, vector, count, offset, flags,
+ iobref, xdata);
+ return 0;
+}
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init (this, gf_cdc_mt_end);
+
+ if (ret != 0) {
+ gf_log(this->name, GF_LOG_ERROR, "Memory accounting init"
+ "failed");
+ return ret;
+ }
+
+ return ret;
+}
+
+int32_t
+init (xlator_t *this)
+{
+ int ret = -1;
+ char *temp_str = NULL;
+ cdc_priv_t *priv = NULL;
+
+ GF_VALIDATE_OR_GOTO ("cdc", this, err);
+
+ if (!this->children || this->children->next) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Need subvolume == 1");
+ goto err;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Dangling volume. Check volfile");
+ }
+
+ priv = GF_CALLOC (1, sizeof (*priv), gf_cdc_mt_priv_t);
+ if (!priv) {
+ goto err;
+ }
+
+ /* Check if debug mode is turned on */
+ GF_OPTION_INIT ("debug", priv->debug, bool, err);
+ if( priv->debug ) {
+ gf_log (this->name, GF_LOG_DEBUG, "CDC debug option turned on");
+ }
+
+ /* Set Gzip Window Size */
+ GF_OPTION_INIT ("window-size", priv->window_size, int32, err);
+ if ( (priv->window_size > GF_CDC_MAX_WINDOWSIZE)
+ || (priv->window_size < GF_CDC_DEF_WINDOWSIZE) ) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Invalid gzip window size (%d), using default",
+ priv->window_size);
+ priv->window_size = GF_CDC_DEF_WINDOWSIZE;
+ }
+
+ /* Set Gzip (De)Compression Level */
+ GF_OPTION_INIT ("compression-level", priv->cdc_level, int32, err);
+ if ( ((priv->cdc_level < 1) || (priv->cdc_level > 9))
+ && (priv->cdc_level != GF_CDC_DEF_COMPRESSION) ) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Invalid gzip (de)compression level (%d),"
+ " using default", priv->cdc_level);
+ priv->cdc_level = GF_CDC_DEF_COMPRESSION;
+ }
+
+ /* Set Gzip Memory Level */
+ GF_OPTION_INIT ("mem-level", priv->mem_level, int32, err);
+ if ( (priv->mem_level < 1) || (priv->mem_level > 9) ) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Invalid gzip memory level, using the default");
+ priv->mem_level = GF_CDC_DEF_MEMLEVEL;
+ }
+
+ /* Set min file size to enable compression */
+ GF_OPTION_INIT ("min-size", priv->min_file_size, int32, err);
+
+ /* Mode of operation - Server/Client */
+ ret = dict_get_str (this->options, "mode", &temp_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Operation mode not specified !!");
+ goto err;
+ }
+
+ if (GF_CDC_MODE_IS_CLIENT (temp_str)) {
+ priv->op_mode = GF_CDC_MODE_CLIENT;
+ } else if (GF_CDC_MODE_IS_SERVER (temp_str)) {
+ priv->op_mode = GF_CDC_MODE_SERVER;
+ } else {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Bogus operation mode (%s) specified", temp_str);
+ goto err;
+ }
+
+ this->private = priv;
+ gf_log (this->name, GF_LOG_DEBUG, "CDC xlator loaded in (%s) mode",temp_str);
+ return 0;
+
+ err:
+ if (priv)
+ GF_FREE (priv);
+
+ return -1;
+}
+
+void
+fini (xlator_t *this)
+{
+ cdc_priv_t *priv = this->private;
+
+ if (priv)
+ GF_FREE (priv);
+ this->private = NULL;
+ return;
+}
+
+struct xlator_fops fops = {
+ .readv = cdc_readv,
+ .writev = cdc_writev,
+};
+
+struct xlator_cbks cbks = {
+};
+
+struct volume_options options[] = {
+ { .key = {"window-size"},
+ .default_value = "-15",
+ .type = GF_OPTION_TYPE_INT,
+ .description = "Size of the zlib history buffer."
+ },
+ { .key = {"mem-level"},
+ .default_value = "8",
+ .type = GF_OPTION_TYPE_INT,
+ .description = "Memory allocated for internal compression state. "
+ "1 uses minimum memory but is slow and reduces "
+ "compression ratio; memLevel=9 uses maximum memory "
+ "for optimal speed. The default value is 8."
+ },
+ { .key = {"compression-level"},
+ .default_value = "-1",
+ .type = GF_OPTION_TYPE_INT,
+ .description = "Compression levels \n"
+ "0 : no compression, 1 : best speed, \n"
+ "9 : best compression, -1 : default compression "
+ },
+ { .key = {"min-size"},
+ .default_value = "0",
+ .type = GF_OPTION_TYPE_INT,
+ .description = "Data is compressed only when its size exceeds this."
+ },
+ { .key = {"mode"},
+ .value = {"server", "client"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Set on the basis of where the xlator is loaded. "
+ "This option should NOT be configured by user."
+ },
+ { .key = {"debug"},
+ .default_value = "false",
+ .type = GF_OPTION_TYPE_BOOL,
+ .description = "This is used in testing. Will dump compressed data "
+ "to disk as a gzip file."
+ },
+ { .key = {NULL}
+ },
+};
diff --git a/xlators/features/compress/src/cdc.h b/xlators/features/compress/src/cdc.h
new file mode 100644
index 000000000..71f4d2317
--- /dev/null
+++ b/xlators/features/compress/src/cdc.h
@@ -0,0 +1,107 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __CDC_H
+#define __CDC_H
+
+#ifdef HAVE_LIB_Z
+#include "zlib.h"
+#endif
+
+#include "xlator.h"
+
+#ifndef MAX_IOVEC
+#define MAX_IOVEC 16
+#endif
+
+typedef struct cdc_priv {
+ int window_size;
+ int mem_level;
+ int cdc_level;
+ int min_file_size;
+ int op_mode;
+ gf_boolean_t debug;
+ gf_lock_t lock;
+} cdc_priv_t;
+
+typedef struct cdc_info {
+ /* input bits */
+ int count;
+ int32_t ibytes;
+ struct iovec *vector;
+ struct iatt *buf;
+
+ /* output bits */
+ int ncount;
+ int nbytes;
+ int buffer_size;
+ struct iovec vec[MAX_IOVEC];
+ struct iobref *iobref;
+
+ /* zlib bits */
+#ifdef HAVE_LIB_Z
+ z_stream stream;
+#endif
+ unsigned long crc;
+} cdc_info_t;
+
+#define NVEC(ci) (ci->ncount - 1)
+#define CURR_VEC(ci) ci->vec[ci->ncount - 1]
+#define THIS_VEC(ci, i) ci->vector[i]
+
+/* Gzip defaults */
+#define GF_CDC_DEF_WINDOWSIZE -15 /* default value */
+#define GF_CDC_MAX_WINDOWSIZE -8 /* max value */
+
+#ifdef HAVE_LIB_Z
+#define GF_CDC_DEF_COMPRESSION Z_DEFAULT_COMPRESSION
+#else
+#define GF_CDC_DEF_COMPRESSION -1
+#endif
+
+#define GF_CDC_DEF_MEMLEVEL 8
+#define GF_CDC_DEF_BUFFERSIZE 262144 // 256K - default compression buffer size
+
+/* Operation mode
+ * If xlator is loaded on client, readv decompresses and writev compresses
+ * If xlator is loaded on server, readv compresses and writev decompresses
+ */
+#define GF_CDC_MODE_CLIENT 0
+#define GF_CDC_MODE_SERVER 1
+
+/* min size of data to do cmpression
+ * 0 == compress even 1byte
+ */
+#define GF_CDC_MIN_CHUNK_SIZE 0
+
+#define GF_CDC_VALIDATION_SIZE 8
+
+#define GF_CDC_OS_ID 0xFF
+#define GF_CDC_DEFLATE_CANARY_VAL "deflate"
+#define GF_CDC_DEBUG_DUMP_FILE "/tmp/cdcdump.gz"
+
+#define GF_CDC_MODE_IS_CLIENT(m) \
+ (strcmp (m, "client") == 0)
+
+#define GF_CDC_MODE_IS_SERVER(m) \
+ (strcmp (m, "server") == 0)
+
+int32_t
+cdc_compress (xlator_t *this,
+ cdc_priv_t *priv,
+ cdc_info_t *ci,
+ dict_t **xdata);
+int32_t
+cdc_decompress (xlator_t *this,
+ cdc_priv_t *priv,
+ cdc_info_t *ci,
+ dict_t *xdata);
+
+#endif
diff --git a/xlators/features/filter/src/Makefile.am b/xlators/features/filter/src/Makefile.am
index a9f807320..d1fda8b0a 100644
--- a/xlators/features/filter/src/Makefile.am
+++ b/xlators/features/filter/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = filter.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/features
-filter_la_LDFLAGS = -module -avoidversion
+filter_la_LDFLAGS = -module -avoid-version
filter_la_SOURCES = filter.c
filter_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/bindings/python/Makefile.am b/xlators/features/gfid-access/Makefile.am
index af437a64d..af437a64d 100644
--- a/xlators/bindings/python/Makefile.am
+++ b/xlators/features/gfid-access/Makefile.am
diff --git a/xlators/features/gfid-access/src/Makefile.am b/xlators/features/gfid-access/src/Makefile.am
new file mode 100644
index 000000000..db53affaa
--- /dev/null
+++ b/xlators/features/gfid-access/src/Makefile.am
@@ -0,0 +1,15 @@
+xlator_LTLIBRARIES = gfid-access.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+gfid_access_la_LDFLAGS = -module -avoid-version
+
+gfid_access_la_SOURCES = gfid-access.c
+gfid_access_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = gfid-access.h gfid-access-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/features/gfid-access/src/gfid-access-mem-types.h b/xlators/features/gfid-access/src/gfid-access-mem-types.h
new file mode 100644
index 000000000..168d67b43
--- /dev/null
+++ b/xlators/features/gfid-access/src/gfid-access-mem-types.h
@@ -0,0 +1,23 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GFID_ACCESS_MEM_TYPES_H
+#define _GFID_ACCESS_MEM_TYPES_H
+
+#include "mem-types.h"
+
+enum gf_changelog_mem_types {
+ gf_gfid_access_mt_priv_t = gf_common_mt_end + 1,
+ gf_gfid_access_mt_gfid_t,
+ gf_gfid_access_mt_end
+};
+
+#endif
+
diff --git a/xlators/features/gfid-access/src/gfid-access.c b/xlators/features/gfid-access/src/gfid-access.c
new file mode 100644
index 000000000..5cb6ecfbd
--- /dev/null
+++ b/xlators/features/gfid-access/src/gfid-access.c
@@ -0,0 +1,1299 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "gfid-access.h"
+#include "inode.h"
+#include "byte-order.h"
+
+
+
+void
+ga_newfile_args_free (ga_newfile_args_t *args)
+{
+ if (!args)
+ goto out;
+
+ GF_FREE (args->bname);
+
+ if (S_ISLNK (args->st_mode) && args->args.symlink.linkpath) {
+ GF_FREE (args->args.symlink.linkpath);
+ args->args.symlink.linkpath = NULL;
+ }
+
+ mem_put (args);
+out:
+ return;
+}
+
+
+void
+ga_heal_args_free (ga_heal_args_t *args)
+{
+ if (!args)
+ goto out;
+
+ GF_FREE (args->bname);
+
+ mem_put (args);
+out:
+ return;
+}
+
+
+ga_newfile_args_t *
+ga_newfile_parse_args (xlator_t *this, data_t *data)
+{
+ ga_newfile_args_t *args = NULL;
+ ga_private_t *priv = NULL;
+ int len = 0;
+ int blob_len = 0;
+ int min_len = 0;
+ void *blob = NULL;
+
+ priv = this->private;
+
+ blob = data->data;
+ blob_len = data->len;
+
+ min_len = sizeof (args->uid) + sizeof (args->gid) + sizeof (args->gfid)
+ + sizeof (args->st_mode) + 2 + 2;
+ if (blob_len < min_len) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Invalid length: Total length is less "
+ "than minimum length.");
+ goto err;
+ }
+
+ args = mem_get0 (priv->newfile_args_pool);
+ if (args == NULL)
+ goto err;
+
+ args->uid = ntoh32 (*(uint32_t *)blob);
+ blob += sizeof (uint32_t);
+ blob_len -= sizeof (uint32_t);
+
+ args->gid = ntoh32 (*(uint32_t *)blob);
+ blob += sizeof (uint32_t);
+ blob_len -= sizeof (uint32_t);
+
+ memcpy (args->gfid, blob, sizeof (args->gfid));
+ blob += sizeof (args->gfid);
+ blob_len -= sizeof (args->gfid);
+
+ args->st_mode = ntoh32 (*(uint32_t *)blob);
+ blob += sizeof (uint32_t);
+ blob_len -= sizeof (uint32_t);
+
+ len = strnlen (blob, blob_len);
+ if (len == blob_len)
+ if (len == blob_len) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "gfid: %s. No null byte present.",
+ args->gfid);
+ goto err;
+ }
+
+ args->bname = GF_CALLOC (1, (len + 1), gf_common_mt_char);
+ if (args->bname == NULL)
+ goto err;
+
+ memcpy (args->bname, blob, (len + 1));
+ blob += (len + 1);
+ blob_len -= (len + 1);
+
+ if (S_ISDIR (args->st_mode)) {
+ if (blob_len < sizeof (uint32_t)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "gfid: %s. Invalid length",
+ args->gfid);
+ goto err;
+ }
+ args->args.mkdir.mode = ntoh32 (*(uint32_t *)blob);
+ blob += sizeof (uint32_t);
+ blob_len -= sizeof (uint32_t);
+
+ if (blob_len < sizeof (uint32_t)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "gfid: %s. Invalid length",
+ args->gfid);
+ goto err;
+ }
+ args->args.mkdir.umask = ntoh32 (*(uint32_t *)blob);
+ blob += sizeof (uint32_t);
+ blob_len -= sizeof (uint32_t);
+ if (blob_len < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "gfid: %s. Invalid length",
+ args->gfid);
+ goto err;
+ }
+ } else if (S_ISLNK (args->st_mode)) {
+ len = strnlen (blob, blob_len);
+ if (len == blob_len) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "gfid: %s. Invalid length",
+ args->gfid);
+ goto err;
+ }
+ args->args.symlink.linkpath = GF_CALLOC (1, len + 1,
+ gf_common_mt_char);
+ if (args->args.symlink.linkpath == NULL)
+ goto err;
+
+ memcpy (args->args.symlink.linkpath, blob, (len + 1));
+ blob += (len + 1);
+ blob_len -= (len + 1);
+ } else {
+ if (blob_len < sizeof (uint32_t)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "gfid: %s. Invalid length",
+ args->gfid);
+ goto err;
+ }
+ args->args.mknod.mode = ntoh32 (*(uint32_t *)blob);
+ blob += sizeof (uint32_t);
+ blob_len -= sizeof (uint32_t);
+
+ if (blob_len < sizeof (uint32_t)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "gfid: %s. Invalid length",
+ args->gfid);
+ goto err;
+ }
+ args->args.mknod.rdev = ntoh32 (*(uint32_t *)blob);
+ blob += sizeof (uint32_t);
+ blob_len -= sizeof (uint32_t);
+
+ if (blob_len < sizeof (uint32_t)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "gfid: %s. Invalid length",
+ args->gfid);
+ goto err;
+ }
+ args->args.mknod.umask = ntoh32 (*(uint32_t *)blob);
+ blob += sizeof (uint32_t);
+ blob_len -= sizeof (uint32_t);
+ }
+
+ if (blob_len) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "gfid: %s. Invalid length",
+ args->gfid);
+ goto err;
+ }
+
+ return args;
+
+err:
+ if (args)
+ ga_newfile_args_free (args);
+
+ return NULL;
+}
+
+ga_heal_args_t *
+ga_heal_parse_args (xlator_t *this, data_t *data)
+{
+ ga_heal_args_t *args = NULL;
+ ga_private_t *priv = NULL;
+ void *blob = NULL;
+ int len = 0;
+ int blob_len = 0;
+
+ blob = data->data;
+ blob_len = data->len;
+
+ priv = this->private;
+
+ /* bname should at least contain a character */
+ if (blob_len < (sizeof (args->gfid) + 2))
+ goto err;
+
+ args = mem_get0 (priv->heal_args_pool);
+ if (!args)
+ goto err;
+
+ memcpy (args->gfid, blob, sizeof (args->gfid));
+ blob += sizeof (args->gfid);
+ blob_len -= sizeof (args->gfid);
+
+ len = strnlen (blob, blob_len);
+ if (len == blob_len)
+ goto err;
+
+ args->bname = GF_CALLOC (1, len + 1, gf_common_mt_char);
+ if (!args->bname)
+ goto err;
+
+ memcpy (args->bname, blob, len);
+ blob_len -= (len + 1);
+
+ if (blob_len)
+ goto err;
+
+ return args;
+
+err:
+ if (args)
+ ga_heal_args_free (args);
+
+ return NULL;
+}
+
+static int32_t
+ga_fill_tmp_loc (loc_t *loc, xlator_t *this, uuid_t gfid,
+ char *bname, dict_t *xdata, loc_t *new_loc)
+{
+ int ret = -1;
+ uint64_t value = 0;
+ inode_t *parent = NULL;
+
+ parent = loc->inode;
+ ret = inode_ctx_get (loc->inode, this, &value);
+ if (!ret) {
+ parent = (void *)value;
+ if (uuid_is_null (parent->gfid))
+ parent = loc->inode;
+ }
+
+ /* parent itself should be looked up */
+ uuid_copy (new_loc->pargfid, parent->gfid);
+ new_loc->parent = inode_ref (parent);
+
+ new_loc->inode = inode_grep (parent->table, parent, bname);
+ if (!new_loc->inode)
+ new_loc->inode = inode_new (parent->table);
+
+ loc_path (new_loc, bname);
+ new_loc->name = basename (new_loc->path);
+
+ /* As GFID would not be set on the entry yet, lets not send entry
+ gfid in the request */
+ /*uuid_copy (new_loc->gfid, (const unsigned char *)gfid); */
+
+ ret = dict_set_static_bin (xdata, "gfid-req", gfid, 16);
+ if (ret < 0)
+ goto out;
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+
+
+static gf_boolean_t
+__is_gfid_access_dir (uuid_t gfid)
+{
+ uuid_t aux_gfid;
+
+ memset (aux_gfid, 0, 16);
+ aux_gfid[15] = GF_AUX_GFID;
+
+ if (uuid_compare (gfid, aux_gfid) == 0)
+ return _gf_true;
+
+ return _gf_false;
+}
+
+int32_t
+ga_forget (xlator_t *this, inode_t *inode)
+{
+ int ret = -1;
+ uint64_t value = 0;
+ inode_t *tmp_inode = NULL;
+
+ ret = inode_ctx_del (inode, this, &value);
+ if (ret)
+ goto out;
+
+ tmp_inode = (void *)value;
+ inode_unref (tmp_inode);
+
+out:
+ return 0;
+}
+
+
+static int
+ga_heal_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct iatt *stat, dict_t *dict,
+ struct iatt *postparent)
+{
+ call_frame_t *orig_frame = NULL;
+
+ orig_frame = frame->local;
+ frame->local = NULL;
+
+ /* don't worry about inode linking and other stuff. They'll happen on
+ * the next lookup.
+ */
+ STACK_DESTROY (frame->root);
+
+ STACK_UNWIND_STRICT (setxattr, orig_frame, op_ret, op_errno, dict);
+
+ return 0;
+}
+
+static int32_t
+ga_newentry_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *statpre,
+ struct iatt *statpost,
+ dict_t *xdata)
+{
+ ga_local_t *local = NULL;
+
+ local = frame->local;
+ frame->local = NULL;
+
+ /* don't worry about inode linking and other stuff. They'll happen on
+ * the next lookup.
+ */
+ STACK_DESTROY (frame->root);
+
+ STACK_UNWIND_STRICT (setxattr, local->orig_frame, op_ret,
+ op_errno, xdata);
+
+ loc_wipe (&local->loc);
+ mem_put (local);
+
+ return 0;
+}
+
+static int
+ga_newentry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ ga_local_t *local = NULL;
+ struct iatt temp_stat = {0,};
+
+ local = frame->local;
+
+ if (!local->uid && !local->gid)
+ goto done;
+
+ temp_stat.ia_uid = local->uid;
+ temp_stat.ia_gid = local->gid;
+
+ STACK_WIND (frame, ga_newentry_setattr_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->setattr, &local->loc, &temp_stat,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID), xdata);
+
+ return 0;
+
+done:
+ /* don't worry about inode linking and other stuff. They'll happen on
+ * the next lookup.
+ */
+ frame->local = NULL;
+ STACK_DESTROY (frame->root);
+
+ STACK_UNWIND_STRICT (setxattr, local->orig_frame, op_ret,
+ op_errno, xdata);
+
+ loc_wipe (&local->loc);
+ mem_put (local);
+
+ return 0;
+}
+
+int32_t
+ga_new_entry (call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *data,
+ dict_t *xdata)
+{
+ int ret = -1;
+ ga_newfile_args_t *args = NULL;
+ loc_t tmp_loc = {0,};
+ call_frame_t *new_frame = NULL;
+ mode_t mode = 0;
+ ga_local_t *local = NULL;
+ uuid_t gfid = {0,};
+
+ args = ga_newfile_parse_args (this, data);
+ if (!args)
+ goto out;
+
+ ret = uuid_parse (args->gfid, gfid);
+ if (ret)
+ goto out;
+
+ if (!xdata)
+ xdata = dict_new ();
+
+ ret = ga_fill_tmp_loc (loc, this, gfid,
+ args->bname, xdata, &tmp_loc);
+ if (ret)
+ goto out;
+
+ new_frame = copy_frame (frame);
+ if (!new_frame)
+ goto out;
+
+ local = mem_get0 (this->local_pool);
+ local->orig_frame = frame;
+
+ local->uid = args->uid;
+ local->gid = args->gid;
+
+ loc_copy (&local->loc, &tmp_loc);
+
+ new_frame->local = local;
+
+ if (S_ISDIR (args->st_mode)) {
+ STACK_WIND (new_frame, ga_newentry_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir,
+ &tmp_loc, args->args.mkdir.mode,
+ args->args.mkdir.umask, xdata);
+ } else if (S_ISLNK (args->st_mode)) {
+ STACK_WIND (new_frame, ga_newentry_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink,
+ args->args.symlink.linkpath,
+ &tmp_loc, 0, xdata);
+ } else {
+ /* use 07777 (4 7s) for considering the Sticky bits etc) */
+ mode = (S_IFMT & args->st_mode) |
+ (07777 & args->args.mknod.mode);;
+
+ STACK_WIND (new_frame, ga_newentry_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod,
+ &tmp_loc, mode,
+ args->args.mknod.rdev, args->args.mknod.umask,
+ xdata);
+ }
+
+ ret = 0;
+out:
+ ga_newfile_args_free (args);
+
+ return ret;
+}
+
+int32_t
+ga_heal_entry (call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *data,
+ dict_t *xdata)
+{
+ int ret = -1;
+ ga_heal_args_t *args = NULL;
+ loc_t tmp_loc = {0,};
+ call_frame_t *new_frame = NULL;
+ uuid_t gfid = {0,};
+
+ args = ga_heal_parse_args (this, data);
+ if (!args)
+ goto out;
+
+ ret = uuid_parse (args->gfid, gfid);
+ if (ret)
+ goto out;
+
+ if (!xdata)
+ xdata = dict_new ();
+
+ ret = ga_fill_tmp_loc (loc, this, gfid, args->bname,
+ xdata, &tmp_loc);
+ if (ret)
+ goto out;
+
+ new_frame = copy_frame (frame);
+ if (!new_frame)
+ goto out;
+ new_frame->local = (void *)frame;
+
+ STACK_WIND (new_frame, ga_heal_cbk, FIRST_CHILD (this),
+ FIRST_CHILD(this)->fops->lookup,
+ &tmp_loc, xdata);
+
+ ret = 0;
+out:
+ if (args)
+ ga_heal_args_free (args);
+
+ return ret;
+}
+
+int32_t
+ga_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ dict_t *xdata)
+{
+ STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int32_t
+ga_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+
+ data_t *data = NULL;
+ int op_errno = ENOMEM;
+ int ret = 0;
+ inode_t *unref = NULL;
+
+ if ((loc->name && !strcmp (GF_GFID_DIR, loc->name)) &&
+ ((loc->parent &&
+ __is_root_gfid (loc->parent->gfid)) ||
+ __is_root_gfid (loc->pargfid))) {
+ op_errno = EPERM;
+ goto err;
+ }
+
+ data = dict_get (dict, GF_FUSE_AUX_GFID_NEWFILE);
+ if (data) {
+ ret = ga_new_entry (frame, this, loc, data, xdata);
+ if (ret)
+ goto err;
+ return 0;
+ }
+
+ data = dict_get (dict, GF_FUSE_AUX_GFID_HEAL);
+ if (data) {
+ ret = ga_heal_entry (frame, this, loc, data, xdata);
+ if (ret)
+ goto err;
+ return 0;
+ }
+
+ //If the inode is a virtual inode change the inode otherwise perform
+ //the operation on same inode
+ GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind);
+
+wind:
+ STACK_WIND (frame, ga_setxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setxattr, loc, dict, flags,
+ xdata);
+ if (unref)
+ inode_unref (unref);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (setxattr, frame, -1, op_errno, xdata);
+ return 0;
+}
+
+
+int32_t
+ga_virtual_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ int j = 0;
+ int i = 0;
+ int ret = 0;
+ uint64_t temp_ino = 0;
+ inode_t *cbk_inode = NULL;
+ inode_t *true_inode = NULL;
+ uuid_t random_gfid = {0,};
+ inode_t *linked_inode = NULL;
+
+ if (frame->local)
+ cbk_inode = frame->local;
+ else
+ cbk_inode = inode_ref (inode);
+
+ frame->local = NULL;
+ if (op_ret)
+ goto unwind;
+
+ if (!IA_ISDIR (buf->ia_type))
+ goto unwind;
+
+ /* need to send back a different inode for linking in itable */
+ if (cbk_inode == inode) {
+ /* check if the inode is in the 'itable' or
+ if its just previously discover()'d inode */
+ true_inode = inode_find (inode->table, buf->ia_gfid);
+ if (!true_inode) {
+ /* This unref is for 'inode_ref()' done in beginning.
+ This is needed as cbk_inode is allocated new inode
+ whose unref is taken at the end*/
+ inode_unref (cbk_inode);
+ cbk_inode = inode_new (inode->table);
+
+ if (!cbk_inode) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ /* the inode is not present in itable, ie, the actual
+ path is not yet looked up. Use the current inode
+ itself for now */
+
+ linked_inode = inode_link (inode, NULL, NULL, buf);
+ inode = linked_inode;
+ } else {
+ /* 'inode_ref()' has been done in inode_find() */
+ inode = true_inode;
+ }
+
+ ret = inode_ctx_put (cbk_inode, this, (uint64_t)inode);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to set the inode ctx with"
+ "the actual inode");
+ if (inode)
+ inode_unref (inode);
+ }
+ inode = NULL;
+ }
+
+ if (!uuid_is_null (cbk_inode->gfid)) {
+ /* if the previous linked inode is used, use the
+ same gfid */
+ uuid_copy (random_gfid, cbk_inode->gfid);
+ } else {
+ /* replace the buf->ia_gfid to a random gfid
+ for directory, for files, what we received is fine */
+ uuid_generate (random_gfid);
+ }
+
+ uuid_copy (buf->ia_gfid, random_gfid);
+
+ for (i = 15; i > (15 - 8); i--) {
+ temp_ino += (uint64_t)(buf->ia_gfid[i]) << j;
+ j += 8;
+ }
+ buf->ia_ino = temp_ino;
+
+unwind:
+ /* Lookup on non-existing gfid returns ESTALE.
+ Convert into ENOENT for virtual lookup*/
+ if (op_errno == ESTALE)
+ op_errno = ENOENT;
+
+ STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, cbk_inode, buf,
+ xdata, postparent);
+
+ /* Also handles inode_unref of frame->local if done in ga_lookup */
+ if (cbk_inode)
+ inode_unref (cbk_inode);
+
+ return 0;
+}
+
+int32_t
+ga_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ ga_private_t *priv = NULL;
+
+ /* if the entry in question is not 'root',
+ then follow the normal path */
+ if (op_ret || !__is_root_gfid(buf->ia_gfid))
+ goto unwind;
+
+ priv = this->private;
+
+ /* do we need to copy root stbuf everytime? */
+ /* mostly yes, as we want to have the 'stat' info show latest
+ in every _cbk() */
+
+ /* keep the reference for root stat buf */
+ priv->root_stbuf = *buf;
+ priv->gfiddir_stbuf = priv->root_stbuf;
+ priv->gfiddir_stbuf.ia_gfid[15] = GF_AUX_GFID;
+ priv->gfiddir_stbuf.ia_ino = GF_AUX_GFID;
+
+unwind:
+ STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf,
+ xdata, postparent);
+ return 0;
+}
+
+int32_t
+ga_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ ga_private_t *priv = NULL;
+ int ret = -1;
+ uuid_t tmp_gfid = {0,};
+ loc_t tmp_loc = {0,};
+ uint64_t value = 0;
+ inode_t *inode = NULL;
+ inode_t *true_inode = NULL;
+ int32_t op_errno = ENOENT;
+
+ /* if its discover(), no need for any action here */
+ if (!loc->name)
+ goto wind;
+
+ /* if its revalidate, and inode is not of type directory,
+ proceed with 'wind' */
+ if (loc->inode && loc->inode->ia_type &&
+ !IA_ISDIR (loc->inode->ia_type)) {
+
+ /* a revalidate on ".gfid/<dentry>" is possible, check for it */
+ if (((loc->parent &&
+ __is_gfid_access_dir (loc->parent->gfid)) ||
+ __is_gfid_access_dir (loc->pargfid))) {
+
+ /* here, just send 'loc->gfid' and 'loc->inode' */
+ tmp_loc.inode = inode_ref (loc->inode);
+ uuid_copy (tmp_loc.gfid, loc->inode->gfid);
+
+ STACK_WIND (frame, default_lookup_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup,
+ &tmp_loc, xdata);
+
+ inode_unref (tmp_loc.inode);
+
+ return 0;
+ }
+
+ /* not something to bother, continue the flow */
+ goto wind;
+ }
+
+ priv = this->private;
+
+ /* need to check if the lookup is on virtual dir */
+ if ((loc->name && !strcmp (GF_GFID_DIR, loc->name)) &&
+ ((loc->parent && __is_root_gfid (loc->parent->gfid)) ||
+ __is_root_gfid (loc->pargfid))) {
+ /* this means, the query is on '/.gfid', return the fake stat,
+ and say success */
+
+ STACK_UNWIND_STRICT (lookup, frame, 0, 0, loc->inode,
+ &priv->gfiddir_stbuf, xdata,
+ &priv->root_stbuf);
+ return 0;
+ }
+
+ /* now, check if the lookup() is on an existing entry,
+ but on gfid-path */
+ if (!((loc->parent && __is_gfid_access_dir (loc->parent->gfid)) ||
+ __is_gfid_access_dir (loc->pargfid))) {
+ if (!loc->parent)
+ goto wind;
+
+ ret = inode_ctx_get (loc->parent, this, &value);
+ if (ret)
+ goto wind;
+
+ inode = (inode_t *) value;
+
+ ret = loc_copy_overload_parent (&tmp_loc, loc, inode);
+ if (ret)
+ goto err;
+
+ STACK_WIND (frame, ga_lookup_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->lookup, &tmp_loc, xdata);
+
+ loc_wipe (&tmp_loc);
+ return 0;
+ }
+
+ /* make sure the 'basename' is actually a 'canonical-gfid',
+ otherwise, return error */
+ ret = uuid_parse (loc->name, tmp_gfid);
+ if (ret)
+ goto err;
+
+ /* if its fresh lookup, go ahead and send it down, if not,
+ for directory, we need indirection to actual dir inode */
+ if (!(loc->inode && loc->inode->ia_type))
+ goto discover;
+
+ /* revalidate on directory */
+ ret = inode_ctx_get (loc->inode, this, &value);
+ if (ret)
+ goto err;
+
+ inode = (void *)value;
+
+ /* valid inode, already looked up, work on that */
+ if (inode->ia_type)
+ goto discover;
+
+ /* check if the inode is in the 'itable' or
+ if its just previously discover()'d inode */
+ true_inode = inode_find (loc->inode->table, tmp_gfid);
+ if (true_inode) {
+ /* time do another lookup and update the context
+ with proper inode */
+ op_errno = ESTALE;
+ /* 'inode_ref()' done in inode_find */
+ inode_unref (true_inode);
+ goto err;
+ }
+
+discover:
+ /* for the virtual entries, we don't need to send 'gfid-req' key, as
+ for these entries, we don't want to 'set' a new gfid */
+ if (xdata)
+ dict_del (xdata, "gfid-req");
+
+ uuid_copy (tmp_loc.gfid, tmp_gfid);
+
+ /* if revalidate, then we need to have the proper reference */
+ if (inode) {
+ tmp_loc.inode = inode_ref (inode);
+ frame->local = inode_ref (loc->inode);
+ } else {
+ tmp_loc.inode = inode_ref (loc->inode);
+ }
+
+ STACK_WIND (frame, ga_virtual_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, &tmp_loc, xdata);
+
+ inode_unref (tmp_loc.inode);
+
+ return 0;
+
+wind:
+ /* used for all the normal lookup path */
+ STACK_WIND (frame, ga_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
+
+ return 0;
+
+err:
+ STACK_UNWIND_STRICT (lookup, frame, -1, op_errno, loc->inode,
+ &priv->gfiddir_stbuf, xdata,
+ &priv->root_stbuf);
+ return 0;
+}
+
+int
+ga_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
+{
+ int op_errno = 0;
+
+ GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err);
+
+ STACK_WIND (frame, default_mkdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mkdir, loc, mode, umask,
+ xdata);
+
+ return 0;
+
+err:
+ STACK_UNWIND_STRICT (mkdir, frame, -1, op_errno, loc->inode,
+ NULL, NULL, NULL, xdata);
+ return 0;
+}
+
+
+int
+ga_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ int op_errno = 0;
+
+ GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err);
+
+ STACK_WIND (frame, default_create_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->create,
+ loc, flags, mode, umask, fd, xdata);
+ return 0;
+err:
+ STACK_UNWIND_STRICT (create, frame, -1, op_errno, NULL,
+ NULL, NULL, NULL, NULL, xdata);
+
+ return 0;
+
+}
+
+int
+ga_symlink (call_frame_t *frame, xlator_t *this, const char *linkname,
+ loc_t *loc, mode_t umask, dict_t *xdata)
+{
+ int op_errno = 0;
+
+ GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err);
+
+ STACK_WIND (frame, default_symlink_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink,
+ linkname, loc, umask, xdata);
+ return 0;
+err:
+ STACK_UNWIND_STRICT (symlink, frame, -1, op_errno, NULL,
+ NULL, NULL, NULL, xdata);
+
+ return 0;
+}
+
+int
+ga_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t rdev, mode_t umask, dict_t *xdata)
+{
+ int op_errno = 0;
+
+ GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err);
+
+ STACK_WIND (frame, default_mknod_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mknod, loc, mode, rdev,
+ umask, xdata);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (mknod, frame, -1, op_errno, NULL,
+ NULL, NULL, NULL, xdata);
+
+ return 0;
+}
+
+int
+ga_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flag,
+ dict_t *xdata)
+{
+ int op_errno = 0;
+ inode_t *unref = NULL;
+
+ GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err);
+
+ GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind);
+
+wind:
+ STACK_WIND (frame, default_rmdir_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->rmdir,
+ loc, flag, xdata);
+ if (unref)
+ inode_unref (unref);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (rmdir, frame, -1, op_errno, NULL,
+ NULL, xdata);
+
+ return 0;
+}
+
+int
+ga_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag,
+ dict_t *xdata)
+{
+ int op_errno = 0;
+ inode_t *unref = NULL;
+
+ GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err);
+
+ GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind);
+
+wind:
+ STACK_WIND (frame, default_unlink_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->unlink,
+ loc, xflag, xdata);
+
+ if (unref)
+ inode_unref (unref);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (unlink, frame, -1, op_errno, NULL,
+ NULL, xdata);
+
+ return 0;
+}
+
+int
+ga_rename (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ int op_errno = 0;
+ inode_t *oldloc_unref = NULL;
+ inode_t *newloc_unref = NULL;
+
+ GFID_ACCESS_ENTRY_OP_CHECK (oldloc, op_errno, err);
+ GFID_ACCESS_ENTRY_OP_CHECK (newloc, op_errno, err);
+
+ GFID_ACCESS_GET_VALID_DIR_INODE (this, oldloc, oldloc_unref,
+ handle_newloc);
+
+handle_newloc:
+ GFID_ACCESS_GET_VALID_DIR_INODE (this, newloc, newloc_unref, wind);
+
+wind:
+ STACK_WIND (frame, default_rename_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename,
+ oldloc, newloc, xdata);
+
+ if (oldloc_unref)
+ inode_unref (oldloc_unref);
+
+ if (newloc_unref)
+ inode_unref (newloc_unref);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (rename, frame, -1, op_errno, NULL,
+ NULL, NULL, NULL, NULL, xdata);
+
+ return 0;
+}
+
+
+int
+ga_link (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ int op_errno = 0;
+ inode_t *oldloc_unref = NULL;
+ inode_t *newloc_unref = NULL;
+
+ GFID_ACCESS_ENTRY_OP_CHECK (oldloc, op_errno, err);
+ GFID_ACCESS_ENTRY_OP_CHECK (newloc, op_errno, err);
+
+ GFID_ACCESS_GET_VALID_DIR_INODE (this, oldloc, oldloc_unref,
+ handle_newloc);
+
+handle_newloc:
+ GFID_ACCESS_GET_VALID_DIR_INODE (this, newloc, newloc_unref, wind);
+
+wind:
+ STACK_WIND (frame, default_link_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->link,
+ oldloc, newloc, xdata);
+
+ if (oldloc_unref)
+ inode_unref (oldloc_unref);
+
+ if (newloc_unref)
+ inode_unref (newloc_unref);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (link, frame, -1, op_errno, NULL,
+ NULL, NULL, NULL, xdata);
+
+ return 0;
+}
+
+int32_t
+ga_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ fd_t *fd, dict_t *xdata)
+{
+ int op_errno = 0;
+
+ GFID_ACCESS_ENTRY_OP_CHECK (loc, op_errno, err);
+
+ /* also check if the loc->inode itself is virtual
+ inode, if yes, return with failure, mainly because we
+ can't handle all the readdirp and other things on it. */
+ if (inode_ctx_get (loc->inode, this, NULL) == 0) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ STACK_WIND (frame, default_opendir_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->opendir,
+ loc, fd, xdata);
+ return 0;
+err:
+ STACK_UNWIND_STRICT (opendir, frame, -1, op_errno, NULL, xdata);
+
+ return 0;
+}
+
+int32_t
+ga_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ inode_t *unref = NULL;
+
+ GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind);
+
+wind:
+ STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr, loc, name, xdata);
+
+ if (unref)
+ inode_unref (unref);
+
+ return 0;
+}
+
+int32_t
+ga_stat (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata)
+{
+ inode_t *unref = NULL;
+
+ GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind);
+
+wind:
+ STACK_WIND (frame, default_stat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->stat, loc, xdata);
+ if (unref)
+ inode_unref (unref);
+
+ return 0;
+}
+
+int32_t
+ga_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct iatt *stbuf, int32_t valid,
+ dict_t *xdata)
+{
+ inode_t *unref = NULL;
+
+ GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind);
+
+wind:
+ STACK_WIND (frame, default_setattr_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->setattr, loc, stbuf, valid,
+ xdata);
+ if (unref)
+ inode_unref (unref);
+
+ return 0;
+}
+
+int32_t
+ga_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ inode_t *unref = NULL;
+
+ GFID_ACCESS_GET_VALID_DIR_INODE (this, loc, unref, wind);
+
+wind:
+ STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr, loc, name,
+ xdata);
+ if (unref)
+ inode_unref (unref);
+
+ return 0;
+}
+
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init (this, gf_gfid_access_mt_end + 1);
+
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_WARNING, "Memory accounting"
+ " init failed");
+ return ret;
+ }
+
+ return ret;
+}
+
+int32_t
+init (xlator_t *this)
+{
+ ga_private_t *priv = NULL;
+ int ret = -1;
+
+ if (!this->children || this->children->next) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "not configured with exactly one child. exiting");
+ goto out;
+ }
+
+ /* This can be the top of graph in certain cases */
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "dangling volume. check volfile ");
+ }
+
+ /* TODO: define a mem-type structure */
+ priv = GF_CALLOC (1, sizeof (*priv), gf_gfid_access_mt_priv_t);
+ if (!priv)
+ goto out;
+
+ priv->newfile_args_pool = mem_pool_new (ga_newfile_args_t, 512);
+ if (!priv->newfile_args_pool)
+ goto out;
+
+ priv->heal_args_pool = mem_pool_new (ga_heal_args_t, 512);
+ if (!priv->heal_args_pool)
+ goto out;
+
+ this->local_pool = mem_pool_new (ga_local_t, 16);
+ if (!this->local_pool) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to create local_t's memory pool");
+ goto out;
+ }
+
+ this->private = priv;
+
+ ret = 0;
+out:
+ if (ret && priv) {
+ if (priv->newfile_args_pool)
+ mem_pool_destroy (priv->newfile_args_pool);
+ GF_FREE (priv);
+ }
+
+ return ret;
+}
+
+void
+fini (xlator_t *this)
+{
+ ga_private_t *priv = NULL;
+ priv = this->private;
+ this->private = NULL;
+
+ if (priv) {
+ if (priv->newfile_args_pool)
+ mem_pool_destroy (priv->newfile_args_pool);
+ if (priv->heal_args_pool)
+ mem_pool_destroy (priv->heal_args_pool);
+ GF_FREE (priv);
+ }
+
+ return;
+}
+
+
+struct xlator_fops fops = {
+ .lookup = ga_lookup,
+
+ /* entry fops */
+ .mkdir = ga_mkdir,
+ .mknod = ga_mknod,
+ .create = ga_create,
+ .symlink = ga_symlink,
+ .link = ga_link,
+ .unlink = ga_unlink,
+ .rmdir = ga_rmdir,
+ .rename = ga_rename,
+
+ /* handle any other directory operations here */
+ .opendir = ga_opendir,
+ .stat = ga_stat,
+ .setattr = ga_setattr,
+ .getxattr = ga_getxattr,
+ .removexattr = ga_removexattr,
+
+ /* special fop to handle more entry creations */
+ .setxattr = ga_setxattr,
+};
+
+struct xlator_cbks cbks = {
+ .forget = ga_forget,
+};
+
+struct volume_options options[] = {
+ /* This translator doesn't take any options, or provide any options */
+ { .key = {NULL} },
+};
diff --git a/xlators/features/gfid-access/src/gfid-access.h b/xlators/features/gfid-access/src/gfid-access.h
new file mode 100644
index 000000000..e883eca69
--- /dev/null
+++ b/xlators/features/gfid-access/src/gfid-access.h
@@ -0,0 +1,134 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef __GFID_ACCESS_H__
+#define __GFID_ACCESS_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "defaults.h"
+#include "gfid-access-mem-types.h"
+
+#define UUID_CANONICAL_FORM_LEN 36
+
+#define GF_FUSE_AUX_GFID_NEWFILE "glusterfs.gfid.newfile"
+#define GF_FUSE_AUX_GFID_HEAL "glusterfs.gfid.heal"
+
+#define GF_GFID_KEY "GLUSTERFS_GFID"
+#define GF_GFID_DIR ".gfid"
+#define GF_AUX_GFID 0xd
+
+#define GFID_ACCESS_GET_VALID_DIR_INODE(x,l,unref,lbl) do { \
+ int ret = 0; \
+ uint64_t value = 0; \
+ inode_t *tmp_inode = NULL; \
+ \
+ /* if its an entry operation, on the virtual */ \
+ /* directory inode as parent, we need to handle */ \
+ /* it properly */ \
+ if (l->parent) { \
+ ret = inode_ctx_get (l->parent, x, &value); \
+ if (ret) \
+ goto lbl; \
+ tmp_inode = (inode_t *)value; \
+ l->parent = inode_ref (tmp_inode); \
+ /* if parent is virtual, no need to handle */ \
+ /* loc->inode */ \
+ break; \
+ } \
+ \
+ /* if its an inode operation, on the virtual */ \
+ /* directory inode itself, we need to handle */ \
+ /* it properly */ \
+ if (l->inode) { \
+ ret = inode_ctx_get (l->inode, x, &value); \
+ if (ret) \
+ goto lbl; \
+ tmp_inode = (inode_t *)value; \
+ l->inode = inode_ref (tmp_inode); \
+ } \
+ \
+ } while (0)
+
+#define GFID_ACCESS_ENTRY_OP_CHECK(loc,err,lbl) do { \
+ /* need to check if the lookup is on virtual dir */ \
+ if ((loc->name && !strcmp (GF_GFID_DIR, loc->name)) && \
+ ((loc->parent && \
+ __is_root_gfid (loc->parent->gfid)) || \
+ __is_root_gfid (loc->pargfid))) { \
+ err = EEXIST; \
+ goto lbl; \
+ } \
+ \
+ /* now, check if the lookup() is on an existing */ \
+ /* entry, but on gfid-path */ \
+ if ((loc->parent && \
+ __is_gfid_access_dir (loc->parent->gfid)) || \
+ __is_gfid_access_dir (loc->pargfid)) { \
+ err = EPERM; \
+ goto lbl; \
+ } \
+ } while (0)
+
+
+typedef struct {
+ unsigned int uid;
+ unsigned int gid;
+ char gfid[UUID_CANONICAL_FORM_LEN + 1];
+ unsigned int st_mode;
+ char *bname;
+
+ union {
+ struct _symlink_in {
+ char *linkpath;
+ } __attribute__ ((__packed__)) symlink;
+
+ struct _mknod_in {
+ unsigned int mode;
+ unsigned int rdev;
+ unsigned int umask;
+ } __attribute__ ((__packed__)) mknod;
+
+ struct _mkdir_in {
+ unsigned int mode;
+ unsigned int umask;
+ } __attribute__ ((__packed__)) mkdir;
+ } __attribute__ ((__packed__)) args;
+} __attribute__((__packed__)) ga_newfile_args_t;
+
+typedef struct {
+ char gfid[UUID_CANONICAL_FORM_LEN + 1];
+ char *bname; /* a null terminated basename */
+} __attribute__((__packed__)) ga_heal_args_t;
+
+struct ga_private {
+ /* root inode's stbuf */
+ struct iatt root_stbuf;
+ struct iatt gfiddir_stbuf;
+ struct mem_pool *newfile_args_pool;
+ struct mem_pool *heal_args_pool;
+};
+typedef struct ga_private ga_private_t;
+
+struct __ga_local {
+ call_frame_t *orig_frame;
+ unsigned int uid;
+ unsigned int gid;
+ loc_t loc;
+};
+typedef struct __ga_local ga_local_t;
+
+#endif /* __GFID_ACCESS_H__ */
diff --git a/xlators/features/glupy/Makefile.am b/xlators/features/glupy/Makefile.am
new file mode 100644
index 000000000..060429ecf
--- /dev/null
+++ b/xlators/features/glupy/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src examples
+
+CLEANFILES =
diff --git a/xlators/features/glupy/doc/README.md b/xlators/features/glupy/doc/README.md
new file mode 100644
index 000000000..2d7b30ef6
--- /dev/null
+++ b/xlators/features/glupy/doc/README.md
@@ -0,0 +1,44 @@
+This is just the very start for a GlusterFS[1] meta-translator that will
+allow translator code to be written in Python. It's based on the standard
+Python embedding (not extending) techniques, plus a dash of the ctypes module.
+The interface is a pretty minimal adaptation of the dispatches and callbacks
+from the C API[2] to Python, as follows:
+
+* Dispatch functions and callbacks must be defined on an "xlator" class
+ derived from gluster.Translator so that they'll be auto-registered with
+ the C translator during initialization.
+
+* For each dispatch or callback function you want to intercept, you define a
+ Python function using the xxx\_fop\_t or xxx\_cbk\_t decorator.
+
+* The arguments for each operation are different, so you'll need to refer to
+ the C API. GlusterFS-specific types are used (though only loc\_t is fully
+ defined so far) and type correctness is enforced by ctypes.
+
+* If you do intercept a dispatch function, it is your responsibility to call
+ xxx\_wind (like STACK\_WIND in the C API but operation-specific) to pass
+ the request to the next translator. If you do not intercept a function, it
+ will default the same way as for C (pass through to the same operation with
+ the same arguments on the first child translator).
+
+* If you intercept a callback function, it is your responsibility to call
+ xxx\_unwind (like STACK\_UNWIND\_STRICT in the C API) to pass the request back
+ to the caller.
+
+So far only the lookup and create operations are handled this way, to support
+the "negative lookup" example. Now that the basic infrastructure is in place,
+adding more functions should be very quick, though with that much boilerplate I
+might pause to write a code generator. I also plan to add structure
+definitions and interfaces for some of the utility functions in libglusterfs
+(especially those having to do with inode and fd context) in the fairly near
+future. Note that you can also use ctypes to get at anything not explicitly
+exposed to Python already.
+
+_If you're coming here because of the Linux Journal article, please note that
+the code has evolved since that was written. The version that matches the
+article is here:_
+
+https://github.com/jdarcy/glupy/tree/4bbae91ba459ea46ef32f2966562492e4ca9187a
+
+[1] http://www.gluster.org
+[2] http://hekafs.org/dist/xlator_api_2.html
diff --git a/xlators/features/glupy/doc/TESTING b/xlators/features/glupy/doc/TESTING
new file mode 100644
index 000000000..e05f17f49
--- /dev/null
+++ b/xlators/features/glupy/doc/TESTING
@@ -0,0 +1,9 @@
+Loading a translator written in Python using the glupy meta translator
+-------------------------------------------------------------------------------
+'test.vol' is a simple volfile with the debug-trace Python translator on top
+of a brick. The volfile can be mounted using the following command.
+
+$ glusterfs --debug -f test.vol /path/to/mntpt
+
+If then file operations are performed on the newly mounted file system, log
+output would be printed by the Python translator on the standard output.
diff --git a/xlators/features/glupy/doc/test.vol b/xlators/features/glupy/doc/test.vol
new file mode 100644
index 000000000..0751a488c
--- /dev/null
+++ b/xlators/features/glupy/doc/test.vol
@@ -0,0 +1,10 @@
+volume vol-posix
+ type storage/posix
+ option directory /path/to/brick
+end-volume
+
+volume vol-glupy
+ type features/glupy
+ option module-name debug-trace
+ subvolumes vol-posix
+end-volume
diff --git a/xlators/features/glupy/examples/Makefile.am b/xlators/features/glupy/examples/Makefile.am
new file mode 100644
index 000000000..c26abeaaf
--- /dev/null
+++ b/xlators/features/glupy/examples/Makefile.am
@@ -0,0 +1,5 @@
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+glupyexamplesdir = $(xlatordir)/glupy
+
+glupyexamples_PYTHON = negative.py helloworld.py debug-trace.py
diff --git a/xlators/features/glupy/examples/debug-trace.py b/xlators/features/glupy/examples/debug-trace.py
new file mode 100644
index 000000000..6eef1b58b
--- /dev/null
+++ b/xlators/features/glupy/examples/debug-trace.py
@@ -0,0 +1,775 @@
+import sys
+import stat
+from uuid import UUID
+from time import strftime, localtime
+from gluster.glupy import *
+
+# This translator was written primarily to test the fop entry point definitions
+# and structure definitions in 'glupy.py'.
+
+# It is similar to the C language debug-trace translator, which logs the
+# arguments passed to the fops and their corresponding cbk functions.
+
+dl.get_id.restype = c_long
+dl.get_id.argtypes = [ POINTER(call_frame_t) ]
+
+dl.get_rootunique.restype = c_uint64
+dl.get_rootunique.argtypes = [ POINTER(call_frame_t) ]
+
+def uuid2str (gfid):
+ return str(UUID(''.join(map("{0:02x}".format, gfid))))
+
+
+def st_mode_from_ia (prot, filetype):
+ st_mode = 0
+ type_bit = 0
+ prot_bit = 0
+
+ if filetype == IA_IFREG:
+ type_bit = stat.S_IFREG
+ elif filetype == IA_IFDIR:
+ type_bit = stat.S_IFDIR
+ elif filetype == IA_IFLNK:
+ type_bit = stat.S_IFLNK
+ elif filetype == IA_IFBLK:
+ type_bit = stat.S_IFBLK
+ elif filetype == IA_IFCHR:
+ type_bit = stat.S_IFCHR
+ elif filetype == IA_IFIFO:
+ type_bit = stat.S_IFIFO
+ elif filetype == IA_IFSOCK:
+ type_bit = stat.S_IFSOCK
+ elif filetype == IA_INVAL:
+ pass
+
+
+ if prot.suid:
+ prot_bit |= stat.S_ISUID
+ if prot.sgid:
+ prot_bit |= stat.S_ISGID
+ if prot.sticky:
+ prot_bit |= stat.S_ISVTX
+
+ if prot.owner.read:
+ prot_bit |= stat.S_IRUSR
+ if prot.owner.write:
+ prot_bit |= stat.S_IWUSR
+ if prot.owner.execn:
+ prot_bit |= stat.S_IXUSR
+
+ if prot.group.read:
+ prot_bit |= stat.S_IRGRP
+ if prot.group.write:
+ prot_bit |= stat.S_IWGRP
+ if prot.group.execn:
+ prot_bit |= stat.S_IXGRP
+
+ if prot.other.read:
+ prot_bit |= stat.S_IROTH
+ if prot.other.write:
+ prot_bit |= stat.S_IWOTH
+ if prot.other.execn:
+ prot_bit |= stat.S_IXOTH
+
+ st_mode = (type_bit | prot_bit)
+
+ return st_mode
+
+
+def trace_stat2str (buf):
+ gfid = uuid2str(buf.contents.ia_gfid)
+ mode = st_mode_from_ia(buf.contents.ia_prot, buf.contents.ia_type)
+ atime_buf = strftime("[%b %d %H:%M:%S]",
+ localtime(buf.contents.ia_atime))
+ mtime_buf = strftime("[%b %d %H:%M:%S]",
+ localtime(buf.contents.ia_mtime))
+ ctime_buf = strftime("[%b %d %H:%M:%S]",
+ localtime(buf.contents.ia_ctime))
+ return ("(gfid={0:s}, ino={1:d}, mode={2:o}, nlink={3:d}, uid ={4:d}, "+
+ "gid ={5:d}, size={6:d}, blocks={7:d}, atime={8:s}, mtime={9:s}, "+
+ "ctime={10:s})").format(gfid, buf.contents.ia_no, mode,
+ buf.contents.ia_nlink,
+ buf.contents.ia_uid,
+ buf.contents.ia_gid,
+ buf.contents.ia_size,
+ buf.contents.ia_blocks,
+ atime_buf, mtime_buf,
+ ctime_buf)
+
+class xlator(Translator):
+
+ def __init__(self, c_this):
+ Translator.__init__(self, c_this)
+ self.gfids = {}
+
+ def lookup_fop(self, frame, this, loc, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(loc.contents.gfid)
+ print("GLUPY TRACE LOOKUP FOP- {0:d}: gfid={1:s}; " +
+ "path={2:s}").format(unique, gfid, loc.contents.path)
+ self.gfids[key] = gfid
+ dl.wind_lookup(frame, POINTER(xlator_t)(), loc, xdata)
+ return 0
+
+ def lookup_cbk(self, frame, cookie, this, op_ret, op_errno,
+ inode, buf, xdata, postparent):
+ unique =dl.get_rootunique(frame)
+ key =dl.get_id(frame)
+ if op_ret == 0:
+ gfid = uuid2str(buf.contents.ia_gfid)
+ statstr = trace_stat2str(buf)
+ postparentstr = trace_stat2str(postparent)
+ print("GLUPY TRACE LOOKUP CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; *buf={3:s}; " +
+ "*postparent={4:s}").format(unique, gfid,
+ op_ret, statstr,
+ postparentstr)
+ else:
+ gfid = self.gfids[key]
+ print("GLUPY TRACE LOOKUP CBK - {0:d}: gfid={1:s};" +
+ " op_ret={2:d}; op_errno={3:d}").format(unique,
+ gfid,
+ op_ret,
+ op_errno)
+ del self.gfids[key]
+ dl.unwind_lookup(frame, cookie, this, op_ret, op_errno,
+ inode, buf, xdata, postparent)
+ return 0
+
+ def create_fop(self, frame, this, loc, flags, mode, umask, fd,
+ xdata):
+ unique = dl.get_rootunique(frame)
+ gfid = uuid2str(loc.contents.gfid)
+ print("GLUPY TRACE CREATE FOP- {0:d}: gfid={1:s}; path={2:s}; " +
+ "fd={3:s}; flags=0{4:o}; mode=0{5:o}; " +
+ "umask=0{6:o}").format(unique, gfid, loc.contents.path,
+ fd, flags, mode, umask)
+ dl.wind_create(frame, POINTER(xlator_t)(), loc, flags,mode,
+ umask, fd, xdata)
+ return 0
+
+ def create_cbk(self, frame, cookie, this, op_ret, op_errno, fd,
+ inode, buf, preparent, postparent, xdata):
+ unique = dl.get_rootunique(frame)
+ if op_ret >= 0:
+ gfid = uuid2str(inode.contents.gfid)
+ statstr = trace_stat2str(buf)
+ preparentstr = trace_stat2str(preparent)
+ postparentstr = trace_stat2str(postparent)
+ print("GLUPY TRACE CREATE CBK- {0:d}: gfid={1:s};" +
+ " op_ret={2:d}; fd={3:s}; *stbuf={4:s}; " +
+ "*preparent={5:s};" +
+ " *postparent={6:s}").format(unique, gfid, op_ret,
+ fd, statstr,
+ preparentstr,
+ postparentstr)
+ else:
+ print ("GLUPY TRACE CREATE CBK- {0:d}: op_ret={1:d}; " +
+ "op_errno={2:d}").format(unique, op_ret, op_errno)
+ dl.unwind_create(frame, cookie, this, op_ret, op_errno, fd,
+ inode, buf, preparent, postparent, xdata)
+ return 0
+
+ def open_fop(self, frame, this, loc, flags, fd, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(loc.contents.inode.contents.gfid)
+ print("GLUPY TRACE OPEN FOP- {0:d}: gfid={1:s}; path={2:s}; "+
+ "flags={3:d}; fd={4:s}").format(unique, gfid,
+ loc.contents.path, flags,
+ fd)
+ self.gfids[key] = gfid
+ dl.wind_open(frame, POINTER(xlator_t)(), loc, flags, fd, xdata)
+ return 0
+
+ def open_cbk(self, frame, cookie, this, op_ret, op_errno, fd, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ print("GLUPY TRACE OPEN CBK- {0:d}: gfid={1:s}; op_ret={2:d}; "
+ "op_errno={3:d}; *fd={4:s}").format(unique, gfid,
+ op_ret, op_errno, fd)
+ del self.gfids[key]
+ dl.unwind_open(frame, cookie, this, op_ret, op_errno, fd,
+ xdata)
+ return 0
+
+ def readv_fop(self, frame, this, fd, size, offset, flags, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(fd.contents.inode.contents.gfid)
+ print("GLUPY TRACE READV FOP- {0:d}: gfid={1:s}; "+
+ "fd={2:s}; size ={3:d}; offset={4:d}; " +
+ "flags=0{5:x}").format(unique, gfid, fd, size, offset,
+ flags)
+ self.gfids[key] = gfid
+ dl.wind_readv (frame, POINTER(xlator_t)(), fd, size, offset,
+ flags, xdata)
+ return 0
+
+ def readv_cbk(self, frame, cookie, this, op_ret, op_errno, vector,
+ count, buf, iobref, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ if op_ret >= 0:
+ statstr = trace_stat2str(buf)
+ print("GLUPY TRACE READV CBK- {0:d}: gfid={1:s}, "+
+ "op_ret={2:d}; *buf={3:s};").format(unique, gfid,
+ op_ret,
+ statstr)
+
+ else:
+ print("GLUPY TRACE READV CBK- {0:d}: gfid={1:s}, "+
+ "op_ret={2:d}; op_errno={3:d}").format(unique,
+ gfid,
+ op_ret,
+ op_errno)
+ del self.gfids[key]
+ dl.unwind_readv (frame, cookie, this, op_ret, op_errno,
+ vector, count, buf, iobref, xdata)
+ return 0
+
+ def writev_fop(self, frame, this, fd, vector, count, offset, flags,
+ iobref, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(fd.contents.inode.contents.gfid)
+ print("GLUPY TRACE WRITEV FOP- {0:d}: gfid={1:s}; " +
+ "fd={2:s}; count={3:d}; offset={4:d}; " +
+ "flags=0{5:x}").format(unique, gfid, fd, count, offset,
+ flags)
+ self.gfids[key] = gfid
+ dl.wind_writev(frame, POINTER(xlator_t)(), fd, vector, count,
+ offset, flags, iobref, xdata)
+ return 0
+
+ def writev_cbk(self, frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ if op_ret >= 0:
+ preopstr = trace_stat2str(prebuf)
+ postopstr = trace_stat2str(postbuf)
+ print("GLUPY TRACE WRITEV CBK- {0:d}: op_ret={1:d}; " +
+ "*prebuf={2:s}; " +
+ "*postbuf={3:s}").format(unique, op_ret, preopstr,
+ postopstr)
+ else:
+ gfid = self.gfids[key]
+ print("GLUPY TRACE WRITEV CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; op_errno={3:d}").format(unique,
+ gfid,
+ op_ret,
+ op_errno)
+ del self.gfids[key]
+ dl.unwind_writev (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata)
+ return 0
+
+ def opendir_fop(self, frame, this, loc, fd, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(loc.contents.inode.contents.gfid)
+ print("GLUPY TRACE OPENDIR FOP- {0:d}: gfid={1:s}; path={2:s}; "+
+ "fd={3:s}").format(unique, gfid, loc.contents.path, fd)
+ self.gfids[key] = gfid
+ dl.wind_opendir(frame, POINTER(xlator_t)(), loc, fd, xdata)
+ return 0
+
+ def opendir_cbk(self, frame, cookie, this, op_ret, op_errno, fd,
+ xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ print("GLUPY TRACE OPENDIR CBK- {0:d}: gfid={1:s}; op_ret={2:d};"+
+ " op_errno={3:d}; fd={4:s}").format(unique, gfid, op_ret,
+ op_errno, fd)
+ del self.gfids[key]
+ dl.unwind_opendir(frame, cookie, this, op_ret, op_errno,
+ fd, xdata)
+ return 0
+
+ def readdir_fop(self, frame, this, fd, size, offset, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(fd.contents.inode.contents.gfid)
+ print("GLUPY TRACE READDIR FOP- {0:d}: gfid={1:s}; fd={2:s}; " +
+ "size={3:d}; offset={4:d}").format(unique, gfid, fd, size,
+ offset)
+ self.gfids[key] = gfid
+ dl.wind_readdir(frame, POINTER(xlator_t)(), fd, size, offset,
+ xdata)
+ return 0
+
+ def readdir_cbk(self, frame, cookie, this, op_ret, op_errno, buf,
+ xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ print("GLUPY TRACE READDIR CBK- {0:d}: gfid={1:s}; op_ret={2:d};"+
+ " op_errno={3:d}").format(unique, gfid, op_ret, op_errno)
+ del self.gfids[key]
+ dl.unwind_readdir(frame, cookie, this, op_ret, op_errno, buf,
+ xdata)
+ return 0
+
+ def readdirp_fop(self, frame, this, fd, size, offset, dictionary):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(fd.contents.inode.contents.gfid)
+ print("GLUPY TRACE READDIRP FOP- {0:d}: gfid={1:s}; fd={2:s}; "+
+ " size={3:d}; offset={4:d}").format(unique, gfid, fd, size,
+ offset)
+ self.gfids[key] = gfid
+ dl.wind_readdirp(frame, POINTER(xlator_t)(), fd, size, offset,
+ dictionary)
+ return 0
+
+ def readdirp_cbk(self, frame, cookie, this, op_ret, op_errno, buf,
+ xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ print("GLUPY TRACE READDIRP CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; op_errno={3:d}").format(unique, gfid,
+ op_ret, op_errno)
+ del self.gfids[key]
+ dl.unwind_readdirp(frame, cookie, this, op_ret, op_errno, buf,
+ xdata)
+ return 0
+
+ def mkdir_fop(self, frame, this, loc, mode, umask, xdata):
+ unique = dl.get_rootunique(frame)
+ gfid = uuid2str(loc.contents.inode.contents.gfid)
+ print("GLUPY TRACE MKDIR FOP- {0:d}: gfid={1:s}; path={2:s}; " +
+ "mode={3:d}; umask=0{4:o}").format(unique, gfid,
+ loc.contents.path, mode,
+ umask)
+ dl.wind_mkdir(frame, POINTER(xlator_t)(), loc, mode, umask,
+ xdata)
+ return 0
+
+ def mkdir_cbk(self, frame, cookie, this, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata):
+ unique = dl.get_rootunique(frame)
+ if op_ret == 0:
+ gfid = uuid2str(inode.contents.gfid)
+ statstr = trace_stat2str(buf)
+ preparentstr = trace_stat2str(preparent)
+ postparentstr = trace_stat2str(postparent)
+ print("GLUPY TRACE MKDIR CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; *stbuf={3:s}; *prebuf={4:s}; "+
+ "*postbuf={5:s} ").format(unique, gfid, op_ret,
+ statstr,
+ preparentstr,
+ postparentstr)
+ else:
+ print("GLUPY TRACE MKDIR CBK- {0:d}: op_ret={1:d}; "+
+ "op_errno={2:d}").format(unique, op_ret, op_errno)
+ dl.unwind_mkdir(frame, cookie, this, op_ret, op_errno, inode,
+ buf, preparent, postparent, xdata)
+ return 0
+
+ def rmdir_fop(self, frame, this, loc, flags, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(loc.contents.inode.contents.gfid)
+ print("GLUPY TRACE RMDIR FOP- {0:d}: gfid={1:s}; path={2:s}; "+
+ "flags={3:d}").format(unique, gfid, loc.contents.path,
+ flags)
+ self.gfids[key] = gfid
+ dl.wind_rmdir(frame, POINTER(xlator_t)(), loc, flags, xdata)
+ return 0
+
+ def rmdir_cbk(self, frame, cookie, this, op_ret, op_errno, preparent,
+ postparent, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ if op_ret == 0:
+ preparentstr = trace_stat2str(preparent)
+ postparentstr = trace_stat2str(postparent)
+ print("GLUPY TRACE RMDIR CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; *prebuf={3:s}; "+
+ "*postbuf={4:s}").format(unique, gfid, op_ret,
+ preparentstr,
+ postparentstr)
+ else:
+ print("GLUPY TRACE RMDIR CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; op_errno={3:d}").format(unique,
+ gfid,
+ op_ret,
+ op_errno)
+ del self.gfids[key]
+ dl.unwind_rmdir(frame, cookie, this, op_ret, op_errno,
+ preparent, postparent, xdata)
+ return 0
+
+ def stat_fop(self, frame, this, loc, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(loc.contents.inode.contents.gfid)
+ print("GLUPY TRACE STAT FOP- {0:d}: gfid={1:s}; " +
+ " path={2:s}").format(unique, gfid, loc.contents.path)
+ self.gfids[key] = gfid
+ dl.wind_stat(frame, POINTER(xlator_t)(), loc, xdata)
+ return 0
+
+ def stat_cbk(self, frame, cookie, this, op_ret, op_errno, buf,
+ xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ if op_ret == 0:
+ statstr = trace_stat2str(buf)
+ print("GLUPY TRACE STAT CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; *buf={3:s};").format(unique,
+ gfid,
+ op_ret,
+ statstr)
+ else:
+ print("GLUPY TRACE STAT CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; op_errno={3:d}").format(unique,
+ gfid,
+ op_ret,
+ op_errno)
+ del self.gfids[key]
+ dl.unwind_stat(frame, cookie, this, op_ret, op_errno,
+ buf, xdata)
+ return 0
+
+ def fstat_fop(self, frame, this, fd, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(fd.contents.inode.contents.gfid)
+ print("GLUPY TRACE FSTAT FOP- {0:d}: gfid={1:s}; " +
+ "fd={2:s}").format(unique, gfid, fd)
+ self.gfids[key] = gfid
+ dl.wind_fstat(frame, POINTER(xlator_t)(), fd, xdata)
+ return 0
+
+ def fstat_cbk(self, frame, cookie, this, op_ret, op_errno, buf,
+ xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ if op_ret == 0:
+ statstr = trace_stat2str(buf)
+ print("GLUPY TRACE FSTAT CBK- {0:d}: gfid={1:s} "+
+ " op_ret={2:d}; *buf={3:s}").format(unique,
+ gfid,
+ op_ret,
+ statstr)
+ else:
+ print("GLUPY TRACE FSTAT CBK- {0:d}: gfid={1:s} "+
+ "op_ret={2:d}; op_errno={3:d}").format(unique.
+ gfid,
+ op_ret,
+ op_errno)
+ del self.gfids[key]
+ dl.unwind_fstat(frame, cookie, this, op_ret, op_errno,
+ buf, xdata)
+ return 0
+
+ def statfs_fop(self, frame, this, loc, xdata):
+ unique = dl.get_rootunique(frame)
+ if loc.contents.inode:
+ gfid = uuid2str(loc.contents.inode.contents.gfid)
+ else:
+ gfid = "0"
+ print("GLUPY TRACE STATFS FOP- {0:d}: gfid={1:s}; "+
+ "path={2:s}").format(unique, gfid, loc.contents.path)
+ dl.wind_statfs(frame, POINTER(xlator_t)(), loc, xdata)
+ return 0
+
+ def statfs_cbk(self, frame, cookie, this, op_ret, op_errno, buf,
+ xdata):
+ unique = dl.get_rootunique(frame)
+ if op_ret == 0:
+ #TBD: print buf (pointer to an iovec type object)
+ print("GLUPY TRACE STATFS CBK {0:d}: "+
+ "op_ret={1:d}").format(unique, op_ret)
+ else:
+ print("GLUPY TRACE STATFS CBK- {0:d}"+
+ "op_ret={1:d}; op_errno={2:d}").format(unique,
+ op_ret,
+ op_errno)
+ dl.unwind_statfs(frame, cookie, this, op_ret, op_errno,
+ buf, xdata)
+ return 0
+
+ def getxattr_fop(self, frame, this, loc, name, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(loc.contents.inode.contents.gfid)
+ print("GLUPY TRACE GETXATTR FOP- {0:d}: gfid={1:s}; path={2:s};"+
+ " name={3:s}").format(unique, gfid, loc.contents.path,
+ name)
+ self.gfids[key]=gfid
+ dl.wind_getxattr(frame, POINTER(xlator_t)(), loc, name, xdata)
+ return 0
+
+ def getxattr_cbk(self, frame, cookie, this, op_ret, op_errno,
+ dictionary, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ print("GLUPY TRACE GETXATTR CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; op_errno={3:d}; "+
+ " dictionary={4:s}").format(unique, gfid, op_ret, op_errno,
+ dictionary)
+ del self.gfids[key]
+ dl.unwind_getxattr(frame, cookie, this, op_ret, op_errno,
+ dictionary, xdata)
+ return 0
+
+ def fgetxattr_fop(self, frame, this, fd, name, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(fd.contents.inode.contents.gfid)
+ print("GLUPY TRACE FGETXATTR FOP- {0:d}: gfid={1:s}; fd={2:s}; "+
+ "name={3:s}").format(unique, gfid, fd, name)
+ self.gfids[key] = gfid
+ dl.wind_fgetxattr(frame, POINTER(xlator_t)(), fd, name, xdata)
+ return 0
+
+ def fgetxattr_cbk(self, frame, cookie, this, op_ret, op_errno,
+ dictionary, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ print("GLUPY TRACE FGETXATTR CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; op_errno={3:d};"+
+ " dictionary={4:s}").format(unique, gfid, op_ret,
+ op_errno, dictionary)
+ del self.gfids[key]
+ dl.unwind_fgetxattr(frame, cookie, this, op_ret, op_errno,
+ dictionary, xdata)
+ return 0
+
+ def setxattr_fop(self, frame, this, loc, dictionary, flags, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(loc.contents.inode.contents.gfid)
+ print("GLUPY TRACE SETXATTR FOP- {0:d}: gfid={1:s}; path={2:s};"+
+ " flags={3:d}").format(unique, gfid, loc.contents.path,
+ flags)
+ self.gfids[key] = gfid
+ dl.wind_setxattr(frame, POINTER(xlator_t)(), loc, dictionary,
+ flags, xdata)
+ return 0
+
+ def setxattr_cbk(self, frame, cookie, this, op_ret, op_errno, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ print("GLUPY TRACE SETXATTR CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; op_errno={3:d}").format(unique, gfid,
+ op_ret, op_errno)
+ del self.gfids[key]
+ dl.unwind_setxattr(frame, cookie, this, op_ret, op_errno,
+ xdata)
+ return 0
+
+ def fsetxattr_fop(self, frame, this, fd, dictionary, flags, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(fd.contents.inode.contents.gfid)
+ print("GLUPY TRACE FSETXATTR FOP- {0:d}: gfid={1:s}; fd={2:p}; "+
+ "flags={3:d}").format(unique, gfid, fd, flags)
+ self.gfids[key] = gfid
+ dl.wind_fsetxattr(frame, POINTER(xlator_t)(), fd, dictionary,
+ flags, xdata)
+ return 0
+
+ def fsetxattr_cbk(self, frame, cookie, this, op_ret, op_errno, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ print("GLUPY TRACE FSETXATTR CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; op_errno={3:d}").format(unique, gfid,
+ op_ret, op_errno)
+ del self.gfids[key]
+ dl.unwind_fsetxattr(frame, cookie, this, op_ret, op_errno,
+ xdata)
+ return 0
+
+ def removexattr_fop(self, frame, this, loc, name, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(loc.contents.inode.contents.gfid)
+ print("GLUPY TRACE REMOVEXATTR FOP- {0:d}: gfid={1:s}; "+
+ "path={2:s}; name={3:s}").format(unique, gfid,
+ loc.contents.path,
+ name)
+ self.gfids[key] = gfid
+ dl.wind_removexattr(frame, POINTER(xlator_t)(), loc, name,
+ xdata)
+ return 0
+
+ def removexattr_cbk(self, frame, cookie, this, op_ret, op_errno,
+ xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ print("GLUPY TRACE REMOVEXATTR CBK- {0:d}: gfid={1:s} "+
+ " op_ret={2:d}; op_errno={3:d}").format(unique, gfid,
+ op_ret, op_errno)
+ del self.gfids[key]
+ dl.unwind_removexattr(frame, cookie, this, op_ret, op_errno,
+ xdata)
+ return 0
+
+ def link_fop(self, frame, this, oldloc, newloc, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ if (newloc.contents.inode):
+ newgfid = uuid2str(newloc.contents.inode.contents.gfid)
+ else:
+ newgfid = "0"
+ oldgfid = uuid2str(oldloc.contents.inode.contents.gfid)
+ print("GLUPY TRACE LINK FOP-{0:d}: oldgfid={1:s}; oldpath={2:s};"+
+ "newgfid={3:s};"+
+ "newpath={4:s}").format(unique, oldgfid,
+ oldloc.contents.path,
+ newgfid,
+ newloc.contents.path)
+ self.gfids[key] = oldgfid
+ dl.wind_link(frame, POINTER(xlator_t)(), oldloc, newloc,
+ xdata)
+ return 0
+
+ def link_cbk(self, frame, cookie, this, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ if op_ret == 0:
+ statstr = trace_stat2str(buf)
+ preparentstr = trace_stat2str(preparent)
+ postparentstr = trace_stat2str(postparent)
+ print("GLUPY TRACE LINK CBK- {0:d}: op_ret={1:d} "+
+ "*stbuf={2:s}; *prebuf={3:s}; "+
+ "*postbuf={4:s} ").format(unique, op_ret, statstr,
+ preparentstr,
+ postparentstr)
+ else:
+ print("GLUPY TRACE LINK CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; "+
+ "op_errno={3:d}").format(unique, gfid,
+ op_ret, op_errno)
+ del self.gfids[key]
+ dl.unwind_link(frame, cookie, this, op_ret, op_errno, inode,
+ buf, preparent, postparent, xdata)
+ return 0
+
+ def unlink_fop(self, frame, this, loc, xflag, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(loc.contents.inode.contents.gfid)
+ print("GLUPY TRACE UNLINK FOP- {0:d}; gfid={1:s}; path={2:s}; "+
+ "flag={3:d}").format(unique, gfid, loc.contents.path,
+ xflag)
+ self.gfids[key] = gfid
+ dl.wind_unlink(frame, POINTER(xlator_t)(), loc, xflag,
+ xdata)
+ return 0
+
+ def unlink_cbk(self, frame, cookie, this, op_ret, op_errno,
+ preparent, postparent, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ if op_ret == 0:
+ preparentstr = trace_stat2str(preparent)
+ postparentstr = trace_stat2str(postparent)
+ print("GLUPY TRACE UNLINK CBK- {0:d}: gfid ={1:s}; "+
+ "op_ret={2:d}; *prebuf={3:s}; "+
+ "*postbuf={4:s} ").format(unique, gfid, op_ret,
+ preparentstr,
+ postparentstr)
+ else:
+ print("GLUPY TRACE UNLINK CBK: {0:d}: gfid ={1:s}; "+
+ "op_ret={2:d}; "+
+ "op_errno={3:d}").format(unique, gfid, op_ret,
+ op_errno)
+ del self.gfids[key]
+ dl.unwind_unlink(frame, cookie, this, op_ret, op_errno,
+ preparent, postparent, xdata)
+ return 0
+
+ def readlink_fop(self, frame, this, loc, size, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(loc.contents.inode.contents.gfid)
+ print("GLUPY TRACE READLINK FOP- {0:d}: gfid={1:s}; path={2:s};"+
+ " size={3:d}").format(unique, gfid, loc.contents.path,
+ size)
+ self.gfids[key] = gfid
+ dl.wind_readlink(frame, POINTER(xlator_t)(), loc, size,
+ xdata)
+ return 0
+
+ def readlink_cbk(self, frame, cookie, this, op_ret, op_errno,
+ buf, stbuf, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ if op_ret == 0:
+ statstr = trace_stat2str(stbuf)
+ print("GLUPY TRACE READLINK CBK- {0:d}: gfid={1:s} "+
+ " op_ret={2:d}; op_errno={3:d}; *prebuf={4:s}; "+
+ "*postbuf={5:s} ").format(unique, gfid,
+ op_ret, op_errno,
+ buf, statstr)
+ else:
+ print("GLUPY TRACE READLINK CBK- {0:d}: gfid={1:s} "+
+ " op_ret={2:d}; op_errno={3:d}").format(unique,
+ gfid,
+ op_ret,
+ op_errno)
+ del self.gfids[key]
+ dl.unwind_readlink(frame, cookie, this, op_ret, op_errno, buf,
+ stbuf, xdata)
+ return 0
+
+ def symlink_fop(self, frame, this, linkpath, loc, umask, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = uuid2str(loc.contents.inode.contents.gfid)
+ print("GLUPY TRACE SYMLINK FOP- {0:d}: gfid={1:s}; "+
+ "linkpath={2:s}; path={3:s};"+
+ "umask=0{4:o}").format(unique, gfid, linkpath,
+ loc.contents.path, umask)
+ self.gfids[key] = gfid
+ dl.wind_symlink(frame, POINTER(xlator_t)(), linkpath, loc,
+ umask, xdata)
+ return 0
+
+ def symlink_cbk(self, frame, cookie, this, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata):
+ unique = dl.get_rootunique(frame)
+ key = dl.get_id(frame)
+ gfid = self.gfids[key]
+ if op_ret == 0:
+ statstr = trace_stat2str(buf)
+ preparentstr = trace_stat2str(preparent)
+ postparentstr = trace_stat2str(postparent)
+ print("GLUPY TRACE SYMLINK CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; *stbuf={3:s}; *preparent={4:s}; "+
+ "*postparent={5:s}").format(unique, gfid,
+ op_ret, statstr,
+ preparentstr,
+ postparentstr)
+ else:
+ print("GLUPY TRACE SYMLINK CBK- {0:d}: gfid={1:s}; "+
+ "op_ret={2:d}; op_errno={3:d}").format(unique,
+ gfid,
+ op_ret,
+ op_errno)
+ del self.gfids[key]
+ dl.unwind_symlink(frame, cookie, this, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata)
+ return 0
diff --git a/xlators/features/glupy/examples/helloworld.py b/xlators/features/glupy/examples/helloworld.py
new file mode 100644
index 000000000..b565a4e5b
--- /dev/null
+++ b/xlators/features/glupy/examples/helloworld.py
@@ -0,0 +1,19 @@
+import sys
+from gluster.glupy import *
+
+class xlator (Translator):
+
+ def __init__(self, c_this):
+ Translator.__init__(self, c_this)
+
+ def lookup_fop(self, frame, this, loc, xdata):
+ print "Python xlator: Hello!"
+ dl.wind_lookup(frame, POINTER(xlator_t)(), loc, xdata)
+ return 0
+
+ def lookup_cbk(self, frame, cookie, this, op_ret, op_errno, inode, buf,
+ xdata, postparent):
+ print "Python xlator: Hello again!"
+ dl.unwind_lookup(frame, cookie, this, op_ret, op_errno, inode, buf,
+ xdata, postparent)
+ return 0
diff --git a/xlators/features/glupy/examples/negative.py b/xlators/features/glupy/examples/negative.py
new file mode 100644
index 000000000..e7a4fc07c
--- /dev/null
+++ b/xlators/features/glupy/examples/negative.py
@@ -0,0 +1,91 @@
+import sys
+from uuid import UUID
+from gluster.glupy import *
+
+# Negative-lookup-caching example. If a file wasn't there the last time we
+# looked, it's probably still not there. This translator keeps track of
+# those failed lookups for us, and returns ENOENT without needing to pass the
+# call any further for repeated requests.
+
+# If we were doing this for real, we'd need separate caches for each xlator
+# instance. The easiest way to do this would be to have xlator.__init__
+# "register" each instance in a module-global dict, with the key as the C
+# translator address and the value as the xlator object itself. For testing
+# and teaching, it's sufficient just to have one cache. The keys are parent
+# GFIDs, and the entries are lists of names within that parent that we know
+# don't exist.
+cache = {}
+
+# TBD: we need a better way of handling per-request data (frame->local in C).
+dl.get_id.restype = c_long
+dl.get_id.argtypes = [ POINTER(call_frame_t) ]
+
+def uuid2str (gfid):
+ return str(UUID(''.join(map("{0:02x}".format, gfid))))
+
+class xlator (Translator):
+
+ def __init__ (self, c_this):
+ self.requests = {}
+ Translator.__init__(self,c_this)
+
+ def lookup_fop (self, frame, this, loc, xdata):
+ pargfid = uuid2str(loc.contents.pargfid)
+ print "lookup FOP: %s:%s" % (pargfid, loc.contents.name)
+ # Check the cache.
+ if cache.has_key(pargfid):
+ if loc.contents.name in cache[pargfid]:
+ print "short-circuiting for %s:%s" % (pargfid,
+ loc.contents.name)
+ dl.unwind_lookup(frame,0,this,-1,2,None,None,None,None)
+ return 0
+ key = dl.get_id(frame)
+ self.requests[key] = (pargfid, loc.contents.name[:])
+ # TBD: get real child xl from init, pass it here
+ dl.wind_lookup(frame,POINTER(xlator_t)(),loc,xdata)
+ return 0
+
+ def lookup_cbk (self, frame, cookie, this, op_ret, op_errno, inode, buf,
+ xdata, postparent):
+ print "lookup CBK: %d (%d)" % (op_ret, op_errno)
+ key = dl.get_id(frame)
+ pargfid, name = self.requests[key]
+ # Update the cache.
+ if op_ret == 0:
+ print "found %s, removing from cache" % name
+ if cache.has_key(pargfid):
+ cache[pargfid].discard(name)
+ elif op_errno == 2: # ENOENT
+ print "failed to find %s, adding to cache" % name
+ if cache.has_key(pargfid):
+ cache[pargfid].add(name)
+ else:
+ cache[pargfid] = set([name])
+ del self.requests[key]
+ dl.unwind_lookup(frame,cookie,this,op_ret,op_errno,
+ inode,buf,xdata,postparent)
+ return 0
+
+ def create_fop (self, frame, this, loc, flags, mode, umask, fd, xdata):
+ pargfid = uuid2str(loc.contents.pargfid)
+ print "create FOP: %s:%s" % (pargfid, loc.contents.name)
+ key = dl.get_id(frame)
+ self.requests[key] = (pargfid, loc.contents.name[:])
+ # TBD: get real child xl from init, pass it here
+ dl.wind_create(frame,POINTER(xlator_t)(),loc,flags,mode,umask,fd,xdata)
+ return 0
+
+ def create_cbk (self, frame, cookie, this, op_ret, op_errno, fd, inode,
+ buf, preparent, postparent, xdata):
+ print "create CBK: %d (%d)" % (op_ret, op_errno)
+ key = dl.get_id(frame)
+ pargfid, name = self.requests[key]
+ # Update the cache.
+ if op_ret == 0:
+ print "created %s, removing from cache" % name
+ if cache.has_key(pargfid):
+ cache[pargfid].discard(name)
+ del self.requests[key]
+ dl.unwind_create(frame,cookie,this,op_ret,op_errno,fd,inode,buf,
+ preparent,postparent,xdata)
+ return 0
diff --git a/xlators/features/glupy/src/Makefile.am b/xlators/features/glupy/src/Makefile.am
new file mode 100644
index 000000000..ae7b6d14d
--- /dev/null
+++ b/xlators/features/glupy/src/Makefile.am
@@ -0,0 +1,21 @@
+xlator_LTLIBRARIES = glupy.la
+
+# Ensure GLUSTER_PYTHON_PATH is passed to glupy.so
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+glupydir = $(xlatordir)/glupy
+AM_CPPFLAGS = $(PYTHONDEV_CPPFLAGS) $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src -isystem $(BUILD_PYTHON_INC)
+AM_CFLAGS = $(PYTHONDEV_CFLAGS) -Wall -fno-strict-aliasing -DGLUSTER_PYTHON_PATH=\"$(glupydir)\" $(GF_CFLAGS)
+
+# Flags to build glupy.so with
+glupy_la_LDFLAGS = $(PYTHONDEV_LDFLAGS) -module -avoid-version -shared -nostartfiles
+glupy_la_SOURCES = glupy.c
+glupy_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
+ -lpthread -l$(BUILD_PYTHON_LIB)
+
+noinst_HEADERS = glupy.h
+
+# Install glupy.py into the Python site-packages area
+pyglupydir = $(pythondir)/gluster
+pyglupy_PYTHON = glupy.py
+
+CLEANFILES =
diff --git a/xlators/features/glupy/src/glupy.c b/xlators/features/glupy/src/glupy.c
new file mode 100644
index 000000000..7492124dd
--- /dev/null
+++ b/xlators/features/glupy/src/glupy.c
@@ -0,0 +1,2490 @@
+/*
+ Copyright (c) 2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#include <ctype.h>
+#include <sys/uio.h>
+#include <Python.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "logging.h"
+#include "defaults.h"
+
+#include "glupy.h"
+
+/* UTILITY FUNCTIONS FOR FOP-SPECIFIC CODE */
+
+pthread_key_t gil_init_key;
+
+PyGILState_STATE
+glupy_enter (void)
+{
+#if 0
+ if (!pthread_getspecific(gil_init_key)) {
+ PyEval_ReleaseLock();
+ (void)pthread_setspecific(gil_init_key,(void *)1);
+ }
+#endif
+
+ return PyGILState_Ensure();
+}
+
+void
+glupy_leave (PyGILState_STATE gstate)
+{
+ PyGILState_Release(gstate);
+}
+
+/* FOP: LOOKUP */
+
+int32_t
+glupy_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_LOOKUP]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_lookup_cbk_t)(priv->cbks[GLUPY_LOOKUP]))(
+ frame, cookie, this, op_ret, op_errno,
+ inode, buf, xdata, postparent);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf,
+ xdata, postparent);
+ return 0;
+}
+
+int32_t
+glupy_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_LOOKUP]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_lookup_t)(priv->fops[GLUPY_LOOKUP]))(
+ frame, this, loc, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
+ return 0;
+}
+
+void
+wind_lookup (call_frame_t *frame, xlator_t *xl, loc_t *loc, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND(frame,glupy_lookup_cbk,xl,xl->fops->lookup,loc,xdata);
+}
+
+void
+unwind_lookup (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT(lookup,frame,op_ret,op_errno,
+ inode,buf,xdata,postparent);
+}
+
+void
+set_lookup_fop (long py_this, fop_lookup_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_LOOKUP] = (long)fop;
+}
+
+void
+set_lookup_cbk (long py_this, fop_lookup_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_LOOKUP] = (long)cbk;
+}
+
+/* FOP: CREATE */
+
+int32_t
+glupy_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_CREATE]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_create_cbk_t)(priv->cbks[GLUPY_CREATE]))(
+ frame, cookie, this, op_ret, op_errno,
+ fd, inode, buf, preparent, postparent, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+glupy_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_CREATE]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_create_t)(priv->fops[GLUPY_CREATE]))(
+ frame, this, loc, flags, mode, umask, fd, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_create_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->create, loc, flags, mode, umask,
+ fd, xdata);
+ return 0;
+}
+
+void
+wind_create (call_frame_t *frame, xlator_t *xl, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_create_cbk,xl, xl->fops->create,
+ loc, flags, mode, umask, fd, xdata);
+}
+
+void
+unwind_create (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, buf,
+ preparent, postparent, xdata);
+}
+
+void
+set_create_fop (long py_this, fop_create_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_CREATE] = (long)fop;
+}
+
+void
+set_create_cbk (long py_this, fop_create_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_CREATE] = (long)cbk;
+}
+
+/* FOP: OPEN */
+
+int32_t
+glupy_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_OPEN]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_open_cbk_t)(priv->cbks[GLUPY_OPEN]))(
+ frame, cookie, this, op_ret, op_errno,
+ fd, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata);
+ return 0;
+}
+
+int32_t
+glupy_open (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int32_t flags, fd_t *fd, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_OPEN]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_open_t)(priv->fops[GLUPY_OPEN]))(
+ frame, this, loc, flags, fd, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_open_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+ return 0;
+}
+
+void
+wind_open (call_frame_t *frame, xlator_t *xl, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_open_cbk, xl, xl->fops->open, loc, flags,
+ fd, xdata);
+}
+
+void
+unwind_open (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata);
+}
+
+void
+set_open_fop (long py_this, fop_open_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+ priv->fops[GLUPY_OPEN] = (long)fop;
+}
+
+void
+set_open_cbk (long py_this, fop_open_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+ priv->cbks[GLUPY_OPEN] = (long)cbk;
+}
+
+/* FOP: READV */
+
+int32_t
+glupy_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iovec *vector,
+ int32_t count, struct iatt *stbuf, struct iobref *iobref,
+ dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_READV]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_readv_cbk_t)(priv->cbks[GLUPY_READV]))(
+ frame, cookie, this, op_ret, op_errno,
+ vector, count, stbuf, iobref, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector,
+ count, stbuf, iobref, xdata);
+ return 0;
+}
+
+int32_t
+glupy_readv (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_READV]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_readv_t)(priv->fops[GLUPY_READV]))(
+ frame, this, fd, size, offset, flags, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_readv_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readv, fd, size, offset,
+ flags, xdata);
+ return 0;
+}
+
+void
+wind_readv (call_frame_t *frame, xlator_t *xl, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_readv_cbk, xl, xl->fops->readv, fd, size,
+ offset, flags, xdata);
+}
+
+void
+unwind_readv (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iovec *vector,
+ int32_t count, struct iatt *stbuf, struct iobref *iobref,
+ dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector,
+ count, stbuf, iobref, xdata);
+}
+
+void
+set_readv_fop (long py_this, fop_readv_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+ priv->fops[GLUPY_READV] = (long)fop;
+}
+
+void
+set_readv_cbk (long py_this, fop_readv_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+ priv->cbks[GLUPY_READV] = (long)cbk;
+}
+
+/* FOP: WRITEV */
+
+int32_t
+glupy_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_WRITEV]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_writev_cbk_t)(priv->cbks[GLUPY_WRITEV]))(
+ frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ return 0;
+}
+
+int32_t
+glupy_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_WRITEV]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_writev_t)(priv->fops[GLUPY_WRITEV]))(
+ frame, this, fd, vector, count, offset, flags,
+ iobref, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count,
+ offset, flags, iobref, xdata);
+ return 0;
+}
+
+void
+wind_writev (call_frame_t *frame, xlator_t *xl, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_writev_cbk, xl, xl->fops->writev, fd, vector,
+ count, offset, flags, iobref, xdata);
+}
+
+void
+unwind_writev (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+}
+
+void
+set_writev_fop (long py_this, fop_writev_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+ priv->fops[GLUPY_WRITEV] = (long)fop;
+}
+
+void
+set_writev_cbk (long py_this, fop_writev_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+ priv->cbks[GLUPY_WRITEV] = (long)cbk;
+}
+
+
+/* FOP: OPENDIR */
+
+int32_t
+glupy_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_OPENDIR]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_opendir_cbk_t)(priv->cbks[GLUPY_OPENDIR]))(
+ frame, cookie, this, op_ret, op_errno,
+ fd, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, xdata);
+ return 0;
+}
+
+int32_t
+glupy_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ fd_t *fd, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_OPENDIR]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_opendir_t)(priv->fops[GLUPY_OPENDIR]))(
+ frame, this, loc, fd, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_opendir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+ return 0;
+}
+
+void
+wind_opendir (call_frame_t *frame, xlator_t *xl, loc_t *loc, fd_t *fd, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND(frame,glupy_opendir_cbk,xl,xl->fops->opendir,loc,fd,xdata);
+}
+
+void
+unwind_opendir (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT(opendir,frame,op_ret,op_errno,
+ fd,xdata);
+}
+
+void
+set_opendir_fop (long py_this, fop_opendir_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_OPENDIR] = (long)fop;
+}
+
+void
+set_opendir_cbk (long py_this, fop_opendir_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_OPENDIR] = (long)cbk;
+}
+
+/* FOP: READDIR */
+
+int32_t
+glupy_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_READDIR]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_readdir_cbk_t)(priv->cbks[GLUPY_READDIR]))(
+ frame, cookie, this, op_ret, op_errno,
+ entries, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, entries,
+ xdata);
+ return 0;
+}
+
+int32_t
+glupy_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ size_t size, off_t offset, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_READDIR]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_readdir_t)(priv->fops[GLUPY_READDIR]))(
+ frame, this, fd, size, offset, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_readdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdir,fd, size, offset, xdata);
+ return 0;
+}
+
+void
+wind_readdir(call_frame_t *frame, xlator_t *xl, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND(frame,glupy_readdir_cbk,xl,xl->fops->readdir,fd,size,offset,xdata);
+}
+
+void
+unwind_readdir (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT(readdir,frame,op_ret,op_errno,
+ entries, xdata);
+}
+
+void
+set_readdir_fop (long py_this, fop_readdir_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_READDIR] = (long)fop;
+}
+
+void
+set_readdir_cbk (long py_this, fop_readdir_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_READDIR] = (long)cbk;
+}
+
+
+/* FOP: READDIRP */
+
+int32_t
+glupy_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_READDIRP]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_readdirp_cbk_t)(priv->cbks[GLUPY_READDIRP]))(
+ frame, cookie, this, op_ret, op_errno,
+ entries, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries,
+ xdata);
+ return 0;
+}
+
+int32_t
+glupy_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ size_t size, off_t offset, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_READDIRP]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_readdirp_t)(priv->fops[GLUPY_READDIRP]))(
+ frame, this, fd, size, offset, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_readdirp_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp,fd, size, offset, xdata);
+ return 0;
+}
+
+void
+wind_readdirp (call_frame_t *frame, xlator_t *xl, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND(frame,glupy_readdirp_cbk,xl,xl->fops->readdirp,fd,size,offset,xdata);
+}
+
+void
+unwind_readdirp (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT(readdirp,frame,op_ret,op_errno,
+ entries, xdata);
+}
+
+void
+set_readdirp_fop (long py_this, fop_readdirp_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_READDIRP] = (long)fop;
+}
+
+void
+set_readdirp_cbk (long py_this, fop_readdirp_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_READDIRP] = (long)cbk;
+}
+
+
+/* FOP:STAT */
+
+int32_t
+glupy_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_STAT]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_stat_cbk_t)(priv->cbks[GLUPY_STAT]))(
+ frame, cookie, this, op_ret, op_errno,
+ buf, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+int32_t
+glupy_stat (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_STAT]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_stat_t)(priv->fops[GLUPY_STAT]))(
+ frame, this, loc, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_stat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->stat, loc, xdata);
+ return 0;
+}
+
+void
+wind_stat (call_frame_t *frame, xlator_t *xl, loc_t *loc, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND(frame,glupy_stat_cbk,xl,xl->fops->stat,loc,xdata);
+}
+
+void
+unwind_stat (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT(stat,frame,op_ret,op_errno,
+ buf,xdata);
+}
+
+void
+set_stat_fop (long py_this, fop_stat_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_STAT] = (long)fop;
+}
+
+void
+set_stat_cbk (long py_this, fop_stat_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_STAT] = (long)cbk;
+}
+
+
+/* FOP: FSTAT */
+
+int32_t
+glupy_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_FSTAT]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_fstat_cbk_t)(priv->cbks[GLUPY_FSTAT]))(
+ frame, cookie, this, op_ret, op_errno,
+ buf, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+int32_t
+glupy_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_FSTAT]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_fstat_t)(priv->fops[GLUPY_FSTAT]))(
+ frame, this, fd, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_fstat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat, fd, xdata);
+ return 0;
+}
+
+void
+wind_fstat (call_frame_t *frame, xlator_t *xl, fd_t *fd, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND(frame,glupy_fstat_cbk,xl,xl->fops->fstat,fd,xdata);
+}
+
+void
+unwind_fstat (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT(fstat,frame,op_ret,op_errno,
+ buf,xdata);
+}
+
+void
+set_fstat_fop (long py_this, fop_fstat_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_FSTAT] = (long)fop;
+}
+
+void
+set_fstat_cbk (long py_this, fop_fstat_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_FSTAT] = (long)cbk;
+}
+
+/* FOP:STATFS */
+
+int32_t
+glupy_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct statvfs *buf, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_STATFS]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_statfs_cbk_t)(priv->cbks[GLUPY_STATFS]))(
+ frame, cookie, this, op_ret, op_errno,
+ buf, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+int32_t
+glupy_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_STATFS]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_statfs_t)(priv->fops[GLUPY_STATFS]))(
+ frame, this, loc, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_statfs_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->statfs, loc, xdata);
+ return 0;
+}
+
+void
+wind_statfs (call_frame_t *frame, xlator_t *xl, loc_t *loc, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND(frame,glupy_statfs_cbk,xl,xl->fops->statfs,loc,xdata);
+}
+
+void
+unwind_statfs (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct statvfs *buf,
+ dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT(statfs,frame,op_ret,op_errno,
+ buf,xdata);
+}
+
+void
+set_statfs_fop (long py_this, fop_statfs_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_STATFS] = (long)fop;
+}
+
+void
+set_statfs_cbk (long py_this, fop_statfs_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_STATFS] = (long)cbk;
+}
+
+
+/* FOP: SETXATTR */
+
+int32_t
+glupy_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_SETXATTR]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_setxattr_cbk_t)(priv->cbks[GLUPY_SETXATTR]))(
+ frame, cookie, this, op_ret, op_errno,
+ xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int32_t
+glupy_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_SETXATTR]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_setxattr_t)(priv->fops[GLUPY_SETXATTR]))(
+ frame, this, loc, dict, flags, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_setxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setxattr, loc, dict,
+ flags, xdata);
+ return 0;
+}
+
+void
+wind_setxattr (call_frame_t *frame, xlator_t *xl, loc_t *loc,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_setxattr_cbk, xl, xl->fops->setxattr,
+ loc, dict, flags, xdata);
+}
+
+
+void
+unwind_setxattr (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata);
+
+}
+
+void
+set_setxattr_fop (long py_this, fop_setxattr_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_SETXATTR] = (long)fop;
+}
+
+void
+set_setxattr_cbk (long py_this, fop_setxattr_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_SETXATTR] = (long)cbk;
+}
+
+/* FOP: GETXATTR */
+
+int32_t
+glupy_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_GETXATTR]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_getxattr_cbk_t)(priv->cbks[GLUPY_GETXATTR]))(
+ frame, cookie, this, op_ret, op_errno, dict,
+ xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict,
+ xdata);
+ return 0;
+}
+
+int32_t
+glupy_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_GETXATTR]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_getxattr_t)(priv->fops[GLUPY_GETXATTR]))(
+ frame, this, loc, name, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_getxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr, loc, name,
+ xdata);
+ return 0;
+}
+
+void
+wind_getxattr (call_frame_t *frame, xlator_t *xl, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_getxattr_cbk, xl, xl->fops->getxattr,
+ loc, name, xdata);
+}
+
+
+void
+unwind_getxattr (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict,
+ xdata);
+
+}
+
+
+void
+set_getxattr_fop (long py_this, fop_getxattr_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_GETXATTR] = (long)fop;
+}
+
+
+void
+set_getxattr_cbk (long py_this, fop_getxattr_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_GETXATTR] = (long)cbk;
+}
+
+/* FOP: FSETXATTR */
+
+int32_t
+glupy_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_FSETXATTR]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_fsetxattr_cbk_t)(priv->cbks[GLUPY_FSETXATTR]))(
+ frame, cookie, this, op_ret, op_errno,
+ xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int32_t
+glupy_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_FSETXATTR]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_fsetxattr_t)(priv->fops[GLUPY_FSETXATTR]))(
+ frame, this, fd, dict, flags, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_fsetxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr, fd, dict,
+ flags, xdata);
+ return 0;
+}
+
+void
+wind_fsetxattr (call_frame_t *frame, xlator_t *xl, fd_t *fd,
+ dict_t *dict, int32_t flags, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_fsetxattr_cbk, xl, xl->fops->fsetxattr,
+ fd, dict, flags, xdata);
+}
+
+
+void
+unwind_fsetxattr (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata);
+
+}
+
+void
+set_fsetxattr_fop (long py_this, fop_fsetxattr_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_FSETXATTR] = (long)fop;
+}
+
+void
+set_fsetxattr_cbk (long py_this, fop_fsetxattr_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_FSETXATTR] = (long)cbk;
+}
+
+/* FOP: FGETXATTR */
+
+int32_t
+glupy_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_FGETXATTR]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_fgetxattr_cbk_t)(priv->cbks[GLUPY_FGETXATTR]))(
+ frame, cookie, this, op_ret, op_errno, dict,
+ xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict,
+ xdata);
+ return 0;
+}
+
+int32_t
+glupy_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_FGETXATTR]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_fgetxattr_t)(priv->fops[GLUPY_FGETXATTR]))(
+ frame, this, fd, name, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_fgetxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr, fd, name,
+ xdata);
+ return 0;
+}
+
+void
+wind_fgetxattr (call_frame_t *frame, xlator_t *xl, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_fgetxattr_cbk, xl, xl->fops->fgetxattr,
+ fd, name, xdata);
+}
+
+
+void
+unwind_fgetxattr (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict,
+ xdata);
+
+}
+
+
+void
+set_fgetxattr_fop (long py_this, fop_fgetxattr_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_FGETXATTR] = (long)fop;
+}
+
+
+void
+set_fgetxattr_cbk (long py_this, fop_fgetxattr_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_FGETXATTR] = (long)cbk;
+}
+
+/* FOP:REMOVEXATTR */
+
+int32_t
+glupy_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_REMOVEXATTR]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_removexattr_cbk_t)(priv->cbks[GLUPY_REMOVEXATTR]))(
+ frame, cookie, this, op_ret, op_errno, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int32_t
+glupy_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_REMOVEXATTR]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_removexattr_t)(priv->fops[GLUPY_REMOVEXATTR]))(
+ frame, this, loc, name, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_removexattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr, loc, name,
+ xdata);
+ return 0;
+}
+
+void
+wind_removexattr (call_frame_t *frame, xlator_t *xl, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_removexattr_cbk, xl, xl->fops->removexattr,
+ loc, name, xdata);
+}
+
+
+void
+unwind_removexattr (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata);
+
+}
+
+void
+set_removexattr_fop (long py_this, fop_removexattr_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_REMOVEXATTR] = (long)fop;
+}
+
+void
+set_removexattr_cbk (long py_this, fop_removexattr_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_REMOVEXATTR] = (long)cbk;
+}
+
+
+/* FOP:FREMOVEXATTR */
+
+int32_t
+glupy_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_FREMOVEXATTR]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_fremovexattr_cbk_t)(priv->cbks[GLUPY_FREMOVEXATTR]))(
+ frame, cookie, this, op_ret, op_errno, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int32_t
+glupy_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_FREMOVEXATTR]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_fremovexattr_t)(priv->fops[GLUPY_FREMOVEXATTR]))(
+ frame, this, fd, name, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_fremovexattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fremovexattr, fd, name,
+ xdata);
+ return 0;
+}
+
+void
+wind_fremovexattr (call_frame_t *frame, xlator_t *xl, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_fremovexattr_cbk, xl, xl->fops->fremovexattr,
+ fd, name, xdata);
+}
+
+
+void
+unwind_fremovexattr (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata);
+
+}
+
+void
+set_fremovexattr_fop (long py_this, fop_fremovexattr_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_FREMOVEXATTR] = (long)fop;
+}
+
+void
+set_fremovexattr_cbk (long py_this, fop_fremovexattr_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_FREMOVEXATTR] = (long)cbk;
+}
+
+
+/* FOP: LINK*/
+int32_t
+glupy_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_LINK]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_link_cbk_t)(priv->cbks[GLUPY_LINK]))(
+ frame, cookie, this, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+glupy_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_LINK]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_link_t)(priv->fops[GLUPY_LINK]))(
+ frame, this, oldloc, newloc, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_link_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->link, oldloc, newloc,
+ xdata);
+ return 0;
+}
+
+void
+wind_link (call_frame_t *frame, xlator_t *xl, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_link_cbk, xl, xl->fops->link,
+ oldloc, newloc, xdata);
+}
+
+void
+unwind_link (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
+}
+
+void
+set_link_fop (long py_this, fop_link_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_LINK] = (long)fop;
+}
+
+void
+set_link_cbk (long py_this, fop_link_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_LINK] = (long)cbk;
+}
+
+/* FOP: SYMLINK*/
+int32_t
+glupy_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_SYMLINK]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_symlink_cbk_t)(priv->cbks[GLUPY_SYMLINK]))(
+ frame, cookie, this, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+glupy_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
+ loc_t *loc, mode_t umask, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_SYMLINK]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_symlink_t)(priv->fops[GLUPY_SYMLINK]))(
+ frame, this, linkname, loc, umask, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_symlink_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->symlink, linkname, loc,
+ umask, xdata);
+ return 0;
+}
+
+void
+wind_symlink (call_frame_t *frame, xlator_t *xl, const char *linkname,
+ loc_t *loc, mode_t umask, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_symlink_cbk, xl, xl->fops->symlink,
+ linkname, loc, umask, xdata);
+}
+
+void
+unwind_symlink (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
+}
+
+void
+set_symlink_fop (long py_this, fop_symlink_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_SYMLINK] = (long)fop;
+}
+
+void
+set_symlink_cbk (long py_this, fop_symlink_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_SYMLINK] = (long)cbk;
+}
+
+
+/* FOP: READLINK */
+int32_t
+glupy_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, const char *path,
+ struct iatt *buf, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_READLINK]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_readlink_cbk_t)(priv->cbks[GLUPY_READLINK]))(
+ frame, cookie, this, op_ret, op_errno,
+ path, buf, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, path,
+ buf, xdata);
+ return 0;
+}
+
+int32_t
+glupy_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ size_t size, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_READLINK]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_readlink_t)(priv->fops[GLUPY_READLINK]))(
+ frame, this, loc, size, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_readlink_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readlink, loc,
+ size, xdata);
+ return 0;
+}
+
+void
+wind_readlink (call_frame_t *frame, xlator_t *xl, loc_t *loc,
+ size_t size, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_readlink_cbk, xl, xl->fops->readlink,
+ loc, size, xdata);
+}
+
+void
+unwind_readlink (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, const char *path,
+ struct iatt *buf, dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, path, buf,
+ xdata);
+}
+
+void
+set_readlink_fop (long py_this, fop_readlink_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_READLINK] = (long)fop;
+}
+
+void
+set_readlink_cbk (long py_this, fop_readlink_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_READLINK] = (long)cbk;
+}
+
+
+/* FOP: UNLINK */
+
+int32_t
+glupy_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_UNLINK]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_unlink_cbk_t)(priv->cbks[GLUPY_UNLINK]))(
+ frame, cookie, this, op_ret, op_errno,
+ preparent, postparent, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, preparent,
+ postparent, xdata);
+ return 0;
+}
+
+int32_t
+glupy_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int xflags, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_UNLINK]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_unlink_t)(priv->fops[GLUPY_UNLINK]))(
+ frame, this, loc, xflags, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_unlink_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->unlink, loc,
+ xflags, xdata);
+ return 0;
+}
+
+void
+wind_unlink (call_frame_t *frame, xlator_t *xl, loc_t *loc,
+ int xflags, dict_t *xdata)
+{
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_unlink_cbk, xl, xl->fops->unlink,
+ loc, xflags, xdata);
+}
+
+void
+unwind_unlink (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno,
+ preparent, postparent, xdata);
+}
+
+void
+set_unlink_fop (long py_this, fop_unlink_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_UNLINK] = (long)fop;
+}
+
+void
+set_unlink_cbk (long py_this, fop_unlink_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_UNLINK] = (long)cbk;
+}
+
+
+/* FOP: MKDIR */
+
+int32_t
+glupy_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_MKDIR]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_mkdir_cbk_t)(priv->cbks[GLUPY_MKDIR]))(
+ frame, cookie, this, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+}
+
+int32_t
+glupy_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_MKDIR]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_mkdir_t)(priv->fops[GLUPY_MKDIR]))(
+ frame, this, loc, mode, umask, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_mkdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mkdir, loc, mode, umask,
+ xdata);
+ return 0;
+}
+
+void
+wind_mkdir (call_frame_t *frame, xlator_t *xl, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
+{
+
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_mkdir_cbk, xl, xl->fops->mkdir,
+ loc, mode, umask, xdata);
+}
+
+void
+unwind_mkdir (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
+}
+
+void
+set_mkdir_fop (long py_this, fop_mkdir_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_MKDIR] = (long)fop;
+}
+
+void
+set_mkdir_cbk (long py_this, fop_mkdir_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_MKDIR] = (long)cbk;
+}
+
+
+/* FOP: RMDIR */
+
+int32_t
+glupy_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+
+ if (!priv->cbks[GLUPY_RMDIR]) {
+ goto unwind;
+ }
+
+ gstate = glupy_enter();
+ ret = ((fop_rmdir_cbk_t)(priv->cbks[GLUPY_RMDIR]))(
+ frame, cookie, this, op_ret, op_errno,
+ preparent, postparent, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+unwind:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, preparent,
+ postparent, xdata);
+ return 0;
+}
+
+int32_t
+glupy_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ int xflags, dict_t *xdata)
+{
+ glupy_private_t *priv = this->private;
+ PyGILState_STATE gstate;
+ int32_t ret;
+ static long next_id = 0;
+
+ if (!priv->fops[GLUPY_RMDIR]) {
+ goto wind;
+ }
+
+ gstate = glupy_enter();
+ frame->local = (void *)++next_id;
+ ret = ((fop_rmdir_t)(priv->fops[GLUPY_RMDIR]))(
+ frame, this, loc, xflags, xdata);
+ glupy_leave(gstate);
+
+ return ret;
+
+wind:
+ STACK_WIND (frame, glupy_rmdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rmdir, loc,
+ xflags, xdata);
+ return 0;
+}
+
+void
+wind_rmdir (call_frame_t *frame, xlator_t *xl, loc_t *loc,
+ int xflags, dict_t *xdata)
+{
+
+ xlator_t *this = THIS;
+
+ if (!xl || (xl == this)) {
+ xl = FIRST_CHILD(this);
+ }
+
+ STACK_WIND (frame, glupy_rmdir_cbk, xl, xl->fops->rmdir,
+ loc, xflags, xdata);
+}
+
+void
+unwind_rmdir (call_frame_t *frame, long cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno,
+ preparent, postparent, xdata);
+}
+
+void
+set_rmdir_fop (long py_this, fop_rmdir_t fop)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->fops[GLUPY_RMDIR] = (long)fop;
+}
+
+void
+set_rmdir_cbk (long py_this, fop_rmdir_cbk_t cbk)
+{
+ glupy_private_t *priv = ((xlator_t *)py_this)->private;
+
+ priv->cbks[GLUPY_RMDIR] = (long)cbk;
+}
+
+
+/* NON-FOP-SPECIFIC CODE */
+
+
+long
+get_id (call_frame_t *frame)
+{
+ return (long)(frame->local);
+}
+
+uint64_t
+get_rootunique (call_frame_t *frame)
+{
+ return frame->root->unique;
+}
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init (this, gf_glupy_mt_end);
+
+ if (ret != 0) {
+ gf_log(this->name, GF_LOG_ERROR, "Memory accounting init"
+ " failed");
+ return ret;
+ }
+
+ return ret;
+}
+
+int32_t
+init (xlator_t *this)
+{
+ glupy_private_t *priv = NULL;
+ char *module_name = NULL;
+ PyObject *py_mod_name = NULL;
+ PyObject *py_init_func = NULL;
+ PyObject *py_args = NULL;
+ PyObject *syspath = NULL;
+ PyObject *path = NULL;
+ static gf_boolean_t py_inited = _gf_false;
+ void * err_cleanup = &&err_return;
+
+ if (dict_get_str(this->options,"module-name",&module_name) != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "missing module-name");
+ return -1;
+ }
+
+ priv = GF_CALLOC (1, sizeof (glupy_private_t), gf_glupy_mt_priv);
+ if (!priv) {
+ goto *err_cleanup;
+ }
+ this->private = priv;
+ err_cleanup = &&err_free_priv;
+
+ if (!py_inited) {
+ Py_Initialize();
+ PyEval_InitThreads();
+#if 0
+ (void)pthread_key_create(&gil_init_key,NULL);
+ (void)pthread_setspecific(gil_init_key,(void *)1);
+#endif
+ /* PyEval_InitThreads takes this "for" us. No thanks. */
+ PyEval_ReleaseLock();
+ py_inited = _gf_true;
+ }
+
+ /* Adjust python's path */
+ syspath = PySys_GetObject("path");
+ path = PyString_FromString(GLUSTER_PYTHON_PATH);
+ PyList_Append(syspath, path);
+ Py_DECREF(path);
+
+ py_mod_name = PyString_FromString(module_name);
+ if (!py_mod_name) {
+ gf_log (this->name, GF_LOG_ERROR, "could not create name");
+ if (PyErr_Occurred()) {
+ PyErr_Print();
+ }
+ goto *err_cleanup;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG, "py_mod_name = %s", module_name);
+ priv->py_module = PyImport_Import(py_mod_name);
+ Py_DECREF(py_mod_name);
+ if (!priv->py_module) {
+ gf_log (this->name, GF_LOG_ERROR, "Python import failed");
+ if (PyErr_Occurred()) {
+ PyErr_Print();
+ }
+ goto *err_cleanup;
+ }
+ gf_log (this->name, GF_LOG_INFO, "Import of %s succeeded", module_name);
+ err_cleanup = &&err_deref_module;
+
+ py_init_func = PyObject_GetAttrString(priv->py_module, "xlator");
+ if (!py_init_func || !PyCallable_Check(py_init_func)) {
+ gf_log (this->name, GF_LOG_ERROR, "missing init func");
+ if (PyErr_Occurred()) {
+ PyErr_Print();
+ }
+ goto *err_cleanup;
+ }
+ err_cleanup = &&err_deref_init;
+
+ py_args = PyTuple_New(1);
+ if (!py_args) {
+ gf_log (this->name, GF_LOG_ERROR, "could not create args");
+ if (PyErr_Occurred()) {
+ PyErr_Print();
+ }
+ goto *err_cleanup;
+ }
+ PyTuple_SetItem(py_args,0,PyLong_FromLong((long)this));
+
+ /* TBD: pass in list of children */
+ priv->py_xlator = PyObject_CallObject(py_init_func, py_args);
+ Py_DECREF(py_args);
+ if (!priv->py_xlator) {
+ gf_log (this->name, GF_LOG_ERROR, "Python init failed");
+ if (PyErr_Occurred()) {
+ PyErr_Print();
+ }
+ goto *err_cleanup;
+ }
+ gf_log (this->name, GF_LOG_DEBUG, "init returned %p", priv->py_xlator);
+
+ return 0;
+
+err_deref_init:
+ Py_DECREF(py_init_func);
+err_deref_module:
+ Py_DECREF(priv->py_module);
+err_free_priv:
+ GF_FREE(priv);
+err_return:
+ return -1;
+}
+
+void
+fini (xlator_t *this)
+{
+ glupy_private_t *priv = this->private;
+
+ if (!priv)
+ return;
+ Py_DECREF(priv->py_xlator);
+ Py_DECREF(priv->py_module);
+ this->private = NULL;
+ GF_FREE (priv);
+
+ return;
+}
+
+struct xlator_fops fops = {
+ .lookup = glupy_lookup,
+ .create = glupy_create,
+ .open = glupy_open,
+ .readv = glupy_readv,
+ .writev = glupy_writev,
+ .opendir = glupy_opendir,
+ .readdir = glupy_readdir,
+ .stat = glupy_stat,
+ .fstat = glupy_fstat,
+ .setxattr = glupy_setxattr,
+ .getxattr = glupy_getxattr,
+ .fsetxattr = glupy_fsetxattr,
+ .fgetxattr = glupy_fgetxattr,
+ .removexattr = glupy_removexattr,
+ .fremovexattr = glupy_fremovexattr,
+ .link = glupy_link,
+ .unlink = glupy_unlink,
+ .readlink = glupy_readlink,
+ .symlink = glupy_symlink,
+ .mkdir = glupy_mkdir,
+ .rmdir = glupy_rmdir,
+ .statfs = glupy_statfs,
+ .readdirp = glupy_readdirp
+};
+
+struct xlator_cbks cbks = {
+};
+
+struct volume_options options[] = {
+ { .key = {NULL} },
+};
diff --git a/xlators/features/glupy/src/glupy.h b/xlators/features/glupy/src/glupy.h
new file mode 100644
index 000000000..8661fce88
--- /dev/null
+++ b/xlators/features/glupy/src/glupy.h
@@ -0,0 +1,69 @@
+/*
+ Copyright (c) 2006-2011 Gluster, Inc. <http://www.gluster.com>
+ This file is part of GlusterFS.
+
+ GlusterFS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3 of the License,
+ or (at your option) any later version.
+
+ GlusterFS is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see
+ <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __GLUPY_H__
+#define __GLUPY_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+#include "mem-types.h"
+
+enum {
+ GLUPY_LOOKUP = 0,
+ GLUPY_CREATE,
+ GLUPY_OPEN,
+ GLUPY_READV,
+ GLUPY_WRITEV,
+ GLUPY_OPENDIR,
+ GLUPY_READDIR,
+ GLUPY_READDIRP,
+ GLUPY_STAT,
+ GLUPY_FSTAT,
+ GLUPY_STATFS,
+ GLUPY_SETXATTR,
+ GLUPY_GETXATTR,
+ GLUPY_FSETXATTR,
+ GLUPY_FGETXATTR,
+ GLUPY_REMOVEXATTR,
+ GLUPY_FREMOVEXATTR,
+ GLUPY_LINK,
+ GLUPY_UNLINK,
+ GLUPY_READLINK,
+ GLUPY_SYMLINK,
+ GLUPY_MKNOD,
+ GLUPY_MKDIR,
+ GLUPY_RMDIR,
+ GLUPY_N_FUNCS
+};
+
+typedef struct {
+ PyObject *py_module;
+ PyObject *py_xlator;
+ long fops[GLUPY_N_FUNCS];
+ long cbks[GLUPY_N_FUNCS];
+} glupy_private_t;
+
+enum gf_glupy_mem_types_ {
+ gf_glupy_mt_priv = gf_common_mt_end + 1,
+ gf_glupy_mt_end
+};
+
+#endif /* __GLUPY_H__ */
diff --git a/xlators/features/glupy/src/glupy.py b/xlators/features/glupy/src/glupy.py
new file mode 100644
index 000000000..a5daa77d3
--- /dev/null
+++ b/xlators/features/glupy/src/glupy.py
@@ -0,0 +1,841 @@
+import sys
+from ctypes import *
+
+dl = CDLL("",RTLD_GLOBAL)
+
+
+class call_frame_t (Structure):
+ pass
+
+class dev_t (Structure):
+ pass
+
+
+class dict_t (Structure):
+ pass
+
+
+class gf_dirent_t (Structure):
+ pass
+
+
+class iobref_t (Structure):
+ pass
+
+
+class iovec_t (Structure):
+ pass
+
+
+class list_head (Structure):
+ pass
+
+list_head._fields_ = [
+ ("next", POINTER(list_head)),
+ ("prev", POINTER(list_head))
+ ]
+
+
+class rwxperm_t (Structure):
+ _fields_ = [
+ ("read", c_uint8, 1),
+ ("write", c_uint8, 1),
+ ("execn", c_uint8, 1)
+ ]
+
+
+class statvfs_t (Structure):
+ pass
+
+
+class xlator_t (Structure):
+ pass
+
+
+class ia_prot_t (Structure):
+ _fields_ = [
+ ("suid", c_uint8, 1),
+ ("sgid", c_uint8, 1),
+ ("sticky", c_uint8, 1),
+ ("owner", rwxperm_t),
+ ("group", rwxperm_t),
+ ("other", rwxperm_t)
+ ]
+
+# For checking file type.
+(IA_INVAL, IA_IFREG, IA_IFDIR, IA_IFLNK, IA_IFBLK, IA_IFCHR, IA_IFIFO,
+ IA_IFSOCK) = xrange(8)
+
+
+class iatt_t (Structure):
+ _fields_ = [
+ ("ia_no", c_uint64),
+ ("ia_gfid", c_ubyte * 16),
+ ("ia_dev", c_uint64),
+ ("ia_type", c_uint),
+ ("ia_prot", ia_prot_t),
+ ("ia_nlink", c_uint32),
+ ("ia_uid", c_uint32),
+ ("ia_gid", c_uint32),
+ ("ia_rdev", c_uint64),
+ ("ia_size", c_uint64),
+ ("ia_blksize", c_uint32),
+ ("ia_blocks", c_uint64),
+ ("ia_atime", c_uint32 ),
+ ("ia_atime_nsec", c_uint32),
+ ("ia_mtime", c_uint32),
+ ("ia_mtime_nsec", c_uint32),
+ ("ia_ctime", c_uint32),
+ ("ia_ctime_nsec", c_uint32)
+ ]
+
+
+class mem_pool (Structure):
+ _fields_ = [
+ ("list", list_head),
+ ("hot_count", c_int),
+ ("cold_count", c_int),
+ ("lock", c_void_p),
+ ("padded_sizeof_type", c_ulong),
+ ("pool", c_void_p),
+ ("pool_end", c_void_p),
+ ("real_sizeof_type", c_int),
+ ("alloc_count", c_uint64),
+ ("pool_misses", c_uint64),
+ ("max_alloc", c_int),
+ ("curr_stdalloc", c_int),
+ ("max_stdalloc", c_int),
+ ("name", c_char_p),
+ ("global_list", list_head)
+ ]
+
+
+class U_ctx_key_inode (Union):
+ _fields_ = [
+ ("key", c_uint64),
+ ("xl_key", POINTER(xlator_t))
+ ]
+
+
+class U_ctx_value1 (Union):
+ _fields_ = [
+ ("value1", c_uint64),
+ ("ptr1", c_void_p)
+ ]
+
+
+class U_ctx_value2 (Union):
+ _fields_ = [
+ ("value2", c_uint64),
+ ("ptr2", c_void_p)
+ ]
+
+class inode_ctx (Structure):
+ _anonymous_ = ("u_key","u_value1","u_value2",)
+ _fields_ = [
+ ("u_key", U_ctx_key_inode),
+ ("u_value1", U_ctx_value1),
+ ("u_value2", U_ctx_value2)
+ ]
+
+class inode_t (Structure):
+ pass
+
+class inode_table_t (Structure):
+ _fields_ = [
+ ("lock", c_void_p),
+ ("hashsize", c_size_t),
+ ("name", c_char_p),
+ ("root", POINTER(inode_t)),
+ ("xl", POINTER(xlator_t)),
+ ("lru_limit", c_uint32),
+ ("inode_hash", POINTER(list_head)),
+ ("name_hash", POINTER(list_head)),
+ ("active", list_head),
+ ("active_size", c_uint32),
+ ("lru", list_head),
+ ("lru_size", c_uint32),
+ ("purge", list_head),
+ ("purge_size", c_uint32),
+ ("inode_pool", POINTER(mem_pool)),
+ ("dentry_pool", POINTER(mem_pool)),
+ ("fd_mem_pool", POINTER(mem_pool))
+ ]
+
+inode_t._fields_ = [
+ ("table", POINTER(inode_table_t)),
+ ("gfid", c_ubyte * 16),
+ ("lock", c_void_p),
+ ("nlookup", c_uint64),
+ ("fd_count", c_uint32),
+ ("ref", c_uint32),
+ ("ia_type", c_uint),
+ ("fd_list", list_head),
+ ("dentry_list", list_head),
+ ("hashv", list_head),
+ ("listv", list_head),
+ ("ctx", POINTER(inode_ctx))
+ ]
+
+
+
+class U_ctx_key_fd (Union):
+ _fields_ = [
+ ("key", c_uint64),
+ ("xl_key", c_void_p)
+ ]
+
+class fd_lk_ctx (Structure):
+ _fields_ = [
+ ("lk_list", list_head),
+ ("ref", c_int),
+ ("lock", c_void_p)
+ ]
+
+class fd_ctx (Structure):
+ _anonymous_ = ("u_key","u_value1")
+ _fields_ = [
+ ("u_key", U_ctx_key_fd),
+ ("u_value1", U_ctx_value1)
+ ]
+
+class fd_t (Structure):
+ _fields_ = [
+ ("pid", c_uint64),
+ ("flags", c_int32),
+ ("refcount", c_int32),
+ ("inode_list", list_head),
+ ("inode", POINTER(inode_t)),
+ ("lock", c_void_p),
+ ("ctx", POINTER(fd_ctx)),
+ ("xl_count", c_int),
+ ("lk_ctx", POINTER(fd_lk_ctx)),
+ ("anonymous", c_uint)
+ ]
+
+class loc_t (Structure):
+ _fields_ = [
+ ("path", c_char_p),
+ ("name", c_char_p),
+ ("inode", POINTER(inode_t)),
+ ("parent", POINTER(inode_t)),
+ ("gfid", c_ubyte * 16),
+ ("pargfid", c_ubyte * 16),
+ ]
+
+
+
+def _init_op (a_class, fop, cbk, wind, unwind):
+ # Decorators, used by translators. We could pass the signatures as
+ # parameters, but it's actually kind of nice to keep them around for
+ # inspection.
+ a_class.fop_type = apply(CFUNCTYPE,a_class.fop_sig)
+ a_class.cbk_type = apply(CFUNCTYPE,a_class.cbk_sig)
+ # Dispatch-function registration.
+ fop.restype = None
+ fop.argtypes = [ c_long, a_class.fop_type ]
+ # Callback-function registration.
+ cbk.restype = None
+ cbk.argtypes = [ c_long, a_class.cbk_type ]
+ # STACK_WIND function.
+ wind.restype = None
+ wind.argtypes = list(a_class.fop_sig[1:])
+ # STACK_UNWIND function.
+ unwind.restype = None
+ unwind.argtypes = list(a_class.cbk_sig[1:])
+
+class OpLookup:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(loc_t), POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(inode_t), POINTER(iatt_t),
+ POINTER(dict_t), POINTER(iatt_t))
+_init_op (OpLookup, dl.set_lookup_fop, dl.set_lookup_cbk,
+ dl.wind_lookup, dl.unwind_lookup)
+
+class OpCreate:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(loc_t), c_int, c_uint, c_uint, POINTER(fd_t),
+ POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(fd_t), POINTER(inode_t),
+ POINTER(iatt_t), POINTER(iatt_t), POINTER(iatt_t),
+ POINTER(dict_t))
+_init_op (OpCreate, dl.set_create_fop, dl.set_create_cbk,
+ dl.wind_create, dl.unwind_create)
+
+class OpOpen:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(loc_t), c_int, POINTER(fd_t), POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(fd_t), POINTER(dict_t))
+_init_op (OpOpen, dl.set_open_fop, dl.set_open_cbk,
+ dl.wind_open, dl.unwind_open)
+
+class OpReadv:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(fd_t), c_size_t, c_long, c_uint32, POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(iovec_t), c_int, POINTER(iatt_t),
+ POINTER(iobref_t), POINTER(dict_t))
+_init_op (OpReadv, dl.set_readv_fop, dl.set_readv_cbk,
+ dl.wind_readv, dl.unwind_readv)
+class OpWritev:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(fd_t), POINTER(iovec_t), c_int, c_long, c_uint32,
+ POINTER(iobref_t), POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(iatt_t), POINTER(iatt_t),
+ POINTER(dict_t))
+_init_op (OpWritev, dl.set_writev_fop, dl.set_writev_cbk,
+ dl.wind_writev, dl.unwind_writev)
+
+class OpOpendir:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(loc_t), POINTER(fd_t) ,POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(fd_t), POINTER(dict_t))
+_init_op (OpOpendir, dl.set_opendir_fop, dl.set_opendir_cbk,
+ dl.wind_opendir, dl.unwind_opendir)
+
+class OpReaddir:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(fd_t), c_size_t, c_long, POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(gf_dirent_t), POINTER(dict_t))
+_init_op (OpReaddir, dl.set_readdir_fop, dl.set_readdir_cbk,
+ dl.wind_readdir, dl.unwind_readdir)
+
+class OpReaddirp:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(fd_t), c_size_t, c_long, POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(gf_dirent_t), POINTER(dict_t))
+_init_op (OpReaddirp, dl.set_readdirp_fop, dl.set_readdirp_cbk,
+ dl.wind_readdirp, dl.unwind_readdirp)
+
+class OpStat:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(loc_t), POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(iatt_t), POINTER(dict_t))
+_init_op (OpStat, dl.set_stat_fop, dl.set_stat_cbk,
+ dl.wind_stat, dl.unwind_stat)
+
+class OpFstat:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(fd_t), POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(iatt_t), POINTER(dict_t))
+_init_op (OpFstat, dl.set_fstat_fop, dl.set_fstat_cbk,
+ dl.wind_fstat, dl.unwind_fstat)
+
+class OpStatfs:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(loc_t), POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(statvfs_t), POINTER(dict_t))
+_init_op (OpStatfs, dl.set_statfs_fop, dl.set_statfs_cbk,
+ dl.wind_statfs, dl.unwind_statfs)
+
+
+class OpSetxattr:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(loc_t), POINTER(dict_t), c_int32,
+ POINTER (dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(dict_t))
+_init_op (OpSetxattr, dl.set_setxattr_fop, dl.set_setxattr_cbk,
+ dl.wind_setxattr, dl.unwind_setxattr)
+
+class OpGetxattr:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(loc_t), c_char_p, POINTER (dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(dict_t), POINTER(dict_t))
+_init_op (OpGetxattr, dl.set_getxattr_fop, dl.set_getxattr_cbk,
+ dl.wind_getxattr, dl.unwind_getxattr)
+
+class OpFsetxattr:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(fd_t), POINTER(dict_t), c_int32,
+ POINTER (dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(dict_t))
+_init_op (OpFsetxattr, dl.set_fsetxattr_fop, dl.set_fsetxattr_cbk,
+ dl.wind_fsetxattr, dl.unwind_fsetxattr)
+
+class OpFgetxattr:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(fd_t), c_char_p, POINTER (dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(dict_t), POINTER(dict_t))
+_init_op (OpFgetxattr, dl.set_fgetxattr_fop, dl.set_fgetxattr_cbk,
+ dl.wind_fgetxattr, dl.unwind_fgetxattr)
+
+class OpRemovexattr:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(loc_t), c_char_p, POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(dict_t))
+_init_op (OpRemovexattr, dl.set_removexattr_fop, dl.set_removexattr_cbk,
+ dl.wind_removexattr, dl.unwind_removexattr)
+
+
+class OpFremovexattr:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(fd_t), c_char_p, POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(dict_t))
+_init_op (OpFremovexattr, dl.set_fremovexattr_fop, dl.set_fremovexattr_cbk,
+ dl.wind_fremovexattr, dl.unwind_fremovexattr)
+
+class OpLink:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(loc_t), POINTER(loc_t), POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(inode_t), POINTER(iatt_t),
+ POINTER(iatt_t), POINTER(iatt_t), POINTER(dict_t))
+_init_op (OpLink, dl.set_link_fop, dl.set_link_cbk,
+ dl.wind_link, dl.unwind_link)
+
+class OpSymlink:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ c_char_p, POINTER(loc_t), c_uint, POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(inode_t), POINTER(iatt_t),
+ POINTER(iatt_t), POINTER(iatt_t), POINTER(dict_t))
+_init_op (OpSymlink, dl.set_symlink_fop, dl.set_symlink_cbk,
+ dl.wind_symlink, dl.unwind_symlink)
+
+class OpUnlink:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(loc_t), c_int, POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(iatt_t), POINTER(iatt_t),
+ POINTER(dict_t))
+_init_op (OpUnlink, dl.set_unlink_fop, dl.set_unlink_cbk,
+ dl.wind_unlink, dl.unwind_unlink)
+
+class OpReadlink:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(loc_t), c_size_t, POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, c_char_p, POINTER(iatt_t), POINTER(dict_t))
+_init_op (OpReadlink, dl.set_readlink_fop, dl.set_readlink_cbk,
+ dl.wind_readlink, dl.unwind_readlink)
+
+class OpMkdir:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(loc_t), c_uint, c_uint, POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(inode_t), POINTER(iatt_t),
+ POINTER(iatt_t), POINTER(iatt_t), POINTER(dict_t))
+_init_op (OpMkdir, dl.set_mkdir_fop, dl.set_mkdir_cbk,
+ dl.wind_mkdir, dl.unwind_mkdir)
+
+class OpRmdir:
+ fop_sig = (c_int, POINTER(call_frame_t), POINTER(xlator_t),
+ POINTER(loc_t), c_int, POINTER(dict_t))
+ cbk_sig = (c_int, POINTER(call_frame_t), c_long, POINTER(xlator_t),
+ c_int, c_int, POINTER(iatt_t), POINTER(iatt_t),
+ POINTER(dict_t))
+_init_op (OpRmdir, dl.set_rmdir_fop, dl.set_rmdir_cbk,
+ dl.wind_rmdir, dl.unwind_rmdir)
+
+
+class Translator:
+ def __init__ (self, c_this):
+ # This is only here to keep references to the stubs we create,
+ # because ctypes doesn't and glupy.so can't because it doesn't
+ # get a pointer to the actual Python object. It's a dictionary
+ # instead of a list in case we ever allow changing fops/cbks
+ # after initialization and need to look them up.
+ self.stub_refs = {}
+ funcs = dir(self.__class__)
+ if "lookup_fop" in funcs:
+ @OpLookup.fop_type
+ def stub (frame, this, loc, xdata, s=self):
+ return s.lookup_fop (frame, this, loc, xdata)
+ self.stub_refs["lookup_fop"] = stub
+ dl.set_lookup_fop(c_this,stub)
+ if "lookup_cbk" in funcs:
+ @OpLookup.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno, inode,
+ buf, xdata, postparent, s=self):
+ return s.lookup_cbk(frame, cookie, this, op_ret,
+ op_errno, inode, buf, xdata,
+ postparent)
+ self.stub_refs["lookup_cbk"] = stub
+ dl.set_lookup_cbk(c_this,stub)
+ if "create_fop" in funcs:
+ @OpCreate.fop_type
+ def stub (frame, this, loc, flags, mode, umask, fd,
+ xdata, s=self):
+ return s.create_fop (frame, this, loc, flags,
+ mode, umask, fd, xdata)
+ self.stub_refs["create_fop"] = stub
+ dl.set_create_fop(c_this,stub)
+ if "create_cbk" in funcs:
+ @OpCreate.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno, fd,
+ inode, buf, preparent, postparent, xdata,
+ s=self):
+ return s.create_cbk (frame, cookie, this,
+ op_ret, op_errno, fd,
+ inode, buf, preparent,
+ postparent, xdata)
+ self.stub_refs["create_cbk"] = stub
+ dl.set_create_cbk(c_this,stub)
+ if "open_fop" in funcs:
+ @OpOpen.fop_type
+ def stub (frame, this, loc, flags, fd,
+ xdata, s=self):
+ return s.open_fop (frame, this, loc, flags,
+ fd, xdata)
+ self.stub_refs["open_fop"] = stub
+ dl.set_open_fop(c_this,stub)
+ if "open_cbk" in funcs:
+ @OpOpen.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno, fd,
+ xdata, s=self):
+ return s.open_cbk (frame, cookie, this,
+ op_ret, op_errno, fd,
+ xdata)
+ self.stub_refs["open_cbk"] = stub
+ dl.set_open_cbk(c_this,stub)
+ if "readv_fop" in funcs:
+ @OpReadv.fop_type
+ def stub (frame, this, fd, size, offset, flags,
+ xdata, s=self):
+ return s.readv_fop (frame, this, fd, size,
+ offset, flags, xdata)
+ self.stub_refs["readv_fop"] = stub
+ dl.set_readv_fop(c_this,stub)
+ if "readv_cbk" in funcs:
+ @OpReadv.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno,
+ vector, count, stbuf, iobref, xdata,
+ s=self):
+ return s.readv_cbk (frame, cookie, this,
+ op_ret, op_errno, vector,
+ count, stbuf, iobref,
+ xdata)
+ self.stub_refs["readv_cbk"] = stub
+ dl.set_readv_cbk(c_this,stub)
+ if "writev_fop" in funcs:
+ @OpWritev.fop_type
+ def stub (frame, this, fd, vector, count,
+ offset, flags, iobref, xdata, s=self):
+ return s.writev_fop (frame, this, fd, vector,
+ count, offset, flags,
+ iobref, xdata)
+ self.stub_refs["writev_fop"] = stub
+ dl.set_writev_fop(c_this,stub)
+ if "writev_cbk" in funcs:
+ @OpWritev.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno,
+ prebuf, postbuf, xdata, s=self):
+ return s.writev_cbk (frame, cookie, this,
+ op_ret, op_errno, prebuf,
+ postbuf, xdata)
+ self.stub_refs["writev_cbk"] = stub
+ dl.set_writev_cbk(c_this,stub)
+ if "opendir_fop" in funcs:
+ @OpOpendir.fop_type
+ def stub (frame, this, loc, fd, xdata, s=self):
+ return s.opendir_fop (frame, this, loc, fd,
+ xdata)
+ self.stub_refs["opendir_fop"] = stub
+ dl.set_opendir_fop(c_this,stub)
+ if "opendir_cbk" in funcs:
+ @OpOpendir.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno, fd,
+ xdata, s=self):
+ return s.opendir_cbk(frame, cookie, this,
+ op_ret, op_errno, fd,
+ xdata)
+ self.stub_refs["opendir_cbk"] = stub
+ dl.set_opendir_cbk(c_this,stub)
+ if "readdir_fop" in funcs:
+ @OpReaddir.fop_type
+ def stub (frame, this, fd, size, offset, xdata, s=self):
+ return s.readdir_fop (frame, this, fd, size,
+ offset, xdata)
+ self.stub_refs["readdir_fop"] = stub
+ dl.set_readdir_fop(c_this,stub)
+ if "readdir_cbk" in funcs:
+ @OpReaddir.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno,
+ entries, xdata, s=self):
+ return s.readdir_cbk(frame, cookie, this,
+ op_ret, op_errno, entries,
+ xdata)
+ self.stub_refs["readdir_cbk"] = stub
+ dl.set_readdir_cbk(c_this,stub)
+ if "readdirp_fop" in funcs:
+ @OpReaddirp.fop_type
+ def stub (frame, this, fd, size, offset, xdata, s=self):
+ return s.readdirp_fop (frame, this, fd, size,
+ offset, xdata)
+ self.stub_refs["readdirp_fop"] = stub
+ dl.set_readdirp_fop(c_this,stub)
+ if "readdirp_cbk" in funcs:
+ @OpReaddirp.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno,
+ entries, xdata, s=self):
+ return s.readdirp_cbk (frame, cookie, this,
+ op_ret, op_errno,
+ entries, xdata)
+ self.stub_refs["readdirp_cbk"] = stub
+ dl.set_readdirp_cbk(c_this,stub)
+ if "stat_fop" in funcs:
+ @OpStat.fop_type
+ def stub (frame, this, loc, xdata, s=self):
+ return s.stat_fop (frame, this, loc, xdata)
+ self.stub_refs["stat_fop"] = stub
+ dl.set_stat_fop(c_this,stub)
+ if "stat_cbk" in funcs:
+ @OpStat.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno, buf,
+ xdata, s=self):
+ return s.stat_cbk(frame, cookie, this, op_ret,
+ op_errno, buf, xdata)
+ self.stub_refs["stat_cbk"] = stub
+ dl.set_stat_cbk(c_this,stub)
+ if "fstat_fop" in funcs:
+ @OpFstat.fop_type
+ def stub (frame, this, fd, xdata, s=self):
+ return s.fstat_fop (frame, this, fd, xdata)
+ self.stub_refs["fstat_fop"] = stub
+ dl.set_fstat_fop(c_this,stub)
+ if "fstat_cbk" in funcs:
+ @OpFstat.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno, buf,
+ xdata, s=self):
+ return s.fstat_cbk(frame, cookie, this, op_ret,
+ op_errno, buf, xdata)
+ self.stub_refs["fstat_cbk"] = stub
+ dl.set_fstat_cbk(c_this,stub)
+ if "statfs_fop" in funcs:
+ @OpStatfs.fop_type
+ def stub (frame, this, loc, xdata, s=self):
+ return s.statfs_fop (frame, this, loc, xdata)
+ self.stub_refs["statfs_fop"] = stub
+ dl.set_statfs_fop(c_this,stub)
+ if "statfs_cbk" in funcs:
+ @OpStatfs.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno, buf,
+ xdata, s=self):
+ return s.statfs_cbk (frame, cookie, this,
+ op_ret, op_errno, buf,
+ xdata)
+ self.stub_refs["statfs_cbk"] = stub
+ dl.set_statfs_cbk(c_this,stub)
+ if "setxattr_fop" in funcs:
+ @OpSetxattr.fop_type
+ def stub (frame, this, loc, dictionary, flags, xdata,
+ s=self):
+ return s.setxattr_fop (frame, this, loc,
+ dictionary, flags,
+ xdata)
+ self.stub_refs["setxattr_fop"] = stub
+ dl.set_setxattr_fop(c_this,stub)
+ if "setxattr_cbk" in funcs:
+ @OpSetxattr.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno, xdata,
+ s=self):
+ return s.setxattr_cbk(frame, cookie, this,
+ op_ret, op_errno, xdata)
+ self.stub_refs["setxattr_cbk"] = stub
+ dl.set_setxattr_cbk(c_this,stub)
+ if "getxattr_fop" in funcs:
+ @OpGetxattr.fop_type
+ def stub (frame, this, loc, name, xdata, s=self):
+ return s.getxattr_fop (frame, this, loc, name,
+ xdata)
+ self.stub_refs["getxattr_fop"] = stub
+ dl.set_getxattr_fop(c_this,stub)
+ if "getxattr_cbk" in funcs:
+ @OpGetxattr.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno,
+ dictionary, xdata, s=self):
+ return s.getxattr_cbk(frame, cookie, this,
+ op_ret, op_errno,
+ dictionary, xdata)
+ self.stub_refs["getxattr_cbk"] = stub
+ dl.set_getxattr_cbk(c_this,stub)
+ if "fsetxattr_fop" in funcs:
+ @OpFsetxattr.fop_type
+ def stub (frame, this, fd, dictionary, flags, xdata,
+ s=self):
+ return s.fsetxattr_fop (frame, this, fd,
+ dictionary, flags,
+ xdata)
+ self.stub_refs["fsetxattr_fop"] = stub
+ dl.set_fsetxattr_fop(c_this,stub)
+ if "fsetxattr_cbk" in funcs:
+ @OpFsetxattr.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno, xdata,
+ s=self):
+ return s.fsetxattr_cbk(frame, cookie, this,
+ op_ret, op_errno, xdata)
+ self.stub_refs["fsetxattr_cbk"] = stub
+ dl.set_fsetxattr_cbk(c_this,stub)
+ if "fgetxattr_fop" in funcs:
+ @OpFgetxattr.fop_type
+ def stub (frame, this, fd, name, xdata, s=self):
+ return s.fgetxattr_fop (frame, this, fd, name,
+ xdata)
+ self.stub_refs["fgetxattr_fop"] = stub
+ dl.set_fgetxattr_fop(c_this,stub)
+ if "fgetxattr_cbk" in funcs:
+ @OpFgetxattr.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno,
+ dictionary, xdata, s=self):
+ return s.fgetxattr_cbk(frame, cookie, this,
+ op_ret, op_errno,
+ dictionary, xdata)
+ self.stub_refs["fgetxattr_cbk"] = stub
+ dl.set_fgetxattr_cbk(c_this,stub)
+ if "removexattr_fop" in funcs:
+ @OpRemovexattr.fop_type
+ def stub (frame, this, loc, name, xdata, s=self):
+ return s.removexattr_fop (frame, this, loc,
+ name, xdata)
+ self.stub_refs["removexattr_fop"] = stub
+ dl.set_removexattr_fop(c_this,stub)
+ if "removexattr_cbk" in funcs:
+ @OpRemovexattr.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno,
+ xdata, s=self):
+ return s.removexattr_cbk(frame, cookie, this,
+ op_ret, op_errno,
+ xdata)
+ self.stub_refs["removexattr_cbk"] = stub
+ dl.set_removexattr_cbk(c_this,stub)
+ if "fremovexattr_fop" in funcs:
+ @OpFremovexattr.fop_type
+ def stub (frame, this, fd, name, xdata, s=self):
+ return s.fremovexattr_fop (frame, this, fd,
+ name, xdata)
+ self.stub_refs["fremovexattr_fop"] = stub
+ dl.set_fremovexattr_fop(c_this,stub)
+ if "fremovexattr_cbk" in funcs:
+ @OpFremovexattr.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno,
+ xdata, s=self):
+ return s.fremovexattr_cbk(frame, cookie, this,
+ op_ret, op_errno,
+ xdata)
+ self.stub_refs["fremovexattr_cbk"] = stub
+ dl.set_fremovexattr_cbk(c_this,stub)
+ if "link_fop" in funcs:
+ @OpLink.fop_type
+ def stub (frame, this, oldloc, newloc,
+ xdata, s=self):
+ return s.link_fop (frame, this, oldloc,
+ newloc, xdata)
+ self.stub_refs["link_fop"] = stub
+ dl.set_link_fop(c_this,stub)
+ if "link_cbk" in funcs:
+ @OpLink.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata,
+ s=self):
+ return s.link_cbk (frame, cookie, this,
+ op_ret, op_errno, inode,
+ buf, preparent,
+ postparent, xdata)
+ self.stub_refs["link_cbk"] = stub
+ dl.set_link_cbk(c_this,stub)
+ if "symlink_fop" in funcs:
+ @OpSymlink.fop_type
+ def stub (frame, this, linkname, loc,
+ umask, xdata, s=self):
+ return s.symlink_fop (frame, this, linkname,
+ loc, umask, xdata)
+ self.stub_refs["symlink_fop"] = stub
+ dl.set_symlink_fop(c_this,stub)
+ if "symlink_cbk" in funcs:
+ @OpSymlink.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno,
+ inode, buf, preparent, postparent, xdata,
+ s=self):
+ return s.symlink_cbk (frame, cookie, this,
+ op_ret, op_errno, inode,
+ buf, preparent,
+ postparent, xdata)
+ self.stub_refs["symlink_cbk"] = stub
+ dl.set_symlink_cbk(c_this,stub)
+ if "unlink_fop" in funcs:
+ @OpUnlink.fop_type
+ def stub (frame, this, loc, xflags,
+ xdata, s=self):
+ return s.unlink_fop (frame, this, loc,
+ xflags, xdata)
+ self.stub_refs["unlink_fop"] = stub
+ dl.set_unlink_fop(c_this,stub)
+ if "unlink_cbk" in funcs:
+ @OpUnlink.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno,
+ preparent, postparent, xdata, s=self):
+ return s.unlink_cbk (frame, cookie, this,
+ op_ret, op_errno,
+ preparent, postparent,
+ xdata)
+ self.stub_refs["unlink_cbk"] = stub
+ dl.set_unlink_cbk(c_this,stub)
+ if "readlink_fop" in funcs:
+ @OpReadlink.fop_type
+ def stub (frame, this, loc, size,
+ xdata, s=self):
+ return s.readlink_fop (frame, this, loc,
+ size, xdata)
+ self.stub_refs["readlink_fop"] = stub
+ dl.set_readlink_fop(c_this,stub)
+ if "readlink_cbk" in funcs:
+ @OpReadlink.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno,
+ path, buf, xdata, s=self):
+ return s.readlink_cbk (frame, cookie, this,
+ op_ret, op_errno,
+ path, buf, xdata)
+ self.stub_refs["readlink_cbk"] = stub
+ dl.set_readlink_cbk(c_this,stub)
+ if "mkdir_fop" in funcs:
+ @OpMkdir.fop_type
+ def stub (frame, this, loc, mode, umask, xdata,
+ s=self):
+ return s.mkdir_fop (frame, this, loc, mode,
+ umask, xdata)
+ self.stub_refs["mkdir_fop"] = stub
+ dl.set_mkdir_fop(c_this,stub)
+ if "mkdir_cbk" in funcs:
+ @OpMkdir.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno, inode,
+ buf, preparent, postparent, xdata, s=self):
+ return s.mkdir_cbk (frame, cookie, this,
+ op_ret, op_errno, inode,
+ buf, preparent,
+ postparent, xdata)
+ self.stub_refs["mkdir_cbk"] = stub
+ dl.set_mkdir_cbk(c_this,stub)
+ if "rmdir_fop" in funcs:
+ @OpRmdir.fop_type
+ def stub (frame, this, loc, xflags,
+ xdata, s=self):
+ return s.rmdir_fop (frame, this, loc,
+ xflags, xdata)
+ self.stub_refs["rmdir_fop"] = stub
+ dl.set_rmdir_fop(c_this,stub)
+ if "rmdir_cbk" in funcs:
+ @OpRmdir.cbk_type
+ def stub (frame, cookie, this, op_ret, op_errno,
+ preparent, postparent, xdata, s=self):
+ return s.rmdir_cbk (frame, cookie, this,
+ op_ret, op_errno,
+ preparent, postparent,
+ xdata)
+ self.stub_refs["rmdir_cbk"] = stub
+ dl.set_rmdir_cbk(c_this,stub)
diff --git a/xlators/features/glupy/src/setup.py.in b/xlators/features/glupy/src/setup.py.in
new file mode 100644
index 000000000..b9ee02c2b
--- /dev/null
+++ b/xlators/features/glupy/src/setup.py.in
@@ -0,0 +1,24 @@
+from distutils.core import setup
+
+DESC = """GlusterFS is a clustered file-system capable of scaling to
+several petabytes. It aggregates various storage bricks over Infiniband
+RDMA or TCP/IP interconnect into one large parallel network file system.
+GlusterFS is one of the most sophisticated file systems in terms of
+features and extensibility. It borrows a powerful concept called
+Translators from GNU Hurd kernel. Much of the code in GlusterFS is in
+user space and easily manageable.
+
+This package contains Glupy, the Python translator interface for GlusterFS."""
+
+setup(
+ name='glusterfs-glupy',
+ version='@PACKAGE_VERSION@',
+ description='Glupy is the Python translator interface for GlusterFS',
+ long_description=DESC,
+ author='Gluster Community',
+ author_email='gluster-devel@gluster.org',
+ license='LGPLv3',
+ url='http://gluster.org/',
+ package_dir={'gluster':''},
+ packages=['gluster']
+)
diff --git a/xlators/features/index/src/Makefile.am b/xlators/features/index/src/Makefile.am
index a550c1f57..73bb8972e 100644
--- a/xlators/features/index/src/Makefile.am
+++ b/xlators/features/index/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = index.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
-index_la_LDFLAGS = -module -avoidversion
+index_la_LDFLAGS = -module -avoid-version
index_la_SOURCES = index.c
index_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/features/index/src/index.c b/xlators/features/index/src/index.c
index 24efcbb3a..4b2eb2e29 100644
--- a/xlators/features/index/src/index.c
+++ b/xlators/features/index/src/index.c
@@ -15,6 +15,7 @@
#include "index.h"
#include "options.h"
#include "glusterfs3-xdr.h"
+#include "syscall.h"
#define XATTROP_SUBDIR "xattrop"
@@ -361,7 +362,7 @@ index_add (xlator_t *this, uuid_t gfid, const char *subdir)
index_get_index (priv, index);
make_index_path (priv->index_basepath, subdir,
index, index_path, sizeof (index_path));
- ret = link (index_path, gfid_path);
+ ret = sys_link (index_path, gfid_path);
if (!ret || (errno == EEXIST)) {
ret = 0;
goto out;
@@ -392,7 +393,7 @@ index_add (xlator_t *this, uuid_t gfid, const char *subdir)
if (fd >= 0)
close (fd);
- ret = link (index_path, gfid_path);
+ ret = sys_link (index_path, gfid_path);
if (ret && (errno != EEXIST)) {
gf_log (this->name, GF_LOG_ERROR, "%s: Not able to "
"add to index (%s)", uuid_utoa (gfid),
@@ -431,29 +432,27 @@ out:
return ret;
}
+static int
+_check_key_is_zero_filled (dict_t *d, char *k, data_t *v,
+ void *tmp)
+{
+ if (mem_0filled ((const char*)v->data, v->len)) {
+ /* -1 means, no more iterations, treat as 'break' */
+ return -1;
+ }
+ return 0;
+}
+
void
-_xattrop_index_action (xlator_t *this, inode_t *inode, dict_t *xattr)
+_index_action (xlator_t *this, inode_t *inode, gf_boolean_t zero_xattr)
{
- gf_boolean_t zero_xattr = _gf_true;
+ int ret = 0;
index_inode_ctx_t *ctx = NULL;
- int ret = 0;
-
- int _check_key_is_zero_filled (dict_t *d, char *k, data_t *v,
- void *tmp)
- {
- if (mem_0filled ((const char*)v->data, v->len)) {
- zero_xattr = _gf_false;
- /* -1 means, no more iterations, treat as 'break' */
- return -1;
- }
- return 0;
- }
- dict_foreach (xattr, _check_key_is_zero_filled, NULL);
ret = index_inode_ctx_get (inode, this, &ctx);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Not able to %s %s -> index",
- zero_xattr?"add":"del", uuid_utoa (inode->gfid));
+ zero_xattr?"del":"add", uuid_utoa (inode->gfid));
goto out;
}
if (zero_xattr) {
@@ -474,6 +473,19 @@ out:
}
void
+_xattrop_index_action (xlator_t *this, inode_t *inode, dict_t *xattr)
+{
+ gf_boolean_t zero_xattr = _gf_true;
+ int ret = 0;
+
+ ret = dict_foreach (xattr, _check_key_is_zero_filled, NULL);
+ if (ret == -1)
+ zero_xattr = _gf_false;
+ _index_action (this, inode, zero_xattr);
+ return;
+}
+
+void
fop_xattrop_index_action (xlator_t *this, inode_t *inode, dict_t *xattr)
{
_xattrop_index_action (this, inode, xattr);
@@ -485,13 +497,13 @@ fop_fxattrop_index_action (xlator_t *this, inode_t *inode, dict_t *xattr)
_xattrop_index_action (this, inode, xattr);
}
-inline gf_boolean_t
+static inline gf_boolean_t
index_xattrop_track (loc_t *loc, gf_xattrop_flags_t flags, dict_t *dict)
{
return (flags == GF_XATTROP_ADD_ARRAY);
}
-inline gf_boolean_t
+static inline gf_boolean_t
index_fxattrop_track (fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict)
{
return (flags == GF_XATTROP_ADD_ARRAY);
@@ -651,6 +663,11 @@ int
index_xattrop_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
{
+ //In wind phase bring the gfid into index. This way if the brick crashes
+ //just after posix performs xattrop before _cbk reaches index xlator
+ //we will still have the gfid in index.
+ _index_action (this, frame->local, _gf_false);
+
STACK_WIND (frame, index_xattrop_cbk, FIRST_CHILD (this),
FIRST_CHILD (this)->fops->xattrop, loc, optype, xattr,
xdata);
@@ -661,6 +678,10 @@ int
index_fxattrop_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
{
+ //In wind phase bring the gfid into index. This way if the brick crashes
+ //just after posix performs xattrop before _cbk reaches index xlator
+ //we will still have the gfid in index.
+ _index_action (this, frame->local, _gf_false);
STACK_WIND (frame, index_fxattrop_cbk, FIRST_CHILD (this),
FIRST_CHILD (this)->fops->fxattrop, fd, optype, xattr,
xdata);
@@ -717,6 +738,41 @@ out:
return 0;
}
+uint64_t
+index_entry_count (xlator_t *this, char *subdir)
+{
+ index_priv_t *priv = NULL;
+ char index_dir[PATH_MAX];
+ DIR *dirp = NULL;
+ uint64_t count = 0;
+ struct dirent buf;
+ struct dirent *entry = NULL;
+
+ priv = this->private;
+
+ make_index_dir_path (priv->index_basepath, subdir,
+ index_dir, sizeof (index_dir));
+
+ dirp = opendir (index_dir);
+ if (!dirp)
+ return 0;
+
+ while (readdir_r (dirp, &buf, &entry) == 0) {
+ if (!entry)
+ break;
+ if (!strcmp (entry->d_name, ".") ||
+ !strcmp (entry->d_name, ".."))
+ continue;
+ if (!strncmp (entry->d_name, subdir, strlen (subdir)))
+ continue;
+ count++;
+ }
+ closedir (dirp);
+
+ return count;
+}
+
+
int32_t
index_getxattr_wrapper (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name, dict_t *xdata)
@@ -724,6 +780,7 @@ index_getxattr_wrapper (call_frame_t *frame, xlator_t *this,
index_priv_t *priv = NULL;
dict_t *xattr = NULL;
int ret = 0;
+ uint64_t count = 0;
priv = this->private;
@@ -733,14 +790,26 @@ index_getxattr_wrapper (call_frame_t *frame, xlator_t *this,
goto done;
}
- ret = dict_set_static_bin (xattr, (char*)name, priv->xattrop_vgfid,
- sizeof (priv->xattrop_vgfid));
- if (ret) {
- ret = -ENOMEM;
- gf_log (THIS->name, GF_LOG_ERROR, "xattrop index "
- "gfid set failed");
- goto done;
- }
+ if (strcmp (name, GF_XATTROP_INDEX_GFID) == 0) {
+ ret = dict_set_static_bin (xattr, (char*)name, priv->xattrop_vgfid,
+ sizeof (priv->xattrop_vgfid));
+ if (ret) {
+ ret = -ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR, "xattrop index "
+ "gfid set failed");
+ goto done;
+ }
+ } else if (strcmp (name, GF_XATTROP_INDEX_COUNT) == 0) {
+ count = index_entry_count (this, XATTROP_SUBDIR);
+
+ ret = dict_set_uint64 (xattr, (char *)name, count);
+ if (ret) {
+ ret = -ENOMEM;
+ gf_log (this->name, GF_LOG_ERROR, "xattrop index "
+ "count set failed");
+ goto done;
+ }
+ }
done:
if (ret)
STACK_UNWIND_STRICT (getxattr, frame, -1, -ret, xattr, xdata);
@@ -910,8 +979,12 @@ index_getxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name, dict_t *xdata)
{
call_stub_t *stub = NULL;
+ index_priv_t *priv = NULL;
- if (!name || strcmp (GF_XATTROP_INDEX_GFID, name))
+ priv = this->private;
+
+ if (!name || (strcmp (GF_XATTROP_INDEX_GFID, name) &&
+ strcmp (GF_XATTROP_INDEX_COUNT, name)))
goto out;
stub = fop_getxattr_stub (frame, index_getxattr_wrapper, loc, name,
@@ -1074,12 +1147,18 @@ init (xlator_t *this)
INIT_LIST_HEAD (&priv->callstubs);
this->private = priv;
- ret = pthread_create (&thread, &w_attr, index_worker, this);
+
+ ret = index_dir_create (this, XATTROP_SUBDIR);
+ if (ret < 0)
+ goto out;
+
+ ret = gf_thread_create (&thread, &w_attr, index_worker, this);
if (ret) {
gf_log (this->name, GF_LOG_WARNING, "Failed to create "
"worker thread, aborting");
goto out;
}
+
ret = 0;
out:
if (ret) {
@@ -1179,8 +1258,7 @@ struct xlator_fops fops = {
.unlink = index_unlink
};
-struct xlator_dumpops dumpops = {
-};
+struct xlator_dumpops dumpops;
struct xlator_cbks cbks = {
.forget = index_forget,
diff --git a/xlators/features/locks/src/Makefile.am b/xlators/features/locks/src/Makefile.am
index 0fdaaf69f..0f79731b4 100644
--- a/xlators/features/locks/src/Makefile.am
+++ b/xlators/features/locks/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = locks.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
-locks_la_LDFLAGS = -module -avoidversion
+locks_la_LDFLAGS = -module -avoid-version
locks_la_SOURCES = common.c posix.c entrylk.c inodelk.c reservelk.c \
clear.c
@@ -11,6 +11,7 @@ noinst_HEADERS = locks.h common.h locks-mem-types.h clear.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
AM_CFLAGS = -Wall -fno-strict-aliasing $(GF_CFLAGS)
CLEANFILES =
diff --git a/xlators/features/locks/src/clear.c b/xlators/features/locks/src/clear.c
index 5790a99ce..75593b898 100644
--- a/xlators/features/locks/src/clear.c
+++ b/xlators/features/locks/src/clear.c
@@ -338,8 +338,8 @@ blkd:
elock->basename, ENTRYLK_LOCK, elock->type,
-1, EAGAIN);
STACK_UNWIND_STRICT (entrylk, elock->frame, -1, EAGAIN, NULL);
- GF_FREE ((char *) elock->basename);
- GF_FREE (elock);
+
+ __pl_entrylk_unref (elock);
}
if (!(args->kind & CLRLK_GRANTED)) {
@@ -362,13 +362,13 @@ granted:
gcount++;
list_del_init (&elock->domain_list);
list_add_tail (&elock->domain_list, &removed);
+
+ __pl_entrylk_unref (elock);
}
}
pthread_mutex_unlock (&pl_inode->mutex);
- list_for_each_entry_safe (elock, tmp, &removed, domain_list) {
- grant_blocked_entry_locks (this, pl_inode, elock, dom);
- }
+ grant_blocked_entry_locks (this, pl_inode, dom);
ret = 0;
out:
@@ -379,8 +379,8 @@ out:
int
clrlk_clear_lks_in_all_domains (xlator_t *this, pl_inode_t *pl_inode,
- clrlk_args *args, int *blkd, int *granted,
- int *op_errno)
+ clrlk_args *args, int *blkd, int *granted,
+ int *op_errno)
{
pl_dom_list_t *dom = NULL;
int ret = -1;
diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c
index 5eeb9cd15..f6c71c1cf 100644
--- a/xlators/features/locks/src/common.c
+++ b/xlators/features/locks/src/common.c
@@ -35,6 +35,7 @@ __insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock);
static int
pl_send_prelock_unlock (xlator_t *this, pl_inode_t *pl_inode,
posix_lock_t *old_lock);
+
static pl_dom_list_t *
__allocate_domain (const char *volume)
{
@@ -75,8 +76,8 @@ get_domain (pl_inode_t *pl_inode, const char *volume)
{
pl_dom_list_t *dom = NULL;
- GF_VALIDATE_OR_GOTO (POSIX_LOCKS, pl_inode, out);
- GF_VALIDATE_OR_GOTO (POSIX_LOCKS, volume, out);
+ GF_VALIDATE_OR_GOTO ("posix-locks", pl_inode, out);
+ GF_VALIDATE_OR_GOTO ("posix-locks", volume, out);
pthread_mutex_lock (&pl_inode->mutex);
{
@@ -92,9 +93,9 @@ get_domain (pl_inode_t *pl_inode, const char *volume)
unlock:
pthread_mutex_unlock (&pl_inode->mutex);
if (dom) {
- gf_log (POSIX_LOCKS, GF_LOG_TRACE, "Domain %s found", volume);
+ gf_log ("posix-locks", GF_LOG_TRACE, "Domain %s found", volume);
} else {
- gf_log (POSIX_LOCKS, GF_LOG_TRACE, "Domain %s not found", volume);
+ gf_log ("posix-locks", GF_LOG_TRACE, "Domain %s not found", volume);
}
out:
return dom;
@@ -135,10 +136,10 @@ __pl_inode_is_empty (pl_inode_t *pl_inode)
void
pl_print_locker (char *str, int size, xlator_t *this, call_frame_t *frame)
{
- snprintf (str, size, "Pid=%llu, lk-owner=%s, Transport=%p, Frame=%llu",
+ snprintf (str, size, "Pid=%llu, lk-owner=%s, Client=%p, Frame=%llu",
(unsigned long long) frame->root->pid,
lkowner_utoa (&frame->root->lk_owner),
- (void *)frame->root->trans,
+ frame->root->client,
(unsigned long long) frame->root->unique);
}
@@ -462,14 +463,14 @@ unlock:
/* Create a new posix_lock_t */
posix_lock_t *
-new_posix_lock (struct gf_flock *flock, void *transport, pid_t client_pid,
+new_posix_lock (struct gf_flock *flock, client_t *client, pid_t client_pid,
gf_lkowner_t *owner, fd_t *fd)
{
posix_lock_t *lock = NULL;
- GF_VALIDATE_OR_GOTO (POSIX_LOCKS, flock, out);
- GF_VALIDATE_OR_GOTO (POSIX_LOCKS, transport, out);
- GF_VALIDATE_OR_GOTO (POSIX_LOCKS, fd, out);
+ GF_VALIDATE_OR_GOTO ("posix-locks", flock, out);
+ GF_VALIDATE_OR_GOTO ("posix-locks", client, out);
+ GF_VALIDATE_OR_GOTO ("posix-locks", fd, out);
lock = GF_CALLOC (1, sizeof (posix_lock_t),
gf_locks_mt_posix_lock_t);
@@ -485,7 +486,7 @@ new_posix_lock (struct gf_flock *flock, void *transport, pid_t client_pid,
else
lock->fl_end = flock->l_start + flock->l_len - 1;
- lock->transport = transport;
+ lock->client = client;
lock->fd_num = fd_to_fdnum (fd);
lock->fd = fd;
lock->client_pid = client_pid;
@@ -565,7 +566,7 @@ same_owner (posix_lock_t *l1, posix_lock_t *l2)
{
return (is_same_lkowner (&l1->owner, &l2->owner) &&
- (l1->transport == l2->transport));
+ (l1->client == l2->client));
}
@@ -694,7 +695,7 @@ subtract_locks (posix_lock_t *big, posix_lock_t *small)
}
GF_ASSERT (0);
- gf_log (POSIX_LOCKS, GF_LOG_ERROR, "Unexpected case in subtract_locks");
+ gf_log ("posix-locks", GF_LOG_ERROR, "Unexpected case in subtract_locks");
out:
if (v.locks[0]) {
@@ -714,6 +715,36 @@ done:
return v;
}
+static posix_lock_t *
+first_conflicting_overlap (pl_inode_t *pl_inode, posix_lock_t *lock)
+{
+ posix_lock_t *l = NULL;
+ posix_lock_t *conf = NULL;
+
+ pthread_mutex_lock (&pl_inode->mutex);
+ {
+ list_for_each_entry (l, &pl_inode->ext_list, list) {
+ if (l->blocked)
+ continue;
+
+ if (locks_overlap (l, lock)) {
+ if (same_owner (l, lock))
+ continue;
+
+ if ((l->fl_type == F_WRLCK) ||
+ (lock->fl_type == F_WRLCK)) {
+ conf = l;
+ goto unlock;
+ }
+ }
+ }
+ }
+unlock:
+ pthread_mutex_unlock (&pl_inode->mutex);
+
+ return conf;
+}
+
/*
Start searching from {begin}, and return the first lock that
conflicts, NULL if no conflict
@@ -782,7 +813,7 @@ __insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock)
sum = add_locks (lock, conf);
sum->fl_type = lock->fl_type;
- sum->transport = lock->transport;
+ sum->client = lock->client;
sum->fd_num = lock->fd_num;
sum->client_pid = lock->client_pid;
sum->owner = lock->owner;
@@ -800,7 +831,7 @@ __insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock)
sum = add_locks (lock, conf);
sum->fl_type = conf->fl_type;
- sum->transport = conf->transport;
+ sum->client = conf->client;
sum->fd_num = conf->fd_num;
sum->client_pid = conf->client_pid;
sum->owner = conf->owner;
@@ -958,7 +989,7 @@ pl_send_prelock_unlock (xlator_t *this, pl_inode_t *pl_inode,
flock.l_len = old_lock->user_flock.l_len;
- unlock_lock = new_posix_lock (&flock, old_lock->transport,
+ unlock_lock = new_posix_lock (&flock, old_lock->client,
old_lock->client_pid, &old_lock->owner,
old_lock->fd);
GF_VALIDATE_OR_GOTO (this->name, unlock_lock, out);
@@ -1058,7 +1089,7 @@ pl_getlk (pl_inode_t *pl_inode, posix_lock_t *lock)
{
posix_lock_t *conf = NULL;
- conf = first_overlap (pl_inode, lock);
+ conf = first_conflicting_overlap (pl_inode, lock);
if (conf == NULL) {
lock->fl_type = F_UNLCK;
@@ -1067,3 +1098,4 @@ pl_getlk (pl_inode_t *pl_inode, posix_lock_t *lock)
return conf;
}
+
diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h
index 94ef4b2b4..5ec630ee8 100644
--- a/xlators/features/locks/src/common.h
+++ b/xlators/features/locks/src/common.h
@@ -14,12 +14,14 @@
/*dump locks format strings */
#define RANGE_FMT "type=%s, whence=%hd, start=%llu, len=%llu"
#define ENTRY_FMT "type=%s on basename=%s"
-#define DUMP_GEN_FMT "pid = %llu, owner=%s, transport=%p, "
+#define DUMP_GEN_FMT "pid = %llu, owner=%s, client=%p"
#define GRNTD_AT "granted at %s"
#define BLKD_AT "blocked at %s"
-#define DUMP_BLKD_FMT DUMP_GEN_FMT", "BLKD_AT
-#define DUMP_GRNTD_FMT DUMP_GEN_FMT", "GRNTD_AT
-#define DUMP_BLKD_GRNTD_FMT DUMP_GEN_FMT", "BLKD_AT", "GRNTD_AT
+#define CONN_ID "connection-id=%s"
+#define DUMP_BLKD_FMT DUMP_GEN_FMT", "CONN_ID", "BLKD_AT
+#define DUMP_GRNTD_FMT DUMP_GEN_FMT", "CONN_ID", "GRNTD_AT
+#define DUMP_BLKD_GRNTD_FMT DUMP_GEN_FMT", "CONN_ID", "BLKD_AT", "GRNTD_AT
+
#define ENTRY_BLKD_FMT ENTRY_FMT", "DUMP_BLKD_FMT
#define ENTRY_GRNTD_FMT ENTRY_FMT", "DUMP_GRNTD_FMT
#define ENTRY_BLKD_GRNTD_FMT ENTRY_FMT", "DUMP_BLKD_GRNTD_FMT
@@ -29,8 +31,10 @@
#define RANGE_BLKD_GRNTD_FMT RANGE_FMT", "DUMP_BLKD_GRNTD_FMT
#define SET_FLOCK_PID(flock, lock) ((flock)->l_pid = lock->client_pid)
+
+
posix_lock_t *
-new_posix_lock (struct gf_flock *flock, void *transport, pid_t client_pid,
+new_posix_lock (struct gf_flock *flock, client_t *client, pid_t client_pid,
gf_lkowner_t *owner, fd_t *fd);
pl_inode_t *
@@ -63,7 +67,8 @@ pl_dom_list_t *
get_domain (pl_inode_t *pl_inode, const char *volume);
void
-grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom);
+grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
+ pl_dom_list_t *dom);
void
__delete_inode_lock (pl_inode_lock_t *lock);
@@ -73,14 +78,14 @@ __pl_inodelk_unref (pl_inode_lock_t *lock);
void
grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
- pl_entry_lock_t *unlocked, pl_dom_list_t *dom);
+ pl_dom_list_t *dom);
void pl_update_refkeeper (xlator_t *this, inode_t *inode);
int32_t
-__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode);
+__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode, char *domname);
int32_t
-get_inodelk_count (xlator_t *this, inode_t *inode);
+get_inodelk_count (xlator_t *this, inode_t *inode, char *domname);
int32_t
__get_entrylk_count (xlator_t *this, pl_inode_t *pl_inode);
@@ -143,6 +148,11 @@ pl_verify_reservelk (xlator_t *this, pl_inode_t *pl_inode,
posix_lock_t *lock, int can_block);
int
pl_reserve_unlock (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *reqlock);
+
uint32_t
check_entrylk_on_basename (xlator_t *this, inode_t *parent, char *basename);
+
+void __pl_inodelk_unref (pl_inode_lock_t *lock);
+void __pl_entrylk_unref (pl_entry_lock_t *lock);
+
#endif /* __COMMON_H__ */
diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c
index 738b43be2..8496d9d8d 100644
--- a/xlators/features/locks/src/entrylk.c
+++ b/xlators/features/locks/src/entrylk.c
@@ -23,11 +23,29 @@
#include "locks.h"
#include "common.h"
+
+void
+__pl_entrylk_unref (pl_entry_lock_t *lock)
+{
+ lock->ref--;
+ if (!lock->ref) {
+ GF_FREE ((char *)lock->basename);
+ GF_FREE (lock->connection_id);
+ GF_FREE (lock);
+ }
+}
+
+
+static void
+__pl_entrylk_ref (pl_entry_lock_t *lock)
+{
+ lock->ref++;
+}
+
+
static pl_entry_lock_t *
new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type,
- void *trans, pid_t client_pid, gf_lkowner_t *owner,
- const char *volume)
-
+ const char *domain, call_frame_t *frame, char *conn_id)
{
pl_entry_lock_t *newlock = NULL;
@@ -39,14 +57,22 @@ new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type,
newlock->basename = basename ? gf_strdup (basename) : NULL;
newlock->type = type;
- newlock->trans = trans;
- newlock->volume = volume;
- newlock->client_pid = client_pid;
- newlock->owner = *owner;
+ newlock->client = frame->root->client;
+ newlock->client_pid = frame->root->pid;
+ newlock->volume = domain;
+ newlock->owner = frame->root->lk_owner;
+ newlock->frame = frame;
+ newlock->this = frame->this;
+
+ if (conn_id) {
+ newlock->connection_id = gf_strdup (conn_id);
+ }
INIT_LIST_HEAD (&newlock->domain_list);
INIT_LIST_HEAD (&newlock->blocked_locks);
+ INIT_LIST_HEAD (&newlock->client_list);
+ __pl_entrylk_ref (newlock);
out:
return newlock;
}
@@ -77,42 +103,42 @@ __same_entrylk_owner (pl_entry_lock_t *l1, pl_entry_lock_t *l2)
{
return (is_same_lkowner (&l1->owner, &l2->owner) &&
- (l1->trans == l2->trans));
+ (l1->client == l2->client));
}
/**
- * lock_grantable - is this lock grantable?
+ * entrylk_grantable - is this lock grantable?
* @inode: inode in which to look
* @basename: name we're trying to lock
* @type: type of lock
*/
static pl_entry_lock_t *
-__lock_grantable (pl_dom_list_t *dom, const char *basename, entrylk_type type)
+__entrylk_grantable (pl_dom_list_t *dom, pl_entry_lock_t *lock)
{
- pl_entry_lock_t *lock = NULL;
+ pl_entry_lock_t *tmp = NULL;
if (list_empty (&dom->entrylk_list))
return NULL;
- list_for_each_entry (lock, &dom->entrylk_list, domain_list) {
- if (names_conflict (lock->basename, basename))
- return lock;
+ list_for_each_entry (tmp, &dom->entrylk_list, domain_list) {
+ if (names_conflict (tmp->basename, lock->basename))
+ return tmp;
}
return NULL;
}
static pl_entry_lock_t *
-__blocked_lock_conflict (pl_dom_list_t *dom, const char *basename, entrylk_type type)
+__blocked_entrylk_conflict (pl_dom_list_t *dom, pl_entry_lock_t *lock)
{
- pl_entry_lock_t *lock = NULL;
+ pl_entry_lock_t *tmp = NULL;
if (list_empty (&dom->blocked_entrylks))
return NULL;
- list_for_each_entry (lock, &dom->blocked_entrylks, blocked_locks) {
- if (names_conflict (lock->basename, basename))
+ list_for_each_entry (tmp, &dom->blocked_entrylks, blocked_locks) {
+ if (names_conflict (tmp->basename, lock->basename))
return lock;
}
@@ -293,7 +319,7 @@ __find_most_matching_lock (pl_dom_list_t *dom, const char *basename)
}
/**
- * __lock_name - lock a name in a directory
+ * __lock_entrylk - lock a name in a directory
* @inode: inode for the directory in which to lock
* @basename: name of the entry to lock
* if null, lock the entire directory
@@ -304,86 +330,48 @@ __find_most_matching_lock (pl_dom_list_t *dom, const char *basename)
*/
int
-__lock_name (pl_inode_t *pinode, const char *basename, entrylk_type type,
- call_frame_t *frame, pl_dom_list_t *dom, xlator_t *this, int nonblock)
+__lock_entrylk (xlator_t *this, pl_inode_t *pinode, pl_entry_lock_t *lock,
+ int nonblock, pl_dom_list_t *dom)
{
- pl_entry_lock_t *lock = NULL;
- pl_entry_lock_t *conf = NULL;
- void *trans = NULL;
- pid_t client_pid = 0;
- int ret = -EINVAL;
+ pl_entry_lock_t *conf = NULL;
+ int ret = -EAGAIN;
- trans = frame->root->trans;
- client_pid = frame->root->pid;
-
- lock = new_entrylk_lock (pinode, basename, type, trans, client_pid,
- &frame->root->lk_owner, dom->domain);
- if (!lock) {
- ret = -ENOMEM;
- goto out;
- }
-
- lock->frame = frame;
- lock->this = this;
- lock->trans = trans;
-
- conf = __lock_grantable (dom, basename, type);
+ conf = __entrylk_grantable (dom, lock);
if (conf) {
ret = -EAGAIN;
- if (nonblock){
- GF_FREE ((char *)lock->basename);
- GF_FREE (lock);
+ if (nonblock)
goto out;
- }
-
gettimeofday (&lock->blkd_time, NULL);
list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks);
gf_log (this->name, GF_LOG_TRACE,
"Blocking lock: {pinode=%p, basename=%s}",
- pinode, basename);
+ pinode, lock->basename);
goto out;
}
- if ( __blocked_lock_conflict (dom, basename, type) && !(__owner_has_lock (dom, lock))) {
+ if (__blocked_entrylk_conflict (dom, lock) && !(__owner_has_lock (dom, lock))) {
ret = -EAGAIN;
- if (nonblock) {
- GF_FREE ((char *) lock->basename);
- GF_FREE (lock);
+ if (nonblock)
goto out;
- }
- lock->frame = frame;
- lock->this = this;
-
gettimeofday (&lock->blkd_time, NULL);
list_add_tail (&lock->blocked_locks, &dom->blocked_entrylks);
- gf_log (this->name, GF_LOG_TRACE,
+ gf_log (this->name, GF_LOG_DEBUG,
"Lock is grantable, but blocking to prevent starvation");
gf_log (this->name, GF_LOG_TRACE,
"Blocking lock: {pinode=%p, basename=%s}",
- pinode, basename);
+ pinode, lock->basename);
- ret = -EAGAIN;
goto out;
}
- switch (type) {
-
- case ENTRYLK_WRLCK:
- gettimeofday (&lock->granted_time, NULL);
- list_add_tail (&lock->domain_list, &dom->entrylk_list);
- break;
-
- default:
- gf_log (this->name, GF_LOG_DEBUG,
- "Invalid type for entrylk specified: %d", type);
- ret = -EINVAL;
- goto out;
- }
+ __pl_entrylk_ref (lock);
+ gettimeofday (&lock->granted_time, NULL);
+ list_add (&lock->domain_list, &dom->entrylk_list);
ret = 0;
out:
@@ -391,37 +379,36 @@ out:
}
/**
- * __unlock_name - unlock a name in a directory
+ * __unlock_entrylk - unlock a name in a directory
* @inode: inode for the directory to unlock in
* @basename: name of the entry to unlock
* if null, unlock the entire directory
*/
pl_entry_lock_t *
-__unlock_name (pl_dom_list_t *dom, const char *basename, entrylk_type type)
+__unlock_entrylk (pl_dom_list_t *dom, pl_entry_lock_t *lock)
{
- pl_entry_lock_t *lock = NULL;
+ pl_entry_lock_t *tmp = NULL;
pl_entry_lock_t *ret_lock = NULL;
- lock = __find_most_matching_lock (dom, basename);
+ tmp = __find_most_matching_lock (dom, lock->basename);
- if (!lock) {
- gf_log ("locks", GF_LOG_DEBUG,
+ if (!tmp) {
+ gf_log ("locks", GF_LOG_ERROR,
"unlock on %s (type=ENTRYLK_WRLCK) attempted but no matching lock found",
- basename);
+ lock->basename);
goto out;
}
- if (names_equal (lock->basename, basename)
- && lock->type == type) {
+ if (names_equal (tmp->basename, lock->basename)
+ && tmp->type == lock->type) {
+
+ list_del_init (&tmp->domain_list);
+ ret_lock = tmp;
- if (type == ENTRYLK_WRLCK) {
- list_del_init (&lock->domain_list);
- ret_lock = lock;
- }
} else {
- gf_log ("locks", GF_LOG_DEBUG,
- "Unlock for a non-existing lock!");
+ gf_log ("locks", GF_LOG_ERROR,
+ "Unlock on %s for a non-existing lock!", lock->basename);
goto out;
}
@@ -443,7 +430,7 @@ check_entrylk_on_basename (xlator_t *this, inode_t *parent, char *basename)
pthread_mutex_lock (&pinode->mutex);
{
list_for_each_entry (dom, &pinode->dom_list, inode_list) {
- conf = __lock_grantable (dom, basename, ENTRYLK_WRLCK);
+ conf = __find_most_matching_lock (dom, basename);
if (conf && conf->basename) {
entrylk = 1;
break;
@@ -469,26 +456,14 @@ __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
INIT_LIST_HEAD (&blocked_list);
list_splice_init (&dom->blocked_entrylks, &blocked_list);
- list_for_each_entry_safe (bl, tmp, &blocked_list,
- blocked_locks) {
+ list_for_each_entry_safe (bl, tmp, &blocked_list, blocked_locks) {
list_del_init (&bl->blocked_locks);
-
- gf_log ("locks", GF_LOG_TRACE,
- "Trying to unblock: {pinode=%p, basename=%s}",
- pl_inode, bl->basename);
-
- bl_ret = __lock_name (pl_inode, bl->basename, bl->type,
- bl->frame, dom, bl->this, 0);
+ bl_ret = __lock_entrylk (bl->this, pl_inode, bl, 0, dom);
if (bl_ret == 0) {
list_add (&bl->blocked_locks, granted);
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "should never happen");
- GF_FREE ((char *)bl->basename);
- GF_FREE (bl);
}
}
return;
@@ -497,7 +472,7 @@ __grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
/* Grants locks if possible which are blocked on a lock */
void
grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
- pl_entry_lock_t *unlocked, pl_dom_list_t *dom)
+ pl_dom_list_t *dom)
{
struct list_head granted_list;
pl_entry_lock_t *tmp = NULL;
@@ -507,121 +482,56 @@ grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode,
pthread_mutex_lock (&pl_inode->mutex);
{
- __grant_blocked_entry_locks (this, pl_inode, dom, &granted_list);
+ __grant_blocked_entry_locks (this, pl_inode, dom,
+ &granted_list);
}
pthread_mutex_unlock (&pl_inode->mutex);
list_for_each_entry_safe (lock, tmp, &granted_list, blocked_locks) {
- list_del_init (&lock->blocked_locks);
-
entrylk_trace_out (this, lock->frame, NULL, NULL, NULL,
lock->basename, ENTRYLK_LOCK, lock->type,
0, 0);
STACK_UNWIND_STRICT (entrylk, lock->frame, 0, 0, NULL);
+ lock->frame = NULL;
+ }
- }
-
- GF_FREE ((char *)unlocked->basename);
- GF_FREE (unlocked);
+ pthread_mutex_lock (&pl_inode->mutex);
+ {
+ list_for_each_entry_safe (lock, tmp, &granted_list, blocked_locks) {
+ list_del_init (&lock->blocked_locks);
+ __pl_entrylk_unref (lock);
+ }
+ }
+ pthread_mutex_unlock (&pl_inode->mutex);
return;
}
-/**
- * release_entry_locks_for_transport: release all entry locks from this
- * transport for this loc_t
- */
-
-static int
-release_entry_locks_for_transport (xlator_t *this, pl_inode_t *pinode,
- pl_dom_list_t *dom, void *trans)
-{
- pl_entry_lock_t *lock = NULL;
- pl_entry_lock_t *tmp = NULL;
- struct list_head granted;
- struct list_head released;
-
- INIT_LIST_HEAD (&granted);
- INIT_LIST_HEAD (&released);
-
- pthread_mutex_lock (&pinode->mutex);
- {
- list_for_each_entry_safe (lock, tmp, &dom->blocked_entrylks,
- blocked_locks) {
- if (lock->trans != trans)
- continue;
-
- list_del_init (&lock->blocked_locks);
-
- gf_log (this->name, GF_LOG_TRACE,
- "releasing lock on held by "
- "{transport=%p}",trans);
-
- list_add (&lock->blocked_locks, &released);
-
- }
-
- list_for_each_entry_safe (lock, tmp, &dom->entrylk_list,
- domain_list) {
- if (lock->trans != trans)
- continue;
-
- list_del_init (&lock->domain_list);
-
- gf_log (this->name, GF_LOG_TRACE,
- "releasing lock on held by "
- "{transport=%p}",trans);
-
- GF_FREE ((char *)lock->basename);
- GF_FREE (lock);
- }
-
- __grant_blocked_entry_locks (this, pinode, dom, &granted);
-
- }
-
- pthread_mutex_unlock (&pinode->mutex);
-
- list_for_each_entry_safe (lock, tmp, &released, blocked_locks) {
- list_del_init (&lock->blocked_locks);
-
- STACK_UNWIND_STRICT (entrylk, lock->frame, -1, EAGAIN, NULL);
-
- GF_FREE ((char *)lock->basename);
- GF_FREE (lock);
-
- }
-
- list_for_each_entry_safe (lock, tmp, &granted, blocked_locks) {
- list_del_init (&lock->blocked_locks);
-
- STACK_UNWIND_STRICT (entrylk, lock->frame, 0, 0, NULL);
-
- GF_FREE ((char *)lock->basename);
- GF_FREE (lock);
- }
-
- return 0;
-}
/* Common entrylk code called by pl_entrylk and pl_fentrylk */
int
pl_common_entrylk (call_frame_t *frame, xlator_t *this,
const char *volume, inode_t *inode, const char *basename,
- entrylk_cmd cmd, entrylk_type type, loc_t *loc, fd_t *fd)
-{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
+ entrylk_cmd cmd, entrylk_type type, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
- void * transport = NULL;
-
- pl_inode_t * pinode = NULL;
+{
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
int ret = -1;
- pl_entry_lock_t *unlocked = NULL;
char unwind = 1;
+ GF_UNUSED int dict_ret = -1;
+ pl_inode_t *pinode = NULL;
+ pl_entry_lock_t *reqlock = NULL;
+ pl_entry_lock_t *unlocked = NULL;
+ pl_dom_list_t *dom = NULL;
+ char *conn_id = NULL;
+ pl_ctx_t *ctx = NULL;
+ int nonblock = 0;
- pl_dom_list_t *dom = NULL;
+ if (xdata)
+ dict_ret = dict_get_str (xdata, "connection-id", &conn_id);
pinode = pl_inode_get (this, inode);
if (!pinode) {
@@ -629,6 +539,15 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
goto out;
}
+ if (frame->root->client) {
+ ctx = pl_ctx_get (frame->root->client, this);
+ if (!ctx) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed");
+ goto unwind;
+ }
+ }
+
dom = get_domain (pinode, volume);
if (!dom){
op_errno = ENOMEM;
@@ -637,73 +556,69 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
entrylk_trace_in (this, frame, volume, fd, loc, basename, cmd, type);
- transport = frame->root->trans;
-
- if (frame->root->lk_owner.len == 0) {
- /*
- this is a special case that means release
- all locks from this transport
- */
-
- gf_log (this->name, GF_LOG_TRACE,
- "Releasing locks for transport %p", transport);
-
- release_entry_locks_for_transport (this, pinode, dom, transport);
- op_ret = 0;
-
- goto out;
+ reqlock = new_entrylk_lock (pinode, basename, type, dom->domain, frame,
+ conn_id);
+ if (!reqlock) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
}
switch (cmd) {
- case ENTRYLK_LOCK:
- pthread_mutex_lock (&pinode->mutex);
- {
- ret = __lock_name (pinode, basename, type,
- frame, dom, this, 0);
- }
- pthread_mutex_unlock (&pinode->mutex);
-
- op_errno = -ret;
- if (ret < 0) {
- if (ret == -EAGAIN)
- unwind = 0;
- else
- unwind = 1;
- goto out;
- } else {
- op_ret = 0;
- op_errno = 0;
- unwind = 1;
- goto out;
- }
-
- break;
-
case ENTRYLK_LOCK_NB:
- unwind = 1;
+ nonblock = 1;
+ /* fall through */
+ case ENTRYLK_LOCK:
+ if (ctx)
+ pthread_mutex_lock (&ctx->lock);
pthread_mutex_lock (&pinode->mutex);
{
- ret = __lock_name (pinode, basename, type,
- frame, dom, this, 1);
+ reqlock->pinode = pinode;
+
+ ret = __lock_entrylk (this, pinode, reqlock, nonblock, dom);
+ if (ret == 0) {
+ reqlock->frame = NULL;
+ op_ret = 0;
+ } else {
+ op_errno = -ret;
+ }
+
+ if (ctx && (!ret || !nonblock))
+ list_add (&reqlock->client_list,
+ &ctx->entrylk_lockers);
+
+ if (ret == -EAGAIN && !nonblock) {
+ /* blocked */
+ unwind = 0;
+ } else {
+ __pl_entrylk_unref (reqlock);
+ }
}
pthread_mutex_unlock (&pinode->mutex);
-
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- break;
+ if (ctx)
+ pthread_mutex_unlock (&ctx->lock);
+ break;
case ENTRYLK_UNLOCK:
+ if (ctx)
+ pthread_mutex_lock (&ctx->lock);
pthread_mutex_lock (&pinode->mutex);
{
- unlocked = __unlock_name (dom, basename, type);
+ unlocked = __unlock_entrylk (dom, reqlock);
+ if (unlocked) {
+ list_del_init (&unlocked->client_list);
+ __pl_entrylk_unref (unlocked);
+ op_ret = 0;
+ } else {
+ op_errno = EINVAL;
+ }
+ __pl_entrylk_unref (reqlock);
}
pthread_mutex_unlock (&pinode->mutex);
+ if (ctx)
+ pthread_mutex_unlock (&ctx->lock);
- if (unlocked)
- grant_blocked_entry_locks (this, pinode, unlocked, dom);
+ grant_blocked_entry_locks (this, pinode, dom);
break;
@@ -713,21 +628,19 @@ pl_common_entrylk (call_frame_t *frame, xlator_t *this,
"a bug report at http://bugs.gluster.com", cmd);
goto out;
}
-
- op_ret = 0;
out:
pl_update_refkeeper (this, inode);
+
if (unwind) {
entrylk_trace_out (this, frame, volume, fd, loc, basename,
cmd, type, op_ret, op_errno);
-
+unwind:
STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, NULL);
} else {
entrylk_trace_block (this, frame, volume, fd, loc, basename,
cmd, type);
}
-
return 0;
}
@@ -740,10 +653,10 @@ out:
int
pl_entrylk (call_frame_t *frame, xlator_t *this,
const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
{
-
- pl_common_entrylk (frame, this, volume, loc->inode, basename, cmd, type, loc, NULL);
+ pl_common_entrylk (frame, this, volume, loc->inode, basename, cmd,
+ type, loc, NULL, xdata);
return 0;
}
@@ -758,10 +671,136 @@ pl_entrylk (call_frame_t *frame, xlator_t *this,
int
pl_fentrylk (call_frame_t *frame, xlator_t *this,
const char *volume, fd_t *fd, const char *basename,
- entrylk_cmd cmd, entrylk_type type)
+ entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
+{
+ pl_common_entrylk (frame, this, volume, fd->inode, basename, cmd,
+ type, NULL, fd, xdata);
+
+ return 0;
+}
+
+
+static void
+pl_entrylk_log_cleanup (pl_entry_lock_t *lock)
{
+ pl_inode_t *pinode = NULL;
+ char *path = NULL;
+ char *file = NULL;
+
+ pinode = lock->pinode;
+
+ inode_path (pinode->refkeeper, NULL, &path);
+
+ if (path)
+ file = path;
+ else
+ file = uuid_utoa (pinode->refkeeper->gfid);
+
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "releasing lock on %s held by "
+ "{client=%p, pid=%"PRId64" lk-owner=%s}",
+ file, lock->client, (uint64_t) lock->client_pid,
+ lkowner_utoa (&lock->owner));
+ GF_FREE (path);
+}
- pl_common_entrylk (frame, this, volume, fd->inode, basename, cmd, type, NULL, fd);
+
+/* Release all entrylks from this client */
+int
+pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
+{
+ pl_entry_lock_t *tmp = NULL;
+ pl_entry_lock_t *l = NULL;
+ pl_dom_list_t *dom = NULL;
+ pl_inode_t *pinode = NULL;
+
+ struct list_head released;
+ struct list_head unwind;
+
+ INIT_LIST_HEAD (&released);
+ INIT_LIST_HEAD (&unwind);
+
+ pthread_mutex_lock (&ctx->lock);
+ {
+ list_for_each_entry_safe (l, tmp, &ctx->entrylk_lockers,
+ client_list) {
+ list_del_init (&l->client_list);
+
+ pl_entrylk_log_cleanup (l);
+
+ pinode = l->pinode;
+
+ pthread_mutex_lock (&pinode->mutex);
+ {
+ /* If the entrylk object is part of granted list but not
+ * blocked list, then perform the following actions:
+ * i. delete the object from granted list;
+ * ii. grant other locks (from other clients) that may
+ * have been blocked on this entrylk; and
+ * iii. unref the object.
+ *
+ * If the entrylk object (L1) is part of both granted
+ * and blocked lists, then this means that a parallel
+ * unlock on another entrylk (L2 say) may have 'granted'
+ * L1 and added it to 'granted' list in
+ * __grant_blocked_entry_locks() (although using the
+ * 'blocked_locks' member). In that case, the cleanup
+ * codepath must try and grant other overlapping
+ * blocked entrylks from other clients, now that L1 is
+ * out of their way and then unref L1 in the end, and
+ * leave it to the other thread (the one executing
+ * unlock codepath) to unwind L1's frame, delete it from
+ * blocked_locks list, and perform the last unref on L1.
+ *
+ * If the entrylk object (L1) is part of blocked list
+ * only, the cleanup code path must:
+ * i. delete it from the blocked_locks list inside
+ * this critical section,
+ * ii. unwind its frame with EAGAIN,
+ * iii. try and grant blocked entry locks from other
+ * clients that were otherwise grantable, but were
+ * blocked to avoid leaving L1 to starve forever.
+ * iv. unref the object.
+ */
+ if (!list_empty (&l->domain_list)) {
+ list_del_init (&l->domain_list);
+ list_add_tail (&l->client_list,
+ &released);
+ } else {
+ list_del_init (&l->blocked_locks);
+ list_add_tail (&l->client_list,
+ &unwind);
+ }
+ }
+ pthread_mutex_unlock (&pinode->mutex);
+ }
+ }
+ pthread_mutex_unlock (&ctx->lock);
+
+ list_for_each_entry_safe (l, tmp, &unwind, client_list) {
+ list_del_init (&l->client_list);
+
+ if (l->frame)
+ STACK_UNWIND_STRICT (entrylk, l->frame, -1, EAGAIN,
+ NULL);
+ list_add_tail (&l->client_list, &released);
+ }
+
+ list_for_each_entry_safe (l, tmp, &released, client_list) {
+ list_del_init (&l->client_list);
+
+ pinode = l->pinode;
+
+ dom = get_domain (pinode, l->volume);
+
+ grant_blocked_entry_locks (this, pinode, dom);
+
+ pthread_mutex_lock (&pinode->mutex);
+ {
+ __pl_entrylk_unref (l);
+ }
+ pthread_mutex_unlock (&pinode->mutex);
+ }
return 0;
}
diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c
index 42350e59a..c76cb7f91 100644
--- a/xlators/features/locks/src/inodelk.c
+++ b/xlators/features/locks/src/inodelk.c
@@ -26,7 +26,7 @@
inline void
__delete_inode_lock (pl_inode_lock_t *lock)
{
- list_del (&lock->list);
+ list_del_init (&lock->list);
}
static inline void
@@ -39,8 +39,10 @@ inline void
__pl_inodelk_unref (pl_inode_lock_t *lock)
{
lock->ref--;
- if (!lock->ref)
+ if (!lock->ref) {
+ GF_FREE (lock->connection_id);
GF_FREE (lock);
+ }
}
/* Check if 2 inodelks are conflicting on type. Only 2 shared locks don't conflict */
@@ -122,7 +124,7 @@ static inline int
same_inodelk_owner (pl_inode_lock_t *l1, pl_inode_lock_t *l2)
{
return (is_same_lkowner (&l1->owner, &l2->owner) &&
- (l1->transport == l2->transport));
+ (l1->client == l2->client));
}
/* Returns true if the 2 inodelks conflict with each other */
@@ -202,7 +204,7 @@ __lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
int ret = -EINVAL;
conf = __inodelk_grantable (dom, lock);
- if (conf){
+ if (conf) {
ret = -EAGAIN;
if (can_block == 0)
goto out;
@@ -230,7 +232,7 @@ __lock_inodelk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
gettimeofday (&lock->blkd_time, NULL);
list_add_tail (&lock->blocked_locks, &dom->blocked_inodelks);
- gf_log (this->name, GF_LOG_TRACE,
+ gf_log (this->name, GF_LOG_DEBUG,
"Lock is grantable, but blocking to prevent starvation");
gf_log (this->name, GF_LOG_TRACE,
"%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => Blocked",
@@ -292,7 +294,7 @@ __inode_unlock_lock (xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom)
" Matching lock not found for unlock %llu-%llu, by %s "
"on %p", (unsigned long long)lock->fl_start,
(unsigned long long)lock->fl_end,
- lkowner_utoa (&lock->owner), lock->transport);
+ lkowner_utoa (&lock->owner), lock->client);
goto out;
}
__delete_inode_lock (conf);
@@ -300,11 +302,13 @@ __inode_unlock_lock (xlator_t *this, pl_inode_lock_t *lock, pl_dom_list_t *dom)
" Matching lock found for unlock %llu-%llu, by %s on %p",
(unsigned long long)lock->fl_start,
(unsigned long long)lock->fl_end, lkowner_utoa (&lock->owner),
- lock->transport);
+ lock->client);
out:
return conf;
}
+
+
static void
__grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
struct list_head *granted, pl_dom_list_t *dom)
@@ -333,7 +337,8 @@ __grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
/* Grant all inodelks blocked on a lock */
void
-grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *dom)
+grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode,
+ pl_dom_list_t *dom)
{
struct list_head granted;
pl_inode_lock_t *lock;
@@ -360,6 +365,7 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *
&lock->user_flock, 0, 0, lock->volume);
STACK_UNWIND_STRICT (inodelk, lock->frame, 0, 0, NULL);
+ lock->frame = NULL;
}
pthread_mutex_lock (&pl_inode->mutex);
@@ -372,108 +378,153 @@ grant_blocked_inode_locks (xlator_t *this, pl_inode_t *pl_inode, pl_dom_list_t *
pthread_mutex_unlock (&pl_inode->mutex);
}
-/* Release all inodelks from this transport */
-static int
-release_inode_locks_of_transport (xlator_t *this, pl_dom_list_t *dom,
- inode_t *inode, void *trans)
-{
- pl_inode_lock_t *tmp = NULL;
- pl_inode_lock_t *l = NULL;
-
- pl_inode_t * pinode = NULL;
-
- struct list_head released;
+static void
+pl_inodelk_log_cleanup (pl_inode_lock_t *lock)
+{
+ pl_inode_t *pl_inode = NULL;
char *path = NULL;
char *file = NULL;
- INIT_LIST_HEAD (&released);
+ pl_inode = lock->pl_inode;
- pinode = pl_inode_get (this, inode);
+ inode_path (pl_inode->refkeeper, NULL, &path);
- pthread_mutex_lock (&pinode->mutex);
- {
+ if (path)
+ file = path;
+ else
+ file = uuid_utoa (pl_inode->refkeeper->gfid);
- list_for_each_entry_safe (l, tmp, &dom->blocked_inodelks, blocked_locks) {
- if (l->transport != trans)
- continue;
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "releasing lock on %s held by "
+ "{client=%p, pid=%"PRId64" lk-owner=%s}",
+ file, lock->client, (uint64_t) lock->client_pid,
+ lkowner_utoa (&lock->owner));
+ GF_FREE (path);
+}
- list_del_init (&l->blocked_locks);
- inode_path (inode, NULL, &path);
- if (path)
- file = path;
- else
- file = uuid_utoa (inode->gfid);
+/* Release all inodelks from this client */
+int
+pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx)
+{
+ pl_inode_lock_t *tmp = NULL;
+ pl_inode_lock_t *l = NULL;
+ pl_dom_list_t *dom = NULL;
+ pl_inode_t *pl_inode = NULL;
- gf_log (this->name, GF_LOG_DEBUG,
- "releasing blocking lock on %s held by "
- "{transport=%p, pid=%"PRId64" lk-owner=%s}",
- file, trans, (uint64_t) l->client_pid,
- lkowner_utoa (&l->owner));
+ struct list_head released;
+ struct list_head unwind;
- list_add (&l->blocked_locks, &released);
- if (path) {
- GF_FREE (path);
- path = NULL;
+ INIT_LIST_HEAD (&released);
+ INIT_LIST_HEAD (&unwind);
+
+ pthread_mutex_lock (&ctx->lock);
+ {
+ list_for_each_entry_safe (l, tmp, &ctx->inodelk_lockers,
+ client_list) {
+ list_del_init (&l->client_list);
+
+ pl_inodelk_log_cleanup (l);
+
+ pl_inode = l->pl_inode;
+
+ pthread_mutex_lock (&pl_inode->mutex);
+ {
+ /* If the inodelk object is part of granted list but not
+ * blocked list, then perform the following actions:
+ * i. delete the object from granted list;
+ * ii. grant other locks (from other clients) that may
+ * have been blocked on this inodelk; and
+ * iii. unref the object.
+ *
+ * If the inodelk object (L1) is part of both granted
+ * and blocked lists, then this means that a parallel
+ * unlock on another inodelk (L2 say) may have 'granted'
+ * L1 and added it to 'granted' list in
+ * __grant_blocked_node_locks() (although using the
+ * 'blocked_locks' member). In that case, the cleanup
+ * codepath must try and grant other overlapping
+ * blocked inodelks from other clients, now that L1 is
+ * out of their way and then unref L1 in the end, and
+ * leave it to the other thread (the one executing
+ * unlock codepath) to unwind L1's frame, delete it from
+ * blocked_locks list, and perform the last unref on L1.
+ *
+ * If the inodelk object (L1) is part of blocked list
+ * only, the cleanup code path must:
+ * i. delete it from the blocked_locks list inside
+ * this critical section,
+ * ii. unwind its frame with EAGAIN,
+ * iii. try and grant blocked inode locks from other
+ * clients that were otherwise grantable, but just
+ * got blocked to avoid leaving L1 to starve
+ * forever.
+ * iv. unref the object.
+ */
+ if (!list_empty (&l->list)) {
+ __delete_inode_lock (l);
+ list_add_tail (&l->client_list,
+ &released);
+ } else {
+ list_del_init(&l->blocked_locks);
+ list_add_tail (&l->client_list,
+ &unwind);
+ }
}
+ pthread_mutex_unlock (&pl_inode->mutex);
}
+ }
+ pthread_mutex_unlock (&ctx->lock);
- list_for_each_entry_safe (l, tmp, &dom->inodelk_list, list) {
- if (l->transport != trans)
- continue;
-
- inode_path (inode, NULL, &path);
- if (path)
- file = path;
- else
- file = uuid_utoa (inode->gfid);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "releasing granted lock on %s held by "
- "{transport=%p, pid=%"PRId64" lk-owner=%s}",
- file, trans, (uint64_t) l->client_pid,
- lkowner_utoa (&l->owner));
-
- if (path) {
- GF_FREE (path);
- path = NULL;
- }
+ list_for_each_entry_safe (l, tmp, &unwind, client_list) {
+ list_del_init (&l->client_list);
+
+ if (l->frame)
+ STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN,
+ NULL);
+ list_add_tail (&l->client_list, &released);
- __delete_inode_lock (l);
- __pl_inodelk_unref (l);
- }
}
- GF_FREE (path);
- pthread_mutex_unlock (&pinode->mutex);
+ list_for_each_entry_safe (l, tmp, &released, client_list) {
+ list_del_init (&l->client_list);
+
+ pl_inode = l->pl_inode;
+
+ dom = get_domain (pl_inode, l->volume);
- list_for_each_entry_safe (l, tmp, &released, blocked_locks) {
- list_del_init (&l->blocked_locks);
+ grant_blocked_inode_locks (this, pl_inode, dom);
- STACK_UNWIND_STRICT (inodelk, l->frame, -1, EAGAIN, NULL);
- //No need to take lock as the locks are only in one list
- __pl_inodelk_unref (l);
+ pthread_mutex_lock (&pl_inode->mutex);
+ {
+ __pl_inodelk_unref (l);
+ }
+ pthread_mutex_unlock (&pl_inode->mutex);
}
- grant_blocked_inode_locks (this, pinode, dom);
return 0;
}
static int
-pl_inode_setlk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
- int can_block, pl_dom_list_t *dom)
+pl_inode_setlk (xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
+ pl_inode_lock_t *lock, int can_block, pl_dom_list_t *dom)
{
int ret = -EINVAL;
pl_inode_lock_t *retlock = NULL;
gf_boolean_t unref = _gf_true;
+ lock->pl_inode = pl_inode;
+
+ if (ctx)
+ pthread_mutex_lock (&ctx->lock);
pthread_mutex_lock (&pl_inode->mutex);
{
if (lock->fl_type != F_UNLCK) {
ret = __lock_inodelk (this, pl_inode, lock, can_block, dom);
if (ret == 0) {
+ lock->frame = NULL;
gf_log (this->name, GF_LOG_TRACE,
"%s (pid=%d) (lk-owner=%s) %"PRId64" - %"PRId64" => OK",
lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
@@ -492,6 +543,10 @@ pl_inode_setlk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
if (can_block)
unref = _gf_false;
}
+
+ if (ctx && (!ret || can_block))
+ list_add_tail (&lock->client_list,
+ &ctx->inodelk_lockers);
} else {
retlock = __inode_unlock_lock (this, lock, dom);
if (!retlock) {
@@ -500,23 +555,29 @@ pl_inode_setlk (xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
ret = -EINVAL;
goto out;
}
- __pl_inodelk_unref (retlock);
+ list_del_init (&retlock->client_list);
+ __pl_inodelk_unref (retlock);
ret = 0;
}
- }
out:
- if (unref)
- __pl_inodelk_unref (lock);
+ if (unref)
+ __pl_inodelk_unref (lock);
+ }
pthread_mutex_unlock (&pl_inode->mutex);
+ if (ctx)
+ pthread_mutex_unlock (&ctx->lock);
+
grant_blocked_inode_locks (this, pl_inode, dom);
+
return ret;
}
/* Create a new inode_lock_t */
pl_inode_lock_t *
-new_inode_lock (struct gf_flock *flock, void *transport, pid_t client_pid,
- gf_lkowner_t *owner, const char *volume)
+new_inode_lock (struct gf_flock *flock, client_t *client, pid_t client_pid,
+ call_frame_t *frame, xlator_t *this, const char *volume,
+ char *conn_id)
{
pl_inode_lock_t *lock = NULL;
@@ -535,33 +596,77 @@ new_inode_lock (struct gf_flock *flock, void *transport, pid_t client_pid,
else
lock->fl_end = flock->l_start + flock->l_len - 1;
- lock->transport = transport;
+ lock->client = client;
lock->client_pid = client_pid;
lock->volume = volume;
- lock->owner = *owner;
+ lock->owner = frame->root->lk_owner;
+ lock->frame = frame;
+ lock->this = this;
+
+ if (conn_id) {
+ lock->connection_id = gf_strdup (conn_id);
+ }
INIT_LIST_HEAD (&lock->list);
INIT_LIST_HEAD (&lock->blocked_locks);
+ INIT_LIST_HEAD (&lock->client_list);
__pl_inodelk_ref (lock);
return lock;
}
+int32_t
+_pl_convert_volume (const char *volume, char **res)
+{
+ char *mdata_vol = NULL;
+ int ret = 0;
+
+ mdata_vol = strrchr (volume, ':');
+ //if the volume already ends with :metadata don't bother
+ if (mdata_vol && (strcmp (mdata_vol, ":metadata") == 0))
+ return 0;
+
+ ret = gf_asprintf (res, "%s:metadata", volume);
+ if (ret <= 0)
+ return ENOMEM;
+ return 0;
+}
+
+int32_t
+_pl_convert_volume_for_special_range (struct gf_flock *flock,
+ const char *volume, char **res)
+{
+ int32_t ret = 0;
+
+ if ((flock->l_start == LLONG_MAX -1) &&
+ (flock->l_len == 0)) {
+ ret = _pl_convert_volume (volume, res);
+ }
+
+ return ret;
+}
+
/* Common inodelk code called from pl_inodelk and pl_finodelk */
int
pl_common_inodelk (call_frame_t *frame, xlator_t *this,
const char *volume, inode_t *inode, int32_t cmd,
- struct gf_flock *flock, loc_t *loc, fd_t *fd)
+ struct gf_flock *flock, loc_t *loc, fd_t *fd, dict_t *xdata)
{
int32_t op_ret = -1;
int32_t op_errno = 0;
int ret = -1;
+ GF_UNUSED int dict_ret = -1;
int can_block = 0;
- pid_t client_pid = -1;
- void * transport = NULL;
pl_inode_t * pinode = NULL;
pl_inode_lock_t * reqlock = NULL;
pl_dom_list_t * dom = NULL;
+ char *res = NULL;
+ char *res1 = NULL;
+ char *conn_id = NULL;
+ pl_ctx_t *ctx = NULL;
+
+ if (xdata)
+ dict_ret = dict_get_str (xdata, "connection-id", &conn_id);
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (inode, unwind);
@@ -572,10 +677,22 @@ pl_common_inodelk (call_frame_t *frame, xlator_t *this,
goto unwind;
}
+ op_errno = _pl_convert_volume_for_special_range (flock, volume, &res);
+ if (op_errno)
+ goto unwind;
+ if (res)
+ volume = res;
+
pl_trace_in (this, frame, fd, loc, cmd, flock, volume);
- transport = frame->root->trans;
- client_pid = frame->root->pid;
+ if (frame->root->client) {
+ ctx = pl_ctx_get (frame->root->client, this);
+ if (!ctx) {
+ op_errno = ENOMEM;
+ gf_log (this->name, GF_LOG_INFO, "pl_ctx_get() failed");
+ goto unwind;
+ }
+ }
pinode = pl_inode_get (this, inode);
if (!pinode) {
@@ -589,22 +706,8 @@ pl_common_inodelk (call_frame_t *frame, xlator_t *this,
goto unwind;
}
- if (frame->root->lk_owner.len == 0) {
- /*
- special case: this means release all locks
- from this transport
- */
- gf_log (this->name, GF_LOG_TRACE,
- "Releasing all locks from transport %p", transport);
-
- release_inode_locks_of_transport (this, dom, inode, transport);
-
- op_ret = 0;
- goto unwind;
- }
-
- reqlock = new_inode_lock (flock, transport, client_pid,
- &frame->root->lk_owner, volume);
+ reqlock = new_inode_lock (flock, frame->root->client, frame->root->pid,
+ frame, this, volume, conn_id);
if (!reqlock) {
op_ret = -1;
@@ -612,21 +715,17 @@ pl_common_inodelk (call_frame_t *frame, xlator_t *this,
goto unwind;
}
- reqlock->frame = frame;
- reqlock->this = this;
switch (cmd) {
case F_SETLKW:
can_block = 1;
- reqlock->frame = frame;
- reqlock->this = this;
/* fall through */
case F_SETLK:
memcpy (&reqlock->user_flock, flock, sizeof (struct gf_flock));
- ret = pl_inode_setlk (this, pinode, reqlock,
- can_block, dom);
+ ret = pl_inode_setlk (this, ctx, pinode, reqlock, can_block,
+ dom);
if (ret < 0) {
if ((can_block) && (F_UNLCK != flock->l_type)) {
@@ -659,53 +758,78 @@ unwind:
STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, NULL);
out:
+ GF_FREE (res);
+ GF_FREE (res1);
return 0;
}
int
pl_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock)
+ const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *flock,
+ dict_t *xdata)
{
-
- pl_common_inodelk (frame, this, volume, loc->inode, cmd, flock, loc, NULL);
+ pl_common_inodelk (frame, this, volume, loc->inode, cmd, flock,
+ loc, NULL, xdata);
return 0;
}
int
pl_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock)
+ const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *flock,
+ dict_t *xdata)
{
-
- pl_common_inodelk (frame, this, volume, fd->inode, cmd, flock, NULL, fd);
+ pl_common_inodelk (frame, this, volume, fd->inode, cmd, flock,
+ NULL, fd, xdata);
return 0;
}
+static inline int32_t
+__get_inodelk_dom_count (pl_dom_list_t *dom)
+{
+ pl_inode_lock_t *lock = NULL;
+ int32_t count = 0;
+
+ list_for_each_entry (lock, &dom->inodelk_list, list) {
+ count++;
+ }
+ list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) {
+ count++;
+ }
+ return count;
+}
+/* Returns the no. of locks (blocked/granted) held on a given domain name
+ * If @domname is NULL, returns the no. of locks in all the domains present.
+ * If @domname is non-NULL and non-existent, returns 0 */
int32_t
-__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode)
+__get_inodelk_count (xlator_t *this, pl_inode_t *pl_inode, char *domname)
{
int32_t count = 0;
- pl_inode_lock_t *lock = NULL;
pl_dom_list_t *dom = NULL;
list_for_each_entry (dom, &pl_inode->dom_list, inode_list) {
- list_for_each_entry (lock, &dom->inodelk_list, list) {
- count++;
- }
- list_for_each_entry (lock, &dom->blocked_inodelks, blocked_locks) {
- count++;
- }
+ if (domname) {
+ if (strcmp (domname, dom->domain) == 0) {
+ count = __get_inodelk_dom_count (dom);
+ goto out;
+ }
+
+ } else {
+ /* Counting locks from all domains */
+ count += __get_inodelk_dom_count (dom);
+ }
}
+out:
return count;
}
int32_t
-get_inodelk_count (xlator_t *this, inode_t *inode)
+get_inodelk_count (xlator_t *this, inode_t *inode, char *domname)
{
pl_inode_t *pl_inode = NULL;
uint64_t tmp_pl_inode = 0;
@@ -721,7 +845,7 @@ get_inodelk_count (xlator_t *this, inode_t *inode)
pthread_mutex_lock (&pl_inode->mutex);
{
- count = __get_inodelk_count (this, pl_inode);
+ count = __get_inodelk_count (this, pl_inode, domname);
}
pthread_mutex_unlock (&pl_inode->mutex);
diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h
index d2b47bffe..8c2a6f867 100644
--- a/xlators/features/locks/src/locks.h
+++ b/xlators/features/locks/src/locks.h
@@ -19,10 +19,10 @@
#include "stack.h"
#include "call-stub.h"
#include "locks-mem-types.h"
+#include "client_t.h"
#include "lkowner.h"
-#define POSIX_LOCKS "posix-locks"
struct __pl_fd;
struct __posix_lock {
@@ -33,7 +33,7 @@ struct __posix_lock {
off_t fl_end;
short blocked; /* waiting to acquire */
- struct gf_flock user_flock; /* the flock supplied by the user */
+ struct gf_flock user_flock; /* the flock supplied by the user */
xlator_t *this; /* required for blocked locks */
unsigned long fd_num;
@@ -46,7 +46,7 @@ struct __posix_lock {
/* These two together serve to uniquely identify each process
across nodes */
- void *transport; /* to identify client node */
+ void *client; /* to identify client node */
gf_lkowner_t owner;
pid_t client_pid; /* pid of client process */
};
@@ -63,9 +63,9 @@ struct __pl_inode_lock {
const char *volume;
- struct gf_flock user_flock; /* the flock supplied by the user */
+ struct gf_flock user_flock; /* the flock supplied by the user */
xlator_t *this; /* required for blocked locks */
- fd_t *fd;
+ struct __pl_inode *pl_inode;
call_frame_t *frame;
@@ -75,9 +75,13 @@ struct __pl_inode_lock {
/* These two together serve to uniquely identify each process
across nodes */
- void *transport; /* to identify client node */
+ void *client; /* to identify client node */
gf_lkowner_t owner;
pid_t client_pid; /* pid of client process */
+
+ char *connection_id; /* stores the client connection id */
+
+ struct list_head client_list; /* list of all locks from a client */
};
typedef struct __pl_inode_lock pl_inode_lock_t;
@@ -101,9 +105,11 @@ typedef struct __pl_dom_list_t pl_dom_list_t;
struct __entry_lock {
struct list_head domain_list; /* list_head back to pl_dom_list_t */
struct list_head blocked_locks; /* list_head back to blocked_entrylks */
+ int ref;
call_frame_t *frame;
xlator_t *this;
+ struct __pl_inode *pinode;
const char *volume;
@@ -113,9 +119,13 @@ struct __entry_lock {
struct timeval blkd_time; /*time at which lock was queued into blkd list*/
struct timeval granted_time; /*time at which lock was queued into active list*/
- void *trans;
+ void *client;
gf_lkowner_t owner;
- pid_t client_pid; /* pid of client process */
+ pid_t client_pid; /* pid of client process */
+
+ char *connection_id; /* stores the client connection id */
+
+ struct list_head client_list; /* list of all locks from a client */
};
typedef struct __entry_lock pl_entry_lock_t;
@@ -140,20 +150,17 @@ struct __pl_inode {
typedef struct __pl_inode pl_inode_t;
-struct __pl_fd {
- gf_boolean_t nonblocking; /* whether O_NONBLOCK has been set */
-};
-typedef struct __pl_fd pl_fd_t;
-
-
typedef struct {
gf_boolean_t mandatory; /* if mandatory locking is enabled */
gf_boolean_t trace; /* trace lock requests in and out */
+ char *brickname;
} posix_locks_private_t;
+
typedef struct {
gf_boolean_t entrylk_count_req;
gf_boolean_t inodelk_count_req;
+ gf_boolean_t inodelk_dom_count_req;
gf_boolean_t posixlk_count_req;
gf_boolean_t parent_entrylk_req;
@@ -165,8 +172,33 @@ typedef struct {
enum {TRUNCATE, FTRUNCATE} op;
} pl_local_t;
+
typedef struct {
struct list_head locks_list;
} pl_fdctx_t;
+
+struct _locker {
+ struct list_head lockers;
+ char *volume;
+ inode_t *inode;
+ gf_lkowner_t owner;
+};
+
+typedef struct _locks_ctx {
+ pthread_mutex_t lock;
+ struct list_head inodelk_lockers;
+ struct list_head entrylk_lockers;
+} pl_ctx_t;
+
+
+pl_ctx_t *
+pl_ctx_get (client_t *client, xlator_t *xlator);
+
+int
+pl_inodelk_client_cleanup (xlator_t *this, pl_ctx_t *ctx);
+
+int
+pl_entrylk_client_cleanup (xlator_t *this, pl_ctx_t *ctx);
+
#endif /* __POSIX_LOCKS_H__ */
diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c
index 0fbe57e22..337623d65 100644
--- a/xlators/features/locks/src/posix.c
+++ b/xlators/features/locks/src/posix.c
@@ -28,6 +28,8 @@
#include "common.h"
#include "statedump.h"
#include "clear.h"
+#include "defaults.h"
+#include "syncop.h"
#ifndef LLONG_MAX
#define LLONG_MAX LONG_LONG_MAX /* compat with old gcc */
@@ -38,6 +40,9 @@
void do_blocked_rw (pl_inode_t *);
static int __rw_allowable (pl_inode_t *, posix_lock_t *, glusterfs_fop_t);
+static int format_brickname(char *);
+int pl_lockinfo_get_brickname (xlator_t *, inode_t *, int32_t *);
+static int fetch_pathinfo(xlator_t *, inode_t *, int32_t *, char **);
static pl_fdctx_t *
pl_new_fdctx ()
@@ -46,7 +51,7 @@ pl_new_fdctx ()
fdctx = GF_CALLOC (1, sizeof (*fdctx),
gf_locks_mt_pl_fdctx_t);
- GF_VALIDATE_OR_GOTO (POSIX_LOCKS, fdctx, out);
+ GF_VALIDATE_OR_GOTO ("posix-locks", fdctx, out);
INIT_LIST_HEAD (&fdctx->locks_list);
@@ -61,7 +66,7 @@ pl_check_n_create_fdctx (xlator_t *this, fd_t *fd)
uint64_t tmp = 0;
pl_fdctx_t *fdctx = NULL;
- GF_VALIDATE_OR_GOTO (POSIX_LOCKS, this, out);
+ GF_VALIDATE_OR_GOTO ("posix-locks", this, out);
GF_VALIDATE_OR_GOTO (this->name, fd, out);
LOCK (&fd->lock);
@@ -114,7 +119,7 @@ pl_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
static int
truncate_allowed (pl_inode_t *pl_inode,
- void *transport, pid_t client_pid,
+ client_t *client, pid_t client_pid,
gf_lkowner_t *owner, off_t offset)
{
posix_lock_t *l = NULL;
@@ -123,7 +128,7 @@ truncate_allowed (pl_inode_t *pl_inode,
region.fl_start = offset;
region.fl_end = LLONG_MAX;
- region.transport = transport;
+ region.client = client;
region.client_pid = client_pid;
region.owner = *owner;
@@ -134,7 +139,7 @@ truncate_allowed (pl_inode_t *pl_inode,
&& locks_overlap (&region, l)
&& !same_owner (&region, l)) {
ret = 0;
- gf_log (POSIX_LOCKS, GF_LOG_TRACE, "Truncate "
+ gf_log ("posix-locks", GF_LOG_TRACE, "Truncate "
"allowed");
break;
}
@@ -181,7 +186,7 @@ truncate_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (priv->mandatory
&& pl_inode->mandatory
- && !truncate_allowed (pl_inode, frame->root->trans,
+ && !truncate_allowed (pl_inode, frame->root->client,
frame->root->pid, &frame->root->lk_owner,
local->offset)) {
op_ret = -1;
@@ -289,7 +294,7 @@ pl_locks_by_fd (pl_inode_t *pl_inode, fd_t *fd)
{
list_for_each_entry (l, &pl_inode->ext_list, list) {
- if ((l->fd_num == fd_to_fdnum(fd))) {
+ if (l->fd_num == fd_to_fdnum(fd)) {
found = 1;
break;
}
@@ -314,7 +319,7 @@ delete_locks_of_fd (xlator_t *this, pl_inode_t *pl_inode, fd_t *fd)
{
list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) {
- if ((l->fd_num == fd_to_fdnum(fd))) {
+ if (l->fd_num == fd_to_fdnum(fd)) {
if (l->blocked) {
list_move_tail (&l->list, &blocked_list);
continue;
@@ -342,7 +347,7 @@ delete_locks_of_fd (xlator_t *this, pl_inode_t *pl_inode, fd_t *fd)
static void
__delete_locks_of_owner (pl_inode_t *pl_inode,
- void *transport, gf_lkowner_t *owner)
+ client_t *client, gf_lkowner_t *owner)
{
posix_lock_t *tmp = NULL;
posix_lock_t *l = NULL;
@@ -352,7 +357,7 @@ __delete_locks_of_owner (pl_inode_t *pl_inode,
list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) {
if (l->blocked)
continue;
- if ((l->transport == transport) &&
+ if ((l->client == client) &&
is_same_lkowner (&l->owner, owner)) {
gf_log ("posix-locks", GF_LOG_TRACE,
" Flushing lock"
@@ -386,7 +391,7 @@ int32_t
pl_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
const char *name, dict_t *xdata)
{
- int op_errno = EINVAL;
+ int32_t op_errno = EINVAL;
int op_ret = -1;
int32_t bcount = 0;
int32_t gcount = 0;
@@ -394,7 +399,8 @@ pl_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
char *lk_summary = NULL;
pl_inode_t *pl_inode = NULL;
dict_t *dict = NULL;
- clrlk_args args = {0,};
+ clrlk_args args = {0,};
+ char *brickname = NULL;
if (!name)
goto usual;
@@ -441,29 +447,48 @@ pl_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
goto out;
}
+ op_ret = fetch_pathinfo (this, loc->inode, &op_errno, &brickname);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Couldn't get brickname");
+ } else {
+ op_ret = format_brickname(brickname);
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Couldn't format brickname");
+ GF_FREE(brickname);
+ brickname = NULL;
+ }
+ }
+
if (!gcount && !bcount) {
if (gf_asprintf (&lk_summary, "No locks cleared.") == -1) {
+ op_ret = -1;
op_errno = ENOMEM;
goto out;
}
} else if (gf_asprintf (&lk_summary, "%s: %s blocked locks=%d "
- "granted locks=%d", this->name,
+ "granted locks=%d",
+ (brickname == NULL)? this->name : brickname,
(args.type == CLRLK_INODE)? "inode":
(args.type == CLRLK_ENTRY)? "entry":
(args.type == CLRLK_POSIX)? "posix": " ",
bcount, gcount) == -1) {
+ op_ret = -1;
op_errno = ENOMEM;
goto out;
}
strncpy (key, name, strlen (name));
if (dict_set_dynstr (dict, key, lk_summary)) {
+ op_ret = -1;
op_errno = ENOMEM;
goto out;
}
op_ret = 0;
out:
+ GF_FREE(brickname);
STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata);
GF_FREE (args.opts);
@@ -479,6 +504,406 @@ usual:
return 0;
}
+static int
+format_brickname(char *brickname)
+{
+ int ret = -1;
+ char *hostname = NULL;
+ char *volume = NULL;
+ char *saveptr = NULL;
+
+ if (!brickname)
+ goto out;
+
+ strtok_r(brickname, ":", &saveptr);
+ hostname = gf_strdup(strtok_r(NULL, ":", &saveptr));
+ if (hostname == NULL)
+ goto out;
+ volume = gf_strdup(strtok_r(NULL, ".", &saveptr));
+ if (volume == NULL)
+ goto out;
+
+ sprintf(brickname, "%s:%s", hostname, volume);
+
+ ret = 0;
+out:
+ GF_FREE(hostname);
+ GF_FREE(volume);
+ return ret;
+}
+
+static int
+fetch_pathinfo (xlator_t *this, inode_t *inode, int32_t *op_errno,
+ char **brickname)
+{
+ int ret = -1;
+ loc_t loc = {0, };
+ dict_t *dict = NULL;
+
+ if (!brickname)
+ goto out;
+
+ if (!op_errno)
+ goto out;
+
+ uuid_copy (loc.gfid, inode->gfid);
+ loc.inode = inode_ref (inode);
+
+ ret = syncop_getxattr (FIRST_CHILD(this), &loc, &dict,
+ GF_XATTR_PATHINFO_KEY);
+ if (ret < 0) {
+ *op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, brickname);
+ if (ret)
+ goto out;
+
+ *brickname = gf_strdup(*brickname);
+ if (*brickname == NULL) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (dict != NULL) {
+ dict_unref (dict);
+ }
+ loc_wipe(&loc);
+
+ return ret;
+}
+
+
+int
+pl_lockinfo_get_brickname (xlator_t *this, inode_t *inode, int32_t *op_errno)
+{
+ int ret = -1;
+ posix_locks_private_t *priv = NULL;
+ char *brickname = NULL;
+ char *end = NULL;
+ char *tmp = NULL;
+
+ priv = this->private;
+
+ ret = fetch_pathinfo (this, inode, op_errno, &brickname);
+ if (ret)
+ goto out;
+
+ end = strrchr (brickname, ':');
+ if (!end) {
+ GF_FREE(brickname);
+ ret = -1;
+ goto out;
+ }
+
+ tmp = brickname;
+ brickname = gf_strndup (brickname, (end - brickname));
+ if (brickname == NULL) {
+ ret = -1;
+ goto out;
+ }
+
+ priv->brickname = brickname;
+ ret = 0;
+out:
+ GF_FREE(tmp);
+ return ret;
+}
+
+char *
+pl_lockinfo_key (xlator_t *this, inode_t *inode, int32_t *op_errno)
+{
+ posix_locks_private_t *priv = NULL;
+ char *key = NULL;
+ int ret = 0;
+
+ priv = this->private;
+
+ if (priv->brickname == NULL) {
+ ret = pl_lockinfo_get_brickname (this, inode, op_errno);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "cannot get brickname");
+ goto out;
+ }
+ }
+
+ key = priv->brickname;
+out:
+ return key;
+}
+
+int32_t
+pl_fgetxattr_handle_lockinfo (xlator_t *this, fd_t *fd,
+ dict_t *dict, int32_t *op_errno)
+{
+ pl_inode_t *pl_inode = NULL;
+ char *key = NULL, *buf = NULL;
+ int32_t op_ret = 0;
+ unsigned long fdnum = 0;
+ int32_t len = 0;
+ dict_t *tmp = NULL;
+
+ pl_inode = pl_inode_get (this, fd->inode);
+
+ if (!pl_inode) {
+ gf_log (this->name, GF_LOG_DEBUG, "Could not get inode.");
+ *op_errno = EBADFD;
+ op_ret = -1;
+ goto out;
+ }
+
+ if (!pl_locks_by_fd (pl_inode, fd)) {
+ op_ret = 0;
+ goto out;
+ }
+
+ fdnum = fd_to_fdnum (fd);
+
+ key = pl_lockinfo_key (this, fd->inode, op_errno);
+ if (key == NULL) {
+ op_ret = -1;
+ goto out;
+ }
+
+ tmp = dict_new ();
+ if (tmp == NULL) {
+ op_ret = -1;
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ op_ret = dict_set_uint64 (tmp, key, fdnum);
+ if (op_ret < 0) {
+ *op_errno = -op_ret;
+ op_ret = -1;
+ gf_log (this->name, GF_LOG_WARNING, "setting lockinfo value "
+ "(%lu) for fd (ptr:%p inode-gfid:%s) failed (%s)",
+ fdnum, fd, uuid_utoa (fd->inode->gfid),
+ strerror (*op_errno));
+ goto out;
+ }
+
+ len = dict_serialized_length (tmp);
+ if (len < 0) {
+ *op_errno = -op_ret;
+ op_ret = -1;
+ gf_log (this->name, GF_LOG_WARNING,
+ "dict_serialized_length failed (%s) while handling "
+ "lockinfo for fd (ptr:%p inode-gfid:%s)",
+ strerror (*op_errno), fd, uuid_utoa (fd->inode->gfid));
+ goto out;
+ }
+
+ buf = GF_CALLOC (1, len, gf_common_mt_char);
+ if (buf == NULL) {
+ op_ret = -1;
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ op_ret = dict_serialize (tmp, buf);
+ if (op_ret < 0) {
+ *op_errno = -op_ret;
+ op_ret = -1;
+ gf_log (this->name, GF_LOG_WARNING,
+ "dict_serialize failed (%s) while handling lockinfo "
+ "for fd (ptr: %p inode-gfid:%s)", strerror (*op_errno),
+ fd, uuid_utoa (fd->inode->gfid));
+ goto out;
+ }
+
+ op_ret = dict_set_dynptr (dict, GF_XATTR_LOCKINFO_KEY, buf, len);
+ if (op_ret < 0) {
+ *op_errno = -op_ret;
+ op_ret = -1;
+ gf_log (this->name, GF_LOG_WARNING, "setting lockinfo value "
+ "(%lu) for fd (ptr:%p inode-gfid:%s) failed (%s)",
+ fdnum, fd, uuid_utoa (fd->inode->gfid),
+ strerror (*op_errno));
+ goto out;
+ }
+
+ buf = NULL;
+out:
+ if (tmp != NULL) {
+ dict_unref (tmp);
+ }
+
+ if (buf != NULL) {
+ GF_FREE (buf);
+ }
+
+ return op_ret;
+}
+
+
+int32_t
+pl_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ int32_t op_ret = 0, op_errno = 0;
+ dict_t *dict = NULL;
+
+ if (!name) {
+ goto usual;
+ }
+
+ if (strcmp (name, GF_XATTR_LOCKINFO_KEY) == 0) {
+ dict = dict_new ();
+ if (dict == NULL) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ op_ret = pl_fgetxattr_handle_lockinfo (this, fd, dict,
+ &op_errno);
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "getting lockinfo on fd (ptr:%p inode-gfid:%s) "
+ "failed (%s)", fd, uuid_utoa (fd->inode->gfid),
+ strerror (op_errno));
+ }
+
+ goto unwind;
+ } else {
+ goto usual;
+ }
+
+unwind:
+ STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, NULL);
+ if (dict != NULL) {
+ dict_unref (dict);
+ }
+
+ return 0;
+
+usual:
+ STACK_WIND (frame, default_fgetxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata);
+ return 0;
+}
+
+int32_t
+pl_migrate_locks (call_frame_t *frame, fd_t *newfd, uint64_t oldfd_num,
+ int32_t *op_errno)
+{
+ pl_inode_t *pl_inode = NULL;
+ uint64_t newfd_num = 0;
+ posix_lock_t *l = NULL;
+ int32_t op_ret = 0;
+
+ newfd_num = fd_to_fdnum (newfd);
+
+ pl_inode = pl_inode_get (frame->this, newfd->inode);
+ if (pl_inode == NULL) {
+ op_ret = -1;
+ *op_errno = EBADFD;
+ goto out;
+ }
+
+ pthread_mutex_lock (&pl_inode->mutex);
+ {
+ list_for_each_entry (l, &pl_inode->ext_list, list) {
+ if (l->fd_num == oldfd_num) {
+ l->fd_num = newfd_num;
+ l->client = frame->root->client;
+ }
+ }
+ }
+ pthread_mutex_unlock (&pl_inode->mutex);
+
+ op_ret = 0;
+out:
+ return op_ret;
+}
+
+int32_t
+pl_fsetxattr_handle_lockinfo (call_frame_t *frame, fd_t *fd, char *lockinfo_buf,
+ int len, int32_t *op_errno)
+{
+ int32_t op_ret = -1;
+ dict_t *lockinfo = NULL;
+ uint64_t oldfd_num = 0;
+ char *key = NULL;
+
+ lockinfo = dict_new ();
+ if (lockinfo == NULL) {
+ op_ret = -1;
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ op_ret = dict_unserialize (lockinfo_buf, len, &lockinfo);
+ if (op_ret < 0) {
+ *op_errno = -op_ret;
+ op_ret = -1;
+ goto out;
+ }
+
+ key = pl_lockinfo_key (frame->this, fd->inode, op_errno);
+ if (key == NULL) {
+ op_ret = -1;
+ goto out;
+ }
+
+ op_ret = dict_get_uint64 (lockinfo, key, &oldfd_num);
+
+ if (oldfd_num == 0) {
+ op_ret = 0;
+ goto out;
+ }
+
+ op_ret = pl_migrate_locks (frame, fd, oldfd_num, op_errno);
+ if (op_ret < 0) {
+ gf_log (frame->this->name, GF_LOG_WARNING,
+ "migration of locks from oldfd (ptr:%p) to newfd "
+ "(ptr:%p) (inode-gfid:%s)", (void *)oldfd_num, fd,
+ uuid_utoa (fd->inode->gfid));
+ goto out;
+ }
+
+out:
+ dict_unref (lockinfo);
+
+ return op_ret;
+}
+
+int32_t
+pl_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ int32_t op_ret = 0, op_errno = 0;
+ void *lockinfo_buf = NULL;
+ int len = 0;
+
+ op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY,
+ &lockinfo_buf, &len);
+ if (lockinfo_buf == NULL) {
+ goto usual;
+ }
+
+ op_ret = pl_fsetxattr_handle_lockinfo (frame, fd, lockinfo_buf, len,
+ &op_errno);
+ if (op_ret < 0) {
+ goto unwind;
+ }
+
+usual:
+ STACK_WIND (frame, default_fsetxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, NULL);
+ return 0;
+}
+
int32_t
pl_opendir_cbk (call_frame_t *frame,
void *cookie,
@@ -560,7 +985,7 @@ pl_flush (call_frame_t *frame, xlator_t *this,
}
pthread_mutex_lock (&pl_inode->mutex);
{
- __delete_locks_of_owner (pl_inode, frame->root->trans,
+ __delete_locks_of_owner (pl_inode, frame->root->client,
&frame->root->lk_owner);
}
pthread_mutex_unlock (&pl_inode->mutex);
@@ -755,7 +1180,7 @@ pl_readv (call_frame_t *frame, xlator_t *this,
if (priv->mandatory && pl_inode->mandatory) {
region.fl_start = offset;
region.fl_end = offset + size - 1;
- region.transport = frame->root->trans;
+ region.client = frame->root->client;
region.fd_num = fd_to_fdnum(fd);
region.client_pid = frame->root->pid;
region.owner = frame->root->lk_owner;
@@ -849,7 +1274,7 @@ pl_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
if (priv->mandatory && pl_inode->mandatory) {
region.fl_start = offset;
region.fl_end = offset + iov_length (vector, count) - 1;
- region.transport = frame->root->trans;
+ region.client = frame->root->client;
region.fd_num = fd_to_fdnum(fd);
region.client_pid = frame->root->pid;
region.owner = frame->root->lk_owner;
@@ -916,7 +1341,7 @@ __fd_has_locks (pl_inode_t *pl_inode, fd_t *fd)
posix_lock_t *l = NULL;
list_for_each_entry (l, &pl_inode->ext_list, list) {
- if ((l->fd_num == fd_to_fdnum(fd))) {
+ if (l->fd_num == fd_to_fdnum(fd)) {
found = 1;
break;
}
@@ -930,7 +1355,7 @@ lock_dup (posix_lock_t *lock)
{
posix_lock_t *new_lock = NULL;
- new_lock = new_posix_lock (&lock->user_flock, lock->transport,
+ new_lock = new_posix_lock (&lock->user_flock, lock->client,
lock->client_pid, &lock->owner,
(fd_t *)lock->fd_num);
return new_lock;
@@ -945,7 +1370,7 @@ __dup_locks_to_fdctx (pl_inode_t *pl_inode, fd_t *fd,
int ret = 0;
list_for_each_entry (l, &pl_inode->ext_list, list) {
- if ((l->fd_num == fd_to_fdnum(fd))) {
+ if (l->fd_num == fd_to_fdnum(fd)) {
duplock = lock_dup (l);
if (!duplock) {
ret = -1;
@@ -1090,8 +1515,6 @@ int
pl_lk (call_frame_t *frame, xlator_t *this,
fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
{
- void *transport = NULL;
- pid_t client_pid = 0;
pl_inode_t *pl_inode = NULL;
int op_ret = 0;
int op_errno = 0;
@@ -1100,9 +1523,6 @@ pl_lk (call_frame_t *frame, xlator_t *this,
posix_lock_t *conf = NULL;
int ret = 0;
- transport = frame->root->trans;
- client_pid = frame->root->pid;
-
if ((flock->l_start < 0) || (flock->l_len < 0)) {
op_ret = -1;
op_errno = EINVAL;
@@ -1116,7 +1536,7 @@ pl_lk (call_frame_t *frame, xlator_t *this,
goto unwind;
}
- reqlock = new_posix_lock (flock, transport, client_pid,
+ reqlock = new_posix_lock (flock, frame->root->client, frame->root->pid,
&frame->root->lk_owner, fd);
if (!reqlock) {
@@ -1341,6 +1761,7 @@ pl_forget (xlator_t *this,
list_del_init (&entry_l->domain_list);
GF_FREE ((char *)entry_l->basename);
+ GF_FREE (entry_l->connection_id);
GF_FREE (entry_l);
}
@@ -1374,6 +1795,7 @@ pl_forget (xlator_t *this,
STACK_UNWIND_STRICT (entrylk, entry_l->frame, -1, 0, NULL);
GF_FREE ((char *)entry_l->basename);
+ GF_FREE (entry_l->connection_id);
GF_FREE (entry_l);
}
@@ -1523,19 +1945,34 @@ pl_entrylk_xattr_fill (xlator_t *this, inode_t *inode,
}
void
-pl_inodelk_xattr_fill (xlator_t *this, inode_t *inode,
- dict_t *dict)
+pl_inodelk_xattr_fill (xlator_t *this, inode_t *inode, dict_t *dict,
+ gf_boolean_t per_dom)
{
int32_t count = 0;
int ret = -1;
+ char *domname = NULL;
+
+
+ if (per_dom){
+ ret = dict_get_str (dict, GLUSTERFS_INODELK_DOM_COUNT,
+ &domname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "value for key %s",GLUSTERFS_INODELK_DOM_COUNT);
+ goto out;
+ }
+ }
+
+ count = get_inodelk_count (this, inode, domname);
- count = get_inodelk_count (this, inode);
ret = dict_set_int32 (dict, GLUSTERFS_INODELK_COUNT, count);
if (ret < 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- " dict_set failed on key %s", GLUSTERFS_INODELK_COUNT);
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to set count for "
+ "key %s", GLUSTERFS_INODELK_COUNT);
}
+out:
+ return;
}
void
@@ -1580,7 +2017,9 @@ pl_lookup_cbk (call_frame_t *frame,
if (local->entrylk_count_req)
pl_entrylk_xattr_fill (this, inode, xdata);
if (local->inodelk_count_req)
- pl_inodelk_xattr_fill (this, inode, xdata);
+ pl_inodelk_xattr_fill (this, inode, xdata, _gf_false);
+ if (local->inodelk_dom_count_req)
+ pl_inodelk_xattr_fill (this, inode, xdata, _gf_true);
if (local->posixlk_count_req)
pl_posixlk_xattr_fill (this, inode, xdata);
@@ -1627,6 +2066,8 @@ pl_lookup (call_frame_t *frame,
local->entrylk_count_req = 1;
if (dict_get (xdata, GLUSTERFS_INODELK_COUNT))
local->inodelk_count_req = 1;
+ if (dict_get (xdata, GLUSTERFS_INODELK_DOM_COUNT))
+ local->inodelk_dom_count_req = 1;
if (dict_get (xdata, GLUSTERFS_POSIXLK_COUNT))
local->posixlk_count_req = 1;
if (dict_get (xdata, GLUSTERFS_PARENT_ENTRYLK))
@@ -1665,7 +2106,11 @@ pl_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (local->entrylk_count_req)
pl_entrylk_xattr_fill (this, entry->inode, entry->dict);
if (local->inodelk_count_req)
- pl_inodelk_xattr_fill (this, entry->inode, entry->dict);
+ pl_inodelk_xattr_fill (this, entry->inode, entry->dict,
+ _gf_false);
+ if (local->inodelk_dom_count_req)
+ pl_inodelk_xattr_fill (this, entry->inode, entry->dict,
+ _gf_true);
if (local->posixlk_count_req)
pl_posixlk_xattr_fill (this, entry->inode, entry->dict);
}
@@ -1694,6 +2139,8 @@ pl_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
local->entrylk_count_req = 1;
if (dict_get (dict, GLUSTERFS_INODELK_COUNT))
local->inodelk_count_req = 1;
+ if (dict_get (dict, GLUSTERFS_INODELK_DOM_COUNT))
+ local->inodelk_dom_count_req = 1;
if (dict_get (dict, GLUSTERFS_POSIXLK_COUNT))
local->posixlk_count_req = 1;
}
@@ -1713,8 +2160,8 @@ out:
void
pl_dump_lock (char *str, int size, struct gf_flock *flock,
- gf_lkowner_t *owner, void *trans, time_t *granted_time,
- time_t *blkd_time, gf_boolean_t active)
+ gf_lkowner_t *owner, void *trans, char *conn_id,
+ time_t *granted_time, time_t *blkd_time, gf_boolean_t active)
{
char *type_str = NULL;
char granted[32] = {0,};
@@ -1742,7 +2189,7 @@ pl_dump_lock (char *str, int size, struct gf_flock *flock,
(unsigned long long) flock->l_start,
(unsigned long long) flock->l_len,
(unsigned long long) flock->l_pid,
- lkowner_utoa (owner), trans,
+ lkowner_utoa (owner), trans, conn_id,
ctime_r (granted_time, granted));
} else {
snprintf (str, size, RANGE_BLKD_GRNTD_FMT,
@@ -1750,7 +2197,7 @@ pl_dump_lock (char *str, int size, struct gf_flock *flock,
(unsigned long long) flock->l_start,
(unsigned long long) flock->l_len,
(unsigned long long) flock->l_pid,
- lkowner_utoa (owner), trans,
+ lkowner_utoa (owner), trans, conn_id,
ctime_r (blkd_time, blocked),
ctime_r (granted_time, granted));
}
@@ -1761,7 +2208,7 @@ pl_dump_lock (char *str, int size, struct gf_flock *flock,
(unsigned long long) flock->l_start,
(unsigned long long) flock->l_len,
(unsigned long long) flock->l_pid,
- lkowner_utoa (owner), trans,
+ lkowner_utoa (owner), trans, conn_id,
ctime_r (blkd_time, blocked));
}
@@ -1798,14 +2245,16 @@ __dump_entrylks (pl_inode_t *pl_inode)
lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" :
"ENTRYLK_WRLCK", lock->basename,
(unsigned long long) lock->client_pid,
- lkowner_utoa (&lock->owner), lock->trans,
+ lkowner_utoa (&lock->owner), lock->client,
+ lock->connection_id,
ctime_r (&lock->granted_time.tv_sec, granted));
} else {
snprintf (tmp, 256, ENTRY_BLKD_GRNTD_FMT,
lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" :
"ENTRYLK_WRLCK", lock->basename,
(unsigned long long) lock->client_pid,
- lkowner_utoa (&lock->owner), lock->trans,
+ lkowner_utoa (&lock->owner), lock->client,
+ lock->connection_id,
ctime_r (&lock->blkd_time.tv_sec, blocked),
ctime_r (&lock->granted_time.tv_sec, granted));
}
@@ -1824,7 +2273,8 @@ __dump_entrylks (pl_inode_t *pl_inode)
lock->type == ENTRYLK_RDLCK ? "ENTRYLK_RDLCK" :
"ENTRYLK_WRLCK", lock->basename,
(unsigned long long) lock->client_pid,
- lkowner_utoa (&lock->owner), lock->trans,
+ lkowner_utoa (&lock->owner), lock->client,
+ lock->connection_id,
ctime_r (&lock->blkd_time.tv_sec, blocked));
gf_proc_dump_write(key, tmp);
@@ -1875,7 +2325,7 @@ __dump_inodelks (pl_inode_t *pl_inode)
SET_FLOCK_PID (&lock->user_flock, lock);
pl_dump_lock (tmp, 256, &lock->user_flock,
&lock->owner,
- lock->transport,
+ lock->client, lock->connection_id,
&lock->granted_time.tv_sec,
&lock->blkd_time.tv_sec,
_gf_true);
@@ -1892,7 +2342,7 @@ __dump_inodelks (pl_inode_t *pl_inode)
SET_FLOCK_PID (&lock->user_flock, lock);
pl_dump_lock (tmp, 256, &lock->user_flock,
&lock->owner,
- lock->transport,
+ lock->client, lock->connection_id,
0, &lock->blkd_time.tv_sec,
_gf_false);
gf_proc_dump_write(key, tmp);
@@ -1933,7 +2383,7 @@ __dump_posixlks (pl_inode_t *pl_inode)
count,
lock->blocked ? "BLOCKED" : "ACTIVE");
pl_dump_lock (tmp, 256, &lock->user_flock,
- &lock->owner, lock->transport,
+ &lock->owner, lock->client, NULL,
&lock->granted_time.tv_sec, &lock->blkd_time.tv_sec,
(lock->blocked)? _gf_false: _gf_true);
gf_proc_dump_write(key, tmp);
@@ -2010,7 +2460,7 @@ unlock:
__dump_entrylks (pl_inode);
}
- count = __get_inodelk_count (this, pl_inode);
+ count = __get_inodelk_count (this, pl_inode, NULL);
if (count) {
gf_proc_dump_write("inodelk-count", "%d", count);
__dump_inodelks (pl_inode);
@@ -2057,6 +2507,79 @@ mem_acct_init (xlator_t *this)
return ret;
}
+
+pl_ctx_t*
+pl_ctx_get (client_t *client, xlator_t *xlator)
+{
+ void *tmp = NULL;
+ pl_ctx_t *ctx = NULL;
+
+ client_ctx_get (client, xlator, &tmp);
+
+ ctx = tmp;
+
+ if (ctx != NULL)
+ goto out;
+
+ ctx = GF_CALLOC (1, sizeof (pl_ctx_t), gf_locks_mt_posix_lock_t);
+
+ if (ctx == NULL)
+ goto out;
+
+ pthread_mutex_init (&ctx->lock, NULL);
+ INIT_LIST_HEAD (&ctx->inodelk_lockers);
+ INIT_LIST_HEAD (&ctx->entrylk_lockers);
+
+ if (client_ctx_set (client, xlator, ctx) != 0) {
+ pthread_mutex_destroy (&ctx->lock);
+ GF_FREE (ctx);
+ ctx = NULL;
+ }
+out:
+ return ctx;
+}
+
+
+static int
+pl_client_disconnect_cbk (xlator_t *this, client_t *client)
+{
+ pl_ctx_t *pl_ctx = NULL;
+
+ pl_ctx = pl_ctx_get (client, this);
+
+ pl_inodelk_client_cleanup (this, pl_ctx);
+
+ pl_entrylk_client_cleanup (this, pl_ctx);
+
+ return 0;
+}
+
+
+static int
+pl_client_destroy_cbk (xlator_t *this, client_t *client)
+{
+ void *tmp = NULL;
+ pl_ctx_t *pl_ctx = NULL;
+
+ pl_client_disconnect_cbk (this, client);
+
+ client_ctx_del (client, this, &tmp);
+
+ if (tmp == NULL)
+ return 0;
+
+ pl_ctx = tmp;
+
+ GF_ASSERT (list_empty(&pl_ctx->inodelk_lockers));
+ GF_ASSERT (list_empty(&pl_ctx->entrylk_lockers));
+
+ pthread_mutex_destroy (&pl_ctx->lock);
+ GF_FREE (pl_ctx);
+
+ return 0;
+}
+
+
int
init (xlator_t *this)
{
@@ -2085,7 +2608,7 @@ init (xlator_t *this)
gf_log (this->name, GF_LOG_CRITICAL,
"'locks' translator is not loaded over a storage "
"translator");
- goto out;;
+ goto out;
}
priv = GF_CALLOC (1, sizeof (*priv),
@@ -2134,6 +2657,7 @@ fini (xlator_t *this)
if (!priv)
return 0;
this->private = NULL;
+ GF_FREE (priv->brickname);
GF_FREE (priv);
return 0;
@@ -2177,6 +2701,8 @@ struct xlator_fops fops = {
.opendir = pl_opendir,
.readdirp = pl_readdirp,
.getxattr = pl_getxattr,
+ .fgetxattr = pl_fgetxattr,
+ .fsetxattr = pl_fsetxattr,
};
struct xlator_dumpops dumpops = {
@@ -2184,9 +2710,11 @@ struct xlator_dumpops dumpops = {
};
struct xlator_cbks cbks = {
- .forget = pl_forget,
- .release = pl_release,
- .releasedir = pl_releasedir,
+ .forget = pl_forget,
+ .release = pl_release,
+ .releasedir = pl_releasedir,
+ .client_destroy = pl_client_destroy_cbk,
+ .client_disconnect = pl_client_disconnect_cbk,
};
diff --git a/xlators/features/mac-compat/src/Makefile.am b/xlators/features/mac-compat/src/Makefile.am
index 3ba71256a..42ed350e9 100644
--- a/xlators/features/mac-compat/src/Makefile.am
+++ b/xlators/features/mac-compat/src/Makefile.am
@@ -1,14 +1,15 @@
xlator_LTLIBRARIES = mac-compat.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
-mac_compat_la_LDFLAGS = -module -avoidversion
+mac_compat_la_LDFLAGS = -module -avoid-version
mac_compat_la_SOURCES = mac-compat.c
mac_compat_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+noinst_HEADERS = mac-compat.h
+
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
AM_CFLAGS = -Wall $(GF_CFLAGS)
-CLEANFILES =
-
+CLEANFILES =
diff --git a/xlators/features/mac-compat/src/mac-compat.c b/xlators/features/mac-compat/src/mac-compat.c
index b50afe999..0eaf563e8 100644
--- a/xlators/features/mac-compat/src/mac-compat.c
+++ b/xlators/features/mac-compat/src/mac-compat.c
@@ -15,35 +15,28 @@
#include "xlator.h"
#include "defaults.h"
#include "compat-errno.h"
+#include "syscall.h"
+#include "mem-pool.h"
+#include "mac-compat.h"
-
-enum apple_xattr {
- GF_FINDER_INFO_XATTR,
- GF_RESOURCE_FORK_XATTR,
- GF_XATTR_ALL,
- GF_XATTR_NONE
-};
-
-static char *apple_xattr_name[] = {
- [GF_FINDER_INFO_XATTR] = "com.apple.FinderInfo",
- [GF_RESOURCE_FORK_XATTR] = "com.apple.ResourceFork"
-};
-
-static const char *apple_xattr_value[] = {
- [GF_FINDER_INFO_XATTR] =
- /* 1 2 3 4 5 6 7 8 */
- "\0\0\0\0\0\0\0\0"
- "\0\0\0\0\0\0\0\0"
- "\0\0\0\0\0\0\0\0"
- "\0\0\0\0\0\0\0\0",
- [GF_RESOURCE_FORK_XATTR] = ""
-};
-
-static int32_t apple_xattr_len[] = {
- [GF_FINDER_INFO_XATTR] = 32,
- [GF_RESOURCE_FORK_XATTR] = 1
-};
-
+static int
+dict_key_remove_namespace(dict_t *dict, char *key, data_t *value, void *data)
+{
+ /*
+ char buffer[3*value->len+1];
+ int index = 0;
+ for (index = 0; index < value->len; index++)
+ sprintf(buffer+3*index, " %02x", value->data[index]);
+ */
+ xlator_t *this = (xlator_t *) data;
+ if (strncmp(key, "user.", 5) == 0) {
+ dict_set (dict, key + 5, value);
+ gf_log (this->name, GF_LOG_DEBUG,
+ "remove_namespace_dict: %s -> %s ", key, key + 5);
+ dict_del (dict, key);
+ }
+ return 0;
+}
int32_t
maccomp_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -53,54 +46,91 @@ maccomp_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
intptr_t ax = (intptr_t)this->private;
int i = 0;
+ gf_log (this->name, GF_LOG_DEBUG,
+ "getxattr_cbk: dict %p private: %p xdata %p ", dict,
+ this->private, xdata);
+
+ if (dict) {
+ dict_foreach(dict, dict_key_remove_namespace, this);
+ }
+ else {
+ // TODO: we expect dict to exist here, don't know why this
+ // this is needed
+ dict = dict_new();
+ }
+ gf_log (this->name, GF_LOG_DEBUG,
+ "getxattr_cbk: dict %p ax: %ld op_ret %d op_err %d ", dict, ax,
+ op_ret, op_errno);
if ((ax == GF_XATTR_ALL && op_ret >= 0) || ax != GF_XATTR_NONE) {
op_ret = op_errno = 0;
-
for (i = 0; i < GF_XATTR_ALL; i++) {
if (dict_get (dict, apple_xattr_name[i]))
continue;
-
+ /* set dummy data */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "getxattr_cbk: setting dummy data %p, %s", dict,
+ apple_xattr_name[i]);
if (dict_set (dict, apple_xattr_name[i],
bin_to_data ((void *)apple_xattr_value[i],
apple_xattr_len[i])) == -1) {
op_ret = -1;
- op_errno = ENOMEM;
+ op_errno = ENOATTR;
break;
}
}
}
-
STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata);
-
return 0;
}
-int32_t
-maccomp_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- const char *name, dict_t *xdata)
+static
+int prepend_xattr_user_namespace(dict_t *dict, char *key, data_t *value, void *obj)
{
- intptr_t ax = GF_XATTR_NONE;
- int i = 0;
+ xlator_t *this = (xlator_t *) obj;
+ dict_t *newdict = (dict_t *) this->private;
+ char *newkey = NULL;
+ gf_add_prefix(XATTR_USER_PREFIX, key, &newkey);
+ key = newkey;
+ dict_set(newdict, (char *)key, value);
+ if (newkey)
+ GF_FREE(newkey);
+ return 0;
+}
+intptr_t
+check_name(const char *name, char **newkey)
+{
+ intptr_t ax = GF_XATTR_NONE;
if (name) {
+ int i = 0;
for (i = 0; i < GF_XATTR_ALL; i++) {
if (strcmp (apple_xattr_name[i], name) == 0) {
ax = i;
-
break;
}
}
+ gf_add_prefix("user.", name, newkey);
} else
ax = GF_XATTR_ALL;
+ return ax;
+}
- this->private = (void *)ax;
+int32_t
+maccomp_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ char *newkey = NULL;
+ this->private = (void *) check_name(name, &newkey);
+ gf_log (this->name, GF_LOG_DEBUG,
+ "getxattr: name %s private: %p xdata %p ", name,
+ this->private, xdata);
STACK_WIND (frame, maccomp_getxattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->getxattr,
- loc, name, xdata);
+ loc, newkey, xdata);
return 0;
}
@@ -109,30 +139,17 @@ int32_t
maccomp_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
const char *name, dict_t *xdata)
{
- intptr_t ax = GF_XATTR_NONE;
- int i = 0;
-
- if (name) {
- for (i = 0; i < GF_XATTR_ALL; i++) {
- if (strcmp (apple_xattr_name[i], name) == 0) {
- ax = i;
-
- break;
- }
- }
- } else
- ax = GF_XATTR_ALL;
-
- this->private = (void *)ax;
+ char *newkey = NULL;
+ this->private = (void *) check_name(name, &newkey);
STACK_WIND (frame, maccomp_getxattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fgetxattr,
- fd, name, xdata);
+ fd, newkey, xdata);
+ GF_FREE(newkey);
return 0;
}
-
int32_t
maccomp_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
@@ -141,12 +158,56 @@ maccomp_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == -1 && ax != GF_XATTR_NONE)
op_ret = op_errno = 0;
-
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setxattr_cbk op_ret %d op_errno %d private: %p xdata %p ",
+ op_ret, op_errno, this->private, xdata);
STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+int32_t
+maccomp_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *iatt1,
+ struct iatt *iattr2, dict_t *xdata)
+{
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setattr_cbk op_ret %d op_errno %d private: %p xdata %p ",
+ op_ret, op_errno, this->private, xdata);
+ STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno,
+ iatt1, iattr2, xdata);
return 0;
}
+int map_flags(int flags)
+{
+ /* DARWIN has different defines on XATTR_ flags.
+ There do not seem to be a POSIX standard
+ Parse any other flags over.
+ NOFOLLOW is always true on Linux and Darwin
+ */
+ int linux_flags = flags & ~(GF_XATTR_CREATE | GF_XATTR_REPLACE | XATTR_REPLACE);
+ if (XATTR_CREATE & flags)
+ linux_flags |= GF_XATTR_CREATE;
+ if (XATTR_REPLACE & flags)
+ linux_flags |= GF_XATTR_REPLACE;
+ return linux_flags;
+}
+
+int32_t
+maccomp_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ char *newkey = NULL;
+
+ this->private = (void *) check_name(name, &newkey);
+
+ STACK_WIND (frame, default_fremovexattr_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fremovexattr,
+ fd, newkey, xdata);
+ GF_FREE(newkey);
+ return 0;
+}
int32_t
maccomp_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
@@ -162,16 +223,56 @@ maccomp_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
break;
}
}
+ dict_t *newdict = dict_new();
+ this->private = (void *) newdict;
+ dict_foreach(dict, prepend_xattr_user_namespace, this);
this->private = (void *)ax;
-
+ int linux_flags = map_flags(flags);
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setxattr flags: %d -> %d dict %p private: %p xdata %p ",
+ flags, linux_flags, dict, this->private, xdata);
STACK_WIND (frame, maccomp_setxattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->setxattr,
- loc, dict, flags, xdata);
+ loc, newdict, linux_flags, xdata);
+ dict_unref(newdict);
return 0;
}
+int32_t
+maccomp_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *iattr,
+ int32_t flags, dict_t *xdata)
+{
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setattr iattr %p private: %p xdata %p ",
+ iattr, this->private, xdata);
+ STACK_WIND (frame, maccomp_setattr_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setattr,
+ loc, iattr, flags, xdata);
+ return 0;
+}
+
+int32_t
+maccomp_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ char *newkey = NULL;
+ this->private = (void *) check_name(name, &newkey);
+
+ STACK_WIND (frame, default_removexattr_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr,
+ loc, newkey, xdata);
+
+ gf_log (this->name, GF_LOG_TRACE,
+ "removeattr name %p private: %p xdata %p ",
+ name, this->private, xdata);
+ GF_FREE(newkey);
+ return 0;
+
+}
int32_t
maccomp_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
@@ -188,12 +289,20 @@ maccomp_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
}
}
- this->private = (void *)ax;
+ dict_t *newdict = dict_new();
+ this->private = (void *) newdict;
+ dict_foreach(dict, prepend_xattr_user_namespace, this);
+ this->private = (void *)ax;
+ int linux_flags = map_flags(flags);
+ gf_log (this->name, GF_LOG_DEBUG,
+ "fsetxattr flags: %d -> %d dict %p private: %p xdata %p ",
+ flags, linux_flags, dict, this->private, xdata);
STACK_WIND (frame, maccomp_setxattr_cbk,
FIRST_CHILD(this),
FIRST_CHILD(this)->fops->fsetxattr,
- fd, dict, flags, xdata);
+ fd, newdict, linux_flags, xdata);
+ dict_unref(newdict);
return 0;
}
@@ -224,14 +333,16 @@ fini (xlator_t *this)
struct xlator_fops fops = {
- .getxattr = maccomp_getxattr,
- .fgetxattr = maccomp_fgetxattr,
- .setxattr = maccomp_setxattr,
- .fsetxattr = maccomp_fsetxattr,
+ .getxattr = maccomp_getxattr,
+ .fgetxattr = maccomp_fgetxattr,
+ .setxattr = maccomp_setxattr,
+ .setattr = maccomp_setattr,
+ .fsetxattr = maccomp_fsetxattr,
+ .removexattr = maccomp_removexattr,
+ .fremovexattr = maccomp_fremovexattr,
};
-struct xlator_cbks cbks = {
-};
+struct xlator_cbks cbks;
struct volume_options options[] = {
{ .key = {NULL} },
diff --git a/xlators/features/mac-compat/src/mac-compat.h b/xlators/features/mac-compat/src/mac-compat.h
new file mode 100644
index 000000000..b033ca0e4
--- /dev/null
+++ b/xlators/features/mac-compat/src/mac-compat.h
@@ -0,0 +1,41 @@
+/*
+ Copyright (c) 2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __MAC_COMPAT_H__
+#define __MAC_COMPAT_H__
+
+enum apple_xattr {
+ GF_FINDER_INFO_XATTR,
+ GF_RESOURCE_FORK_XATTR,
+ GF_XATTR_ALL,
+ GF_XATTR_NONE
+};
+
+static char *apple_xattr_name[] = {
+ [GF_FINDER_INFO_XATTR] = "com.apple.FinderInfo",
+ [GF_RESOURCE_FORK_XATTR] = "com.apple.ResourceFork"
+};
+
+static const char *apple_xattr_value[] = {
+ [GF_FINDER_INFO_XATTR] =
+ /* 1 2 3 4 5 6 7 8 */
+ "\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0",
+ [GF_RESOURCE_FORK_XATTR] = ""
+};
+
+static int32_t apple_xattr_len[] = {
+ [GF_FINDER_INFO_XATTR] = 32,
+ [GF_RESOURCE_FORK_XATTR] = 1
+};
+
+#endif /* __MAC_COMPAT_H__ */
diff --git a/xlators/features/marker/Makefile.am b/xlators/features/marker/Makefile.am
index a6ba2de16..a985f42a8 100644
--- a/xlators/features/marker/Makefile.am
+++ b/xlators/features/marker/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = src @SYNCDAEMON_SUBDIR@
+SUBDIRS = src
CLEANFILES =
diff --git a/xlators/features/marker/src/Makefile.am b/xlators/features/marker/src/Makefile.am
index 5c350b60b..a7c676472 100644
--- a/xlators/features/marker/src/Makefile.am
+++ b/xlators/features/marker/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = marker.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
-marker_la_LDFLAGS = -module -avoidversion
+marker_la_LDFLAGS = -module -avoid-version
marker_la_SOURCES = marker.c marker-quota.c marker-quota-helper.c marker-common.c
marker_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/features/marker/src/marker-quota-helper.c b/xlators/features/marker/src/marker-quota-helper.c
index af5fed132..ec0d83316 100644
--- a/xlators/features/marker/src/marker-quota-helper.c
+++ b/xlators/features/marker/src/marker-quota-helper.c
@@ -154,15 +154,17 @@ out:
inode_contribution_t *
-__mq_add_new_contribution_node (xlator_t *this, quota_inode_ctx_t *ctx, loc_t *loc)
+__mq_add_new_contribution_node (xlator_t *this, quota_inode_ctx_t *ctx,
+ loc_t *loc)
{
- int32_t ret = 0;
+ int32_t ret = 0;
inode_contribution_t *contribution = NULL;
if (!loc->parent) {
if (!uuid_is_null (loc->pargfid))
loc->parent = inode_find (loc->inode->table,
loc->pargfid);
+
if (!loc->parent)
loc->parent = inode_parent (loc->inode, loc->pargfid,
loc->name);
@@ -170,9 +172,10 @@ __mq_add_new_contribution_node (xlator_t *this, quota_inode_ctx_t *ctx, loc_t *l
goto out;
}
- list_for_each_entry (contribution, &ctx->contribution_head, contri_list) {
+ list_for_each_entry (contribution, &ctx->contribution_head,
+ contri_list) {
if (loc->parent &&
- uuid_compare (contribution->gfid, loc->parent->gfid) == 0) {
+ uuid_compare (contribution->gfid, loc->parent->gfid) == 0) {
goto out;
}
}
@@ -196,14 +199,16 @@ out:
inode_contribution_t *
-mq_add_new_contribution_node (xlator_t *this, quota_inode_ctx_t *ctx, loc_t *loc)
+mq_add_new_contribution_node (xlator_t *this, quota_inode_ctx_t *ctx,
+ loc_t *loc)
{
inode_contribution_t *contribution = NULL;
if ((ctx == NULL) || (loc == NULL))
return NULL;
- if (strcmp (loc->path, "/") == 0)
+ if (((loc->path) && (strcmp (loc->path, "/") == 0))
+ || (!loc->path && uuid_is_null (loc->pargfid)))
return NULL;
LOCK (&ctx->lock);
@@ -226,12 +231,16 @@ mq_dict_set_contribution (xlator_t *this, dict_t *dict,
GF_VALIDATE_OR_GOTO ("marker", this, out);
GF_VALIDATE_OR_GOTO ("marker", dict, out);
GF_VALIDATE_OR_GOTO ("marker", loc, out);
- GF_VALIDATE_OR_GOTO ("marker", loc->parent, out);
- GET_CONTRI_KEY (contri_key, loc->parent->gfid, ret);
- if (ret < 0) {
- ret = -1;
- goto out;
+ if (loc->parent) {
+ GET_CONTRI_KEY (contri_key, loc->parent->gfid, ret);
+ if (ret < 0) {
+ ret = -1;
+ goto out;
+ }
+ } else {
+ /* nameless lookup, fetch contributions to all parents */
+ GET_CONTRI_KEY (contri_key, NULL, ret);
}
ret = dict_set_int64 (dict, contri_key, 0);
diff --git a/xlators/features/marker/src/marker-quota-helper.h b/xlators/features/marker/src/marker-quota-helper.h
index 6cdd14881..b200413b0 100644
--- a/xlators/features/marker/src/marker-quota-helper.h
+++ b/xlators/features/marker/src/marker-quota-helper.h
@@ -9,7 +9,7 @@
*/
#ifndef _MARKER_QUOTA_HELPER_H
-#define _MARKER_QUOTA_HELPER
+#define _MARKER_QUOTA_HELPER_H
#ifndef _CONFIG_H
#define _CONFIG_H
diff --git a/xlators/features/marker/src/marker-quota.c b/xlators/features/marker/src/marker-quota.c
index 6f9af6e13..1903fdc40 100644
--- a/xlators/features/marker/src/marker-quota.c
+++ b/xlators/features/marker/src/marker-quota.c
@@ -30,7 +30,8 @@ mq_loc_copy (loc_t *dst, loc_t *src)
GF_VALIDATE_OR_GOTO ("marker", src, out);
if (src->inode == NULL ||
- src->path == NULL) {
+ ((src->parent == NULL) && (uuid_is_null (src->pargfid))
+ && !__is_root_gfid (src->inode->gfid))) {
gf_log ("marker", GF_LOG_WARNING,
"src loc is not valid");
goto out;
@@ -364,7 +365,10 @@ mq_update_size_xattr (call_frame_t *frame, void *cookie, xlator_t *this,
local->loc.path, ntoh64 (*delta));
new_dict = dict_new ();
- if (!new_dict);
+ if (!new_dict) {
+ errno = ENOMEM;
+ goto err;
+ }
ret = dict_set_bin (new_dict, QUOTA_SIZE_KEY, delta, 8);
if (ret)
@@ -384,7 +388,6 @@ mq_update_size_xattr (call_frame_t *frame, void *cookie, xlator_t *this,
err:
if (op_ret == -1 || ret == -1) {
local->err = -1;
-
mq_release_lock_on_dirty_inode (frame, NULL, this, 0, 0, NULL);
}
@@ -1038,7 +1041,11 @@ mq_create_xattr (xlator_t *this, call_frame_t *frame)
goto free_size;
}
- if (strcmp (local->loc.path, "/") != 0) {
+ if ((local->loc.path && strcmp (local->loc.path, "/") != 0)
+ || (local->loc.inode && !uuid_is_null (local->loc.inode->gfid) &&
+ !__is_root_gfid (local->loc.inode->gfid))
+ || (!uuid_is_null (local->loc.gfid)
+ && !__is_root_gfid (local->loc.gfid))) {
contri = mq_add_new_contribution_node (this, ctx, &local->loc);
if (contri == NULL)
goto err;
@@ -1107,7 +1114,12 @@ mq_check_n_set_inode_xattr (call_frame_t *frame, void *cookie,
goto create_xattr;
//check contribution xattr if not root
- if (strcmp (local->loc.path, "/") != 0) {
+ if ((local->loc.path && strcmp (local->loc.path, "/") != 0)
+ || (!uuid_is_null (local->loc.gfid)
+ && !__is_root_gfid (local->loc.gfid))
+ || (local->loc.inode
+ && !uuid_is_null (local->loc.inode->gfid)
+ && !__is_root_gfid (local->loc.inode->gfid))) {
GET_CONTRI_KEY (contri_key, local->loc.parent->gfid, ret);
if (ret < 0)
goto out;
@@ -1234,6 +1246,7 @@ mq_get_parent_inode_local (xlator_t *this, quota_local_t *local)
{
int32_t ret = -1;
quota_inode_ctx_t *ctx = NULL;
+ inode_contribution_t *contribution = NULL;
GF_VALIDATE_OR_GOTO ("marker", this, out);
GF_VALIDATE_OR_GOTO ("marker", local, out);
@@ -1263,7 +1276,7 @@ mq_get_parent_inode_local (xlator_t *this, quota_local_t *local)
ret = mq_inode_ctx_get (local->loc.inode, this, &ctx);
if (ret < 0) {
gf_log_callingfn (this->name, GF_LOG_WARNING,
- "inode ctx get failed");
+ "inode ctx get failed");
goto out;
}
@@ -1277,7 +1290,31 @@ mq_get_parent_inode_local (xlator_t *this, quota_local_t *local)
goto out;
}
- local->contri = (inode_contribution_t *) ctx->contribution_head.next;
+ /* Earlier we used to get the next entry in the list maintained
+ by the context. In a good situation it works. i.e the next
+ parent in the directory hierarchy for this path is obtained.
+
+ But consider the below situation:
+ mount-point: /mnt/point
+ quota enabled directories within mount point: /a, /b, /c
+
+ Now when some file (file1) in the directory /c is written some data,
+ then to update the directories, marker has to get the contribution
+ object for the parent inode, i.e /c.
+ Beefore, it was being done by
+ local->contri = (inode_contribution_t *) ctx->contribution_head.next;
+ It works in the normal situations. But suppose /c is moved to /b.
+ Now /b's contribution object is added to the end of the list of
+ parents that the file file1 within /b/c is maintaining. Now if
+ the file /b/c/file1 is copied to /b/c/new, to update the parent in
+ the order c, b and / we cannot go to the next element in the list,
+ as in this case the next contribution object would be / and /b's
+ contribution will be at the end of the list. So get the proper
+ parent's contribution, by searching the entire list.
+ */
+ contribution = mq_get_contribution_node (local->loc.parent, ctx);
+ GF_ASSERT (contribution != NULL);
+ local->contri = contribution;
ret = 0;
out:
@@ -1521,9 +1558,10 @@ mq_update_parent_size (call_frame_t *frame,
}
UNLOCK (&local->contri->lock);
- gf_log (this->name, GF_LOG_DEBUG, "%s %"PRId64 "%"PRId64,
- local->loc.path, local->ctx->size,
- local->contri->contribution);
+ gf_log_callingfn (this->name, GF_LOG_DEBUG, "path: %s size: %"PRId64
+ " contribution:%"PRId64,
+ local->loc.path, local->ctx->size,
+ local->contri->contribution);
if (dict == NULL) {
op_errno = EINVAL;
@@ -1651,7 +1689,7 @@ unlock:
}
UNLOCK (&contribution->lock);
- gf_log (this->name, GF_LOG_DEBUG, "%s %"PRId64 "%"PRId64,
+ gf_log (this->name, GF_LOG_DEBUG, "%s %"PRId64 " %"PRId64,
local->loc.path, size_int, contri_int);
local->delta = size_int - contri_int;
@@ -1730,7 +1768,8 @@ mq_fetch_child_size_and_contri (call_frame_t *frame, void *cookie,
VALIDATE_OR_GOTO (local->ctx, err);
VALIDATE_OR_GOTO (local->contri, err);
- gf_log (this->name, GF_LOG_DEBUG, "%s marked dirty", local->parent_loc.path);
+ gf_log (this->name, GF_LOG_DEBUG, "%s marked dirty",
+ local->parent_loc.path);
//update parent ctx
ret = mq_inode_ctx_get (local->parent_loc.inode, this, &ctx);
@@ -1901,15 +1940,18 @@ fr_destroy:
return -1;
}
-
int
-mq_start_quota_txn (xlator_t *this, loc_t *loc,
- quota_inode_ctx_t *ctx,
- inode_contribution_t *contri)
+mq_prepare_txn_frame (xlator_t *this, loc_t *loc,
+ quota_inode_ctx_t *ctx,
+ inode_contribution_t *contri,
+ call_frame_t **new_frame)
{
- int32_t ret = -1;
- call_frame_t *frame = NULL;
- quota_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ int ret = -1;
+ quota_local_t *local = NULL;
+
+ if (!this || !loc || !new_frame)
+ goto err;
frame = create_frame (this, this->ctx->pool);
if (frame == NULL)
@@ -1935,14 +1977,36 @@ mq_start_quota_txn (xlator_t *this, loc_t *loc,
local->ctx = ctx;
local->contri = contri;
+ ret = 0;
+ *new_frame = frame;
+
+ return ret;
+
+fr_destroy:
+ QUOTA_STACK_DESTROY (frame, this);
+err:
+ return ret;
+}
+
+int
+mq_start_quota_txn (xlator_t *this, loc_t *loc,
+ quota_inode_ctx_t *ctx,
+ inode_contribution_t *contri)
+{
+ int32_t ret = -1;
+ call_frame_t *frame = NULL;
+
+ ret = mq_prepare_txn_frame (this, loc, ctx,
+ contri, &frame);
+ if (ret)
+ goto err;
+
ret = mq_get_lock_on_parent (frame, this);
if (ret == -1)
goto err;
return 0;
-fr_destroy:
- QUOTA_STACK_DESTROY (frame, this);
err:
mq_set_ctx_updation_status (ctx, _gf_false);
@@ -1970,11 +2034,46 @@ mq_initiate_quota_txn (xlator_t *this, loc_t *loc)
goto out;
}
+ /* Create the contribution node if its absent. Is it right to
+ assume that if the contribution node is not there, then
+ create one and proceed instead of returning?
+ Reason for this assumption is for hard links. Suppose
+ hard link for a file f1 present in a directory d1 is
+ created in the directory d2 (as f2). Now, since d2's
+ contribution is not there in f1's inode ctx, d2's
+ contribution xattr wont be created and will create problems
+ for quota operations.
+ */
contribution = mq_get_contribution_node (loc->parent, ctx);
- if (contribution == NULL)
- goto out;
+ if (!contribution) {
+ if ((loc->path && strcmp (loc->path, "/"))
+ || (!uuid_is_null (loc->gfid)
+ && !__is_root_gfid (loc->gfid))
+ || (loc->inode && !uuid_is_null (loc->inode->gfid)
+ && !__is_root_gfid (loc->inode->gfid)))
+ gf_log_callingfn (this->name, GF_LOG_TRACE,
+ "contribution node for the "
+ "path (%s) with parent (%s) "
+ "not found", loc->path,
+ loc->parent?
+ uuid_utoa (loc->parent->gfid):
+ NULL);
+
+ contribution = mq_add_new_contribution_node (this, ctx, loc);
+ if (!contribution) {
+ if(loc->path && strcmp (loc->path, "/"))
+ gf_log_callingfn (this->name, GF_LOG_WARNING,
+ "could not allocate "
+ " contribution node for (%s) "
+ "parent: (%s)", loc->path,
+ loc->parent?
+ uuid_utoa (loc->parent->gfid):
+ NULL);
+ goto out;
+ }
+ }
- /* To improve performance, donot start another transaction
+ /* To improve performance, do not start another transaction
* if one is already in progress for same inode
*/
status = _gf_true;
@@ -1993,16 +2092,7 @@ out:
}
-/* int32_t */
-/* validate_inode_size_contribution (xlator_t *this, loc_t *loc, int64_t size, */
-/* int64_t contribution) */
-/* { */
-/* if (size != contribution) { */
-/* mq_initiate_quota_txn (this, loc); */
-/* } */
-/* return 0; */
-/* } */
int32_t
@@ -2031,12 +2121,13 @@ mq_inspect_directory_xattr (xlator_t *this,
}
}
- if (strcmp (loc->path, "/") != 0) {
+ if (!loc->path || (loc->path && strcmp (loc->path, "/") != 0)) {
contribution = mq_add_new_contribution_node (this, ctx, loc);
if (contribution == NULL) {
if (!uuid_is_null (loc->inode->gfid))
- gf_log (this->name, GF_LOG_WARNING,
- "cannot add a new contribution node");
+ gf_log (this->name, GF_LOG_DEBUG,
+ "cannot add a new contribution node "
+ "(%s)", uuid_utoa (loc->inode->gfid));
ret = -1;
goto err;
}
@@ -2050,7 +2141,10 @@ mq_inspect_directory_xattr (xlator_t *this,
if (ret < 0)
goto out;
- if (strcmp (loc->path, "/") != 0) {
+ if ((loc->path && strcmp (loc->path, "/") != 0)
+ || (!uuid_is_null (loc->gfid) && !__is_root_gfid (loc->gfid))
+ || (loc->inode && !uuid_is_null (loc->inode->gfid) &&
+ !__is_root_gfid (loc->inode->gfid))) {
not_root = _gf_true;
GET_CONTRI_KEY (contri_key, contribution->gfid, ret);
@@ -2122,8 +2216,11 @@ mq_inspect_file_xattr (xlator_t *this,
}
contribution = mq_add_new_contribution_node (this, ctx, loc);
- if (contribution == NULL)
+ if (contribution == NULL) {
+ gf_log_callingfn (this->name, GF_LOG_DEBUG, "cannot allocate "
+ "contribution node (path:%s)", loc->path);
goto out;
+ }
LOCK (&ctx->lock);
{
@@ -2155,8 +2252,12 @@ mq_inspect_file_xattr (xlator_t *this,
if (size != contri_int) {
mq_initiate_quota_txn (this, loc);
}
- } else
- mq_initiate_quota_txn (this, loc);
+ } else {
+ if (size)
+ mq_initiate_quota_txn (this, loc);
+ else
+ mq_set_inode_xattr (this, loc);
+ }
}
out:
@@ -2169,8 +2270,8 @@ mq_xattr_state (xlator_t *this,
dict_t *dict,
struct iatt buf)
{
- if (buf.ia_type == IA_IFREG ||
- buf.ia_type == IA_IFLNK) {
+ if (((buf.ia_type == IA_IFREG) && !dht_is_linkfile (&buf, dict))
+ || (buf.ia_type == IA_IFLNK)) {
mq_inspect_file_xattr (this, loc, dict, buf);
} else if (buf.ia_type == IA_IFDIR)
mq_inspect_directory_xattr (this, loc, dict, buf);
@@ -2192,7 +2293,7 @@ mq_req_xattr (xlator_t *this,
goto set_size;
//if not "/" then request contribution
- if (strcmp (loc->path, "/") == 0)
+ if (loc->path && strcmp (loc->path, "/") == 0)
goto set_size;
ret = mq_dict_set_contribution (this, dict, loc);
@@ -2235,6 +2336,12 @@ _mq_inode_remove_done (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t ret = 0;
char contri_key [512] = {0, };
quota_local_t *local = NULL;
+ inode_t *inode = NULL;
+ dentry_t *tmp = NULL;
+ gf_boolean_t last_dentry = _gf_true;
+ loc_t loc = {0, };
+ dentry_t *other_dentry = NULL;
+ gf_boolean_t remove = _gf_false;
local = (quota_local_t *) frame->local;
@@ -2245,17 +2352,84 @@ _mq_inode_remove_done (call_frame_t *frame, void *cookie, xlator_t *this,
frame->local = NULL;
- if (local->hl_count > 1) {
- GET_CONTRI_KEY (contri_key, local->contri->gfid, ret);
+ GET_CONTRI_KEY (contri_key, local->contri->gfid, ret);
- STACK_WIND (frame, mq_removexattr_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->removexattr,
- &local->loc, contri_key, NULL);
- ret = 0;
- } else {
- mq_removexattr_cbk (frame, NULL, this, 0, 0, NULL);
+ if (!local->loc.inode)
+ inode = inode_grep (local->loc.parent->table, local->loc.parent,
+ local->loc.name);
+ else
+ inode = inode_ref (local->loc.inode);
+
+ /* Suppose there are 2 directories dir1 and dir2. Quota limit is set on
+ both the directories. There is a file (f1) in dir1. A hark link is
+ created for that file inside the directory dir2 (say f2). Now one
+ more xattr is set in the inode as a new hard link is created in a
+ separate directory.
+ i.e trusted.glusterfs.quota.<gfid of dir2>.contri=<contribution>
+
+ Now when the hardlink f2 is removed, then the new xattr added (i.e
+ the xattr indicating its contribution to ITS parent directory) should
+ be removed (IFF there is not another hardlink for that file in the
+ same directory).
+
+ To do that upon getting unlink first check whether any other hard
+ links for the same inode exists in the same directory. If so do not
+ do anything and proceed for quota transaction.
+ Otherwise, if the removed entry was the only link for that inode
+ within that directory, then get another dentry for the inode
+ (by traversing the list of dentries for the inode) and using the
+ the dentry's parent and name, send removexattr so that the xattr
+ is removed.
+
+ If it is not done, then if the volume is restarted or the brick
+ process is restarted, then wrong quota usage will be shown for the
+ directory dir2.
+ */
+ if (inode) {
+ tmp = NULL;
+ list_for_each_entry (tmp, &inode->dentry_list, inode_list) {
+ if (local->loc.parent == tmp->parent) {
+ if (strcmp (local->loc.name, local->loc.name)) {
+ last_dentry = _gf_false;
+ break;
+ }
+ }
+ }
+ remove = last_dentry;
}
+ if (remove) {
+ if (!other_dentry) {
+ list_for_each_entry (tmp, &inode->dentry_list,
+ inode_list) {
+ if (local->loc.parent != tmp->parent) {
+ other_dentry = tmp;
+ break;
+ }
+ }
+ }
+
+ if (!other_dentry)
+ mq_removexattr_cbk (frame, NULL, this, 0, 0, NULL);
+ else {
+ loc.parent = inode_ref (other_dentry->parent);
+ loc.name = gf_strdup (other_dentry->name);
+ uuid_copy (loc.pargfid , other_dentry->parent->gfid);
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
+ inode_path (other_dentry->parent, other_dentry->name,
+ (char **)&loc.path);
+
+ STACK_WIND (frame, mq_removexattr_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr,
+ &loc, contri_key, NULL);
+ }
+ } else
+ mq_removexattr_cbk (frame, NULL, this, 0, 0, NULL);
+
+ ret = 0;
+
if (strcmp (local->parent_loc.path, "/") != 0) {
ret = mq_get_parent_inode_local (this, local);
if (ret < 0)
@@ -2266,6 +2440,8 @@ _mq_inode_remove_done (call_frame_t *frame, void *cookie, xlator_t *this,
out:
mq_local_unref (this, local);
+ loc_wipe (&loc);
+ inode_unref (inode);
return 0;
}
@@ -2392,8 +2568,11 @@ mq_reduce_parent_size (xlator_t *this, loc_t *loc, int64_t contri)
goto out;
contribution = mq_get_contribution_node (loc->parent, ctx);
- if (contribution == NULL)
+ if (contribution == NULL) {
+ gf_log_callingfn (this->name, GF_LOG_WARNING, "contribution for"
+ " the node %s is NULL", loc->path);
goto out;
+ }
local = mq_local_new ();
if (local == NULL) {
@@ -2412,6 +2591,8 @@ mq_reduce_parent_size (xlator_t *this, loc_t *loc, int64_t contri)
}
if (local->size == 0) {
+ gf_log_callingfn (this->name, GF_LOG_TRACE,
+ "local->size is 0 " "path: (%s)", loc->path);
ret = 0;
goto out;
}
@@ -2424,8 +2605,12 @@ mq_reduce_parent_size (xlator_t *this, loc_t *loc, int64_t contri)
local->contri = contribution;
ret = mq_inode_loc_fill (NULL, loc->parent, &local->parent_loc);
- if (ret < 0)
+ if (ret < 0) {
+ gf_log_callingfn (this->name, GF_LOG_INFO, "building parent loc"
+ " failed. (gfid: %s)",
+ uuid_utoa (loc->parent->gfid));
goto out;
+ }
frame = create_frame (this, this->ctx->pool);
if (!frame) {
diff --git a/xlators/features/marker/src/marker-quota.h b/xlators/features/marker/src/marker-quota.h
index 385760ac4..42def9d22 100644
--- a/xlators/features/marker/src/marker-quota.h
+++ b/xlators/features/marker/src/marker-quota.h
@@ -42,8 +42,6 @@
var = GF_CALLOC (sizeof (type), 1, \
gf_marker_mt_##type); \
if (!var) { \
- gf_log ("", GF_LOG_ERROR, \
- "out of memory"); \
ret = -1; \
} \
} while (0);
@@ -61,13 +59,20 @@
ret = 0; \
} while (0);
-#define GET_CONTRI_KEY(var, _gfid, _ret) \
- do { \
- char _gfid_unparsed[40]; \
- uuid_unparse (_gfid, _gfid_unparsed); \
- _ret = snprintf (var, CONTRI_KEY_MAX, QUOTA_XATTR_PREFIX \
- ".%s.%s." CONTRIBUTION, "quota", \
- _gfid_unparsed); \
+#define GET_CONTRI_KEY(var, _gfid, _ret) \
+ do { \
+ if (_gfid != NULL) { \
+ char _gfid_unparsed[40]; \
+ uuid_unparse (_gfid, _gfid_unparsed); \
+ _ret = snprintf (var, CONTRI_KEY_MAX, \
+ QUOTA_XATTR_PREFIX \
+ ".%s.%s." CONTRIBUTION, "quota", \
+ _gfid_unparsed); \
+ } else { \
+ _ret = snprintf (var, CONTRI_KEY_MAX, \
+ QUOTA_XATTR_PREFIX \
+ ".%s.." CONTRIBUTION, "quota"); \
+ } \
} while (0);
#define QUOTA_SAFE_INCREMENT(lock, var) \
diff --git a/xlators/features/marker/src/marker.c b/xlators/features/marker/src/marker.c
index 6c68167d8..adcf3d8e7 100644
--- a/xlators/features/marker/src/marker.c
+++ b/xlators/features/marker/src/marker.c
@@ -21,6 +21,7 @@
#include "marker-quota-helper.h"
#include "marker-common.h"
#include "byte-order.h"
+#include "syncop.h"
#define _GF_UID_GID_CHANGED 1
@@ -146,12 +147,21 @@ out:
}
int32_t
-marker_error_handler (xlator_t *this)
+marker_error_handler (xlator_t *this, marker_local_t *local, int32_t op_errno)
{
- marker_conf_t *priv = NULL;
+ marker_conf_t *priv = NULL;
+ const char *path = NULL;
- priv = (marker_conf_t *) this->private;
+ priv = (marker_conf_t *) this->private;
+ path = local
+ ? (local->loc.path
+ ? local->loc.path : uuid_utoa(local->loc.gfid))
+ : "<nul>";
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Indexing gone corrupt at %s (reason: %s)."
+ " Geo-replication slave content needs to be revalidated",
+ path, strerror (op_errno));
unlink (priv->timestamp_file);
return 0;
@@ -176,6 +186,8 @@ marker_local_unref (marker_local_t *local)
loc_wipe (&local->loc);
loc_wipe (&local->parent_loc);
+ if (local->xdata)
+ dict_unref (local->xdata);
if (local->oplocal) {
marker_local_unref (local->oplocal);
@@ -243,18 +255,18 @@ out:
return 0;
}
-int32_t
+gf_boolean_t
call_from_special_client (call_frame_t *frame, xlator_t *this, const char *name)
{
struct volume_mark *vol_mark = NULL;
marker_conf_t *priv = NULL;
- gf_boolean_t ret = _gf_true;
+ gf_boolean_t is_true = _gf_true;
priv = (marker_conf_t *)this->private;
if (frame->root->pid != GF_CLIENT_PID_GSYNCD || name == NULL ||
strcmp (name, MARKER_XATTR_PREFIX "." VOLUME_MARK) != 0) {
- ret = _gf_false;
+ is_true = _gf_false;
goto out;
}
@@ -262,7 +274,7 @@ call_from_special_client (call_frame_t *frame, xlator_t *this, const char *name)
marker_getxattr_stampfile_cbk (frame, this, name, vol_mark, NULL);
out:
- return ret;
+ return is_true;
}
int32_t
@@ -270,15 +282,65 @@ marker_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict,
dict_t *xdata)
{
+ int ret = 0;
+ char *src = NULL;
+ char *dst = NULL;
+ int len = 0;
+ marker_local_t *local = NULL;
+
+ local = frame->local;
+
+
if (cookie) {
gf_log (this->name, GF_LOG_DEBUG,
"Filtering the quota extended attributes");
- dict_foreach_fnmatch (dict, "trusted.glusterfs.quota*",
- marker_filter_quota_xattr, NULL);
+ /* If the getxattr is from a non special client, then do not
+ copy the quota related xattrs (except the quota limit key
+ i.e trusted.glusterfs.quota.limit-set which has been set by
+ glusterd on the directory on which quota limit is set.) for
+ directories. Let the healing of xattrs happen upon lookup.
+ NOTE: setting of trusted.glusterfs.quota.limit-set as of now
+ happens from glusterd. It should be moved to quotad. Also
+ trusted.glusterfs.quota.limit-set is set on directory which
+ is permanent till quota is removed on that directory or limit
+ is changed. So let that xattr be healed by other xlators
+ properly whenever directory healing is done.
+ */
+ ret = dict_get_ptr_and_len (dict, QUOTA_LIMIT_KEY,
+ (void **)&src, &len);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "dict_get on %s "
+ "failed", QUOTA_LIMIT_KEY);
+ } else {
+ dst = GF_CALLOC (len, sizeof (char), gf_common_mt_char);
+ if (dst)
+ memcpy (dst, src, len);
+ }
+
+ /*
+ * Except limit-set xattr, rest of the xattrs are maintained
+ * by quota xlator. Don't expose them to other xlators.
+ * This filter makes sure quota xattrs are not healed as part of
+ * metadata self-heal
+ */
+ GF_REMOVE_INTERNAL_XATTR ("trusted.glusterfs.quota*", dict);
+ if (!ret && IA_ISDIR (local->loc.inode->ia_type) && dst) {
+ ret = dict_set_dynptr (dict, QUOTA_LIMIT_KEY,
+ dst, len);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "setting "
+ "key %s failed", QUOTA_LIMIT_KEY);
+ else
+ dst = NULL;
+ }
}
+ GF_FREE (dst);
+
+ frame->local = NULL;
STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata);
+ marker_local_unref (local);
return 0;
}
@@ -286,20 +348,29 @@ int32_t
marker_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
const char *name, dict_t *xdata)
{
- gf_boolean_t ret = _gf_false;
- marker_conf_t *priv = NULL;
- unsigned long cookie = 0;
+ gf_boolean_t is_true = _gf_false;
+ marker_conf_t *priv = NULL;
+ unsigned long cookie = 0;
+ marker_local_t *local = NULL;
priv = this->private;
- if (priv == NULL || (priv->feature_enabled & GF_XTIME) == 0)
- goto wind;
+ frame->local = mem_get0 (this->local_pool);
+ local = frame->local;
+ if (local == NULL)
+ goto out;
+
+ MARKER_INIT_LOCAL (frame, local);
+
+ if ((loc_copy (&local->loc, loc)) < 0)
+ goto out;
gf_log (this->name, GF_LOG_DEBUG, "USER:PID = %d", frame->root->pid);
- ret = call_from_special_client (frame, this, name);
-wind:
- if (ret == _gf_false) {
+ if (priv && priv->feature_enabled & GF_XTIME)
+ is_true = call_from_special_client (frame, this, name);
+
+ if (is_true == _gf_false) {
if (name == NULL) {
/* Signifies that marker translator
* has to filter the quota's xattr's,
@@ -308,13 +379,19 @@ wind:
*/
cookie = 1;
}
- STACK_WIND_COOKIE (frame, marker_getxattr_cbk, (void *)cookie,
+ STACK_WIND_COOKIE (frame, marker_getxattr_cbk,
+ (void *)cookie,
FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->getxattr, loc,
- name, xdata);
+ FIRST_CHILD(this)->fops->getxattr,
+ loc, name, xdata);
}
return 0;
+out:
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (getxattr, frame, -1, ENOMEM, NULL, NULL);
+ marker_local_unref (local);
+ return 0;
}
@@ -345,7 +422,7 @@ marker_specific_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = (marker_local_t*) frame->local;
if (op_ret == -1 && op_errno == ENOSPC) {
- marker_error_handler (this);
+ marker_error_handler (this, local, op_errno);
done = 1;
goto out;
}
@@ -457,11 +534,16 @@ marker_create_frame (xlator_t *this, marker_local_t *local)
int32_t
marker_xtime_update_marks (xlator_t *this, marker_local_t *local)
{
+ marker_conf_t *priv = NULL;
+
GF_VALIDATE_OR_GOTO ("marker", this, out);
GF_VALIDATE_OR_GOTO (this->name, local, out);
- if ((local->pid == GF_CLIENT_PID_GSYNCD) ||
- (local->pid == GF_CLIENT_PID_DEFRAG))
+ priv = this->private;
+
+ if ((local->pid == GF_CLIENT_PID_GSYNCD
+ && !(priv->feature_enabled & GF_XTIME_GSYNC_FORCE))
+ || (local->pid == GF_CLIENT_PID_DEFRAG))
goto out;
marker_gettimeofday (local);
@@ -791,8 +873,10 @@ marker_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
priv = this->private;
- if ((priv->feature_enabled & GF_QUOTA) && (local->ia_nlink == 1))
- mq_reduce_parent_size (this, &local->loc, -1);
+ if (priv->feature_enabled & GF_QUOTA) {
+ if (!local->skip_txn)
+ mq_reduce_parent_size (this, &local->loc, -1);
+ }
if (priv->feature_enabled & GF_XTIME)
marker_xtime_update_marks (this, local);
@@ -804,37 +888,6 @@ out:
int32_t
-marker_unlink_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- dict_t *xdata)
-{
- marker_local_t *local = NULL;
-
- local = frame->local;
- if (op_ret < 0) {
- goto err;
- }
-
- if (local == NULL) {
- op_errno = EINVAL;
- goto err;
- }
-
- local->ia_nlink = buf->ia_nlink;
-
- STACK_WIND (frame, marker_unlink_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag,
- NULL);
- return 0;
-err:
- frame->local = NULL;
- STACK_UNWIND_STRICT (unlink, frame, -1, op_errno, NULL, NULL, NULL);
- marker_local_unref (local);
- return 0;
-}
-
-
-int32_t
marker_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
dict_t *xdata)
{
@@ -849,6 +902,8 @@ marker_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
local = mem_get0 (this->local_pool);
local->xflag = xflag;
+ if (xdata)
+ local->xdata = dict_ref (xdata);
MARKER_INIT_LOCAL (frame, local);
ret = loc_copy (&local->loc, loc);
@@ -856,12 +911,10 @@ marker_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
if (ret == -1)
goto err;
- if (uuid_is_null (loc->gfid) && loc->inode)
- uuid_copy (loc->gfid, loc->inode->gfid);
-
- STACK_WIND (frame, marker_unlink_stat_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->stat, loc, xdata);
- return 0;
+ if (xdata && dict_get (xdata, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY)) {
+ local->skip_txn = 1;
+ goto unlink_wind;
+ }
unlink_wind:
STACK_WIND (frame, marker_unlink_cbk, FIRST_CHILD(this),
@@ -901,8 +954,11 @@ marker_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
priv = this->private;
- if (priv->feature_enabled & GF_QUOTA)
- mq_initiate_quota_txn (this, &local->loc);
+ if (priv->feature_enabled & GF_QUOTA) {
+ if (!local->skip_txn)
+ mq_set_inode_xattr (this, &local->loc);
+ }
+
if (priv->feature_enabled & GF_XTIME)
marker_xtime_update_marks (this, local);
@@ -933,6 +989,9 @@ marker_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
if (ret == -1)
goto err;
+
+ if (xdata && dict_get (xdata, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY))
+ local->skip_txn = 1;
wind:
STACK_WIND (frame, marker_link_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
@@ -962,7 +1021,7 @@ marker_rename_done (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret < 0) {
if (local->err == 0) {
- local->err = op_errno;
+ local->err = op_errno ? op_errno : EINVAL;
}
gf_log (this->name, GF_LOG_WARNING,
@@ -978,6 +1037,11 @@ marker_rename_done (call_frame_t *frame, void *cookie, xlator_t *this,
} else if (local->err != 0) {
STACK_UNWIND_STRICT (rename, frame, -1, local->err, NULL, NULL,
NULL, NULL, NULL, NULL);
+ } else {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "continuation stub to unwind the call is absent, hence "
+ "call will be hung (call-stack id = %"PRIu64")",
+ frame->root->unique);
}
mq_reduce_parent_size (this, &oplocal->loc, oplocal->contribution);
@@ -993,7 +1057,7 @@ marker_rename_done (call_frame_t *frame, void *cookie, xlator_t *this,
newloc.name++;
newloc.parent = inode_ref (local->loc.parent);
- mq_rename_update_newpath (this, &newloc);
+ mq_set_inode_xattr (this, &newloc);
loc_wipe (&newloc);
@@ -1023,7 +1087,7 @@ marker_rename_release_newp_lock (call_frame_t *frame, void *cookie,
if (op_ret < 0) {
if (local->err == 0) {
- local->err = op_errno;
+ local->err = op_errno ? op_errno : EINVAL;
}
gf_log (this->name, GF_LOG_WARNING,
@@ -1212,7 +1276,7 @@ marker_do_rename (call_frame_t *frame, void *cookie, xlator_t *this,
MARKER_RESET_UID_GID (frame, frame->root, local);
if ((op_ret < 0) && (op_errno != ENOATTR)) {
- local->err = op_errno;
+ local->err = op_errno ? op_errno : EINVAL;
gf_log (this->name, GF_LOG_WARNING,
"fetching contribution values from %s (gfid:%s) "
"failed (%s)", local->loc.path,
@@ -1224,7 +1288,7 @@ marker_do_rename (call_frame_t *frame, void *cookie, xlator_t *this,
if (local->loc.inode != NULL) {
GET_CONTRI_KEY (contri_key, local->loc.parent->gfid, ret);
if (ret < 0) {
- local->err = errno;
+ local->err = errno ? errno : ENOMEM;
goto err;
}
@@ -1264,7 +1328,7 @@ marker_get_newpath_contribution (call_frame_t *frame, void *cookie,
MARKER_RESET_UID_GID (frame, frame->root, local);
if ((op_ret < 0) && (op_errno != ENOATTR)) {
- local->err = op_errno;
+ local->err = op_errno ? op_errno : EINVAL;
gf_log (this->name, GF_LOG_WARNING,
"fetching contribution values from %s (gfid:%s) "
"failed (%s)", oplocal->loc.path,
@@ -1275,7 +1339,7 @@ marker_get_newpath_contribution (call_frame_t *frame, void *cookie,
GET_CONTRI_KEY (contri_key, oplocal->loc.parent->gfid, ret);
if (ret < 0) {
- local->err = errno;
+ local->err = errno ? errno : ENOMEM;
goto err;
}
@@ -1285,7 +1349,7 @@ marker_get_newpath_contribution (call_frame_t *frame, void *cookie,
if (local->loc.inode != NULL) {
GET_CONTRI_KEY (contri_key, local->loc.parent->gfid, ret);
if (ret < 0) {
- local->err = errno;
+ local->err = errno ? errno : ENOMEM;
goto err;
}
@@ -1326,7 +1390,7 @@ marker_get_oldpath_contribution (call_frame_t *frame, void *cookie,
oplocal = local->oplocal;
if (op_ret < 0) {
- local->err = op_errno;
+ local->err = op_errno ? op_errno : EINVAL;
gf_log (this->name, GF_LOG_WARNING,
"cannot hold inodelk on %s (gfid:%s) (%s)",
local->next_lock_on->path,
@@ -1337,7 +1401,7 @@ marker_get_oldpath_contribution (call_frame_t *frame, void *cookie,
GET_CONTRI_KEY (contri_key, oplocal->loc.parent->gfid, ret);
if (ret < 0) {
- local->err = errno;
+ local->err = errno ? errno : ENOMEM;
goto quota_err;
}
@@ -1393,7 +1457,7 @@ marker_rename_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
loc = &local->parent_loc;
}
- local->err = op_errno;
+ local->err = op_errno ? op_errno : EINVAL;
gf_log (this->name, GF_LOG_WARNING,
"cannot hold inodelk on %s (gfid:%s) (%s)",
loc->path, uuid_utoa (loc->inode->gfid),
@@ -1799,6 +1863,210 @@ err:
}
+int32_t
+marker_fallocate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ marker_local_t *local = NULL;
+ marker_conf_t *priv = NULL;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_TRACE, "%s occurred while "
+ "fallocating a file ", strerror (op_errno));
+ }
+
+ local = (marker_local_t *) frame->local;
+
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT (fallocate, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+
+ if (op_ret == -1 || local == NULL)
+ goto out;
+
+ priv = this->private;
+
+ if (priv->feature_enabled & GF_QUOTA)
+ mq_initiate_quota_txn (this, &local->loc);
+
+ if (priv->feature_enabled & GF_XTIME)
+ marker_xtime_update_marks (this, local);
+out:
+ marker_local_unref (local);
+
+ return 0;
+}
+
+int32_t
+marker_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ int32_t ret = 0;
+ marker_local_t *local = NULL;
+ marker_conf_t *priv = NULL;
+
+ priv = this->private;
+
+ if (priv->feature_enabled == 0)
+ goto wind;
+
+ local = mem_get0 (this->local_pool);
+
+ MARKER_INIT_LOCAL (frame, local);
+
+ ret = marker_inode_loc_fill (fd->inode, &local->loc);
+
+ if (ret == -1)
+ goto err;
+wind:
+ STACK_WIND (frame, marker_fallocate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+ xdata);
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fallocate, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+int32_t
+marker_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ marker_local_t *local = NULL;
+ marker_conf_t *priv = NULL;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_TRACE, "%s occurred during discard",
+ strerror (op_errno));
+ }
+
+ local = (marker_local_t *) frame->local;
+
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+
+ if (op_ret == -1 || local == NULL)
+ goto out;
+
+ priv = this->private;
+
+ if (priv->feature_enabled & GF_QUOTA)
+ mq_initiate_quota_txn (this, &local->loc);
+
+ if (priv->feature_enabled & GF_XTIME)
+ marker_xtime_update_marks (this, local);
+out:
+ marker_local_unref (local);
+
+ return 0;
+}
+
+int32_t
+marker_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ int32_t ret = 0;
+ marker_local_t *local = NULL;
+ marker_conf_t *priv = NULL;
+
+ priv = this->private;
+
+ if (priv->feature_enabled == 0)
+ goto wind;
+
+ local = mem_get0 (this->local_pool);
+
+ MARKER_INIT_LOCAL (frame, local);
+
+ ret = marker_inode_loc_fill (fd->inode, &local->loc);
+
+ if (ret == -1)
+ goto err;
+wind:
+ STACK_WIND (frame, marker_discard_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+ return 0;
+err:
+ STACK_UNWIND_STRICT (discard, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int32_t
+marker_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ marker_local_t *local = NULL;
+ marker_conf_t *priv = NULL;
+
+ if (op_ret == -1) {
+ gf_log (this->name, GF_LOG_TRACE, "%s occurred during zerofill",
+ strerror (op_errno));
+ }
+
+ local = (marker_local_t *) frame->local;
+
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT (zerofill, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+
+ if (op_ret == -1 || local == NULL)
+ goto out;
+
+ priv = this->private;
+
+ if (priv->feature_enabled & GF_QUOTA)
+ mq_initiate_quota_txn (this, &local->loc);
+
+ if (priv->feature_enabled & GF_XTIME)
+ marker_xtime_update_marks (this, local);
+out:
+ marker_local_unref (local);
+
+ return 0;
+}
+
+int32_t
+marker_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ int32_t ret = 0;
+ marker_local_t *local = NULL;
+ marker_conf_t *priv = NULL;
+
+ priv = this->private;
+
+ if (priv->feature_enabled == 0)
+ goto wind;
+
+ local = mem_get0 (this->local_pool);
+
+ MARKER_INIT_LOCAL (frame, local);
+
+ ret = marker_inode_loc_fill (fd->inode, &local->loc);
+
+ if (ret == -1)
+ goto err;
+wind:
+ STACK_WIND (frame, marker_zerofill_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+ return 0;
+err:
+ STACK_UNWIND_STRICT (zerofill, frame, -1, ENOMEM, NULL, NULL, NULL);
+
+ return 0;
+}
+
+
/* when a call from the special client is received on
* key trusted.glusterfs.volume-mark with value "RESET"
* or if the value is 0length, update the change the
@@ -1891,16 +2159,150 @@ out:
return 0;
}
+int
+remove_quota_keys (dict_t *dict, char *k, data_t *v, void *data)
+{
+ call_frame_t *frame = data;
+ marker_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ int ret = -1;
+
+ ret = syncop_removexattr (FIRST_CHILD (this), &local->loc, k, 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s: Failed to remove "
+ "extended attribute: %s", local->loc.path, k);
+ return -1;
+ }
+ return 0;
+}
+
+int
+quota_xattr_cleaner_cbk (int ret, call_frame_t *frame, void *args)
+{
+ dict_t *xdata = args;
+ int op_ret = -1;
+ int op_errno = 0;
+ marker_local_t *local = NULL;
+
+ local = frame->local;
+ frame->local = NULL;
+
+ op_ret = (ret < 0)? -1: 0;
+ op_errno = -ret;
+
+ STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata);
+ marker_local_unref (local);
+ return ret;
+}
+
+int
+quota_xattr_cleaner (void *args)
+{
+ struct synctask *task = NULL;
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ marker_local_t *local = NULL;
+ dict_t *xdata = NULL;
+ int ret = -1;
+
+ task = synctask_get ();
+ if (!task)
+ goto out;
+
+ frame = task->frame;
+ this = frame->this;
+ local = frame->local;
+
+ ret = syncop_listxattr (FIRST_CHILD(this), &local->loc, &xdata);
+ if (ret == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ ret = dict_foreach_fnmatch (xdata, "trusted.glusterfs.quota.*",
+ remove_quota_keys, frame);
+ if (ret == -1) {
+ ret = -errno;
+ goto out;
+ }
+ ret = dict_foreach_fnmatch (xdata, PGFID_XATTR_KEY_PREFIX"*",
+ remove_quota_keys, frame);
+ if (ret == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (xdata)
+ dict_unref (xdata);
+
+ return ret;
+}
+
+int
+marker_do_xattr_cleanup (call_frame_t *frame, xlator_t *this, dict_t *xdata,
+ loc_t *loc)
+{
+ int ret = -1;
+ marker_local_t *local = NULL;
+
+ local = mem_get0 (this->local_pool);
+ if (!local)
+ goto out;
+
+ MARKER_INIT_LOCAL (frame, local);
+
+ loc_copy (&local->loc, loc);
+ ret = synctask_new (this->ctx->env, quota_xattr_cleaner,
+ quota_xattr_cleaner_cbk, frame, xdata);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to create synctask "
+ "for cleaning up quota extended attributes");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (ret) {
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (setxattr, frame, -1, ENOMEM, xdata);
+ marker_local_unref (local);
+ }
+ return ret;
+}
+
+static inline gf_boolean_t
+marker_xattr_cleanup_cmd (dict_t *dict)
+{
+ return (dict_get (dict, VIRTUAL_QUOTA_XATTR_CLEANUP_KEY) != NULL);
+}
+
int32_t
marker_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
int32_t flags, dict_t *xdata)
{
- int32_t ret = 0;
- marker_local_t *local = NULL;
- marker_conf_t *priv = NULL;
+ int32_t ret = 0;
+ marker_local_t *local = NULL;
+ marker_conf_t *priv = NULL;
+ int op_errno = ENOMEM;
priv = this->private;
+ if (marker_xattr_cleanup_cmd (dict)) {
+ if (frame->root->uid != 0 || frame->root->gid != 0) {
+ op_errno = EPERM;
+ ret = -1;
+ goto err;
+ }
+
+ /* The following function does the cleanup and then unwinds the
+ * corresponding call*/
+ loc_path (loc, NULL);
+ marker_do_xattr_cleanup (frame, this, xdata, loc);
+ return 0;
+ }
+
if (priv->feature_enabled == 0)
goto wind;
@@ -1921,7 +2323,7 @@ wind:
FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, xdata);
return 0;
err:
- STACK_UNWIND_STRICT (setxattr, frame, -1, ENOMEM, NULL);
+ STACK_UNWIND_STRICT (setxattr, frame, -1, op_errno, NULL);
return 0;
}
@@ -2268,22 +2670,94 @@ err:
return 0;
}
+
+int
+marker_build_ancestry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ gf_dirent_t *entry = NULL;
+ loc_t loc = {0, };
+ inode_t *parent = NULL;
+
+ if ((op_ret <= 0) || (entries == NULL)) {
+ goto out;
+ }
+
+
+ list_for_each_entry (entry, &entries->list, list) {
+ if (entry->inode == entry->inode->table->root) {
+ loc.path = gf_strdup ("/");
+ inode_unref (parent);
+ parent = NULL;
+ }
+
+ loc.inode = inode_ref (entry->inode);
+
+ if (parent != NULL) {
+ loc.parent = inode_ref (parent);
+ uuid_copy (loc.pargfid, parent->gfid);
+ }
+
+ uuid_copy (loc.gfid, entry->d_stat.ia_gfid);
+
+ mq_xattr_state (this, &loc, entry->dict, entry->d_stat);
+
+ inode_unref (parent);
+ parent = inode_ref (entry->inode);
+ loc_wipe (&loc);
+ }
+
+ if (parent)
+ inode_unref (parent);
+
+out:
+ STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata);
+ return 0;
+}
+
int
marker_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, gf_dirent_t *entries,
dict_t *xdata)
{
- gf_dirent_t *entry = NULL;
+ gf_dirent_t *entry = NULL;
+ marker_conf_t *priv = NULL;
+ marker_local_t *local = NULL;
+ loc_t loc = {0, };
if (op_ret <= 0)
goto unwind;
+ priv = this->private;
+ local = frame->local;
+
+ if (!(priv->feature_enabled & GF_QUOTA) || (local == NULL)) {
+ goto unwind;
+ }
+
list_for_each_entry (entry, &entries->list, list) {
- /* TODO: fill things */
+ if ((strcmp (entry->d_name, ".") == 0) ||
+ (strcmp (entry->d_name, "..") == 0))
+ continue;
+
+ loc.inode = inode_ref (entry->inode);
+ loc.parent = inode_ref (local->loc.inode);
+
+ uuid_copy (loc.gfid, entry->d_stat.ia_gfid);
+ uuid_copy (loc.pargfid, loc.parent->gfid);
+
+ mq_xattr_state (this, &loc, entry->dict, entry->d_stat);
+
+ loc_wipe (&loc);
}
unwind:
+ local = frame->local;
+ frame->local = NULL;
+
STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata);
+ marker_local_unref (local);
return 0;
}
@@ -2292,20 +2766,36 @@ int
marker_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, dict_t *dict)
{
- marker_conf_t *priv = NULL;
+ marker_conf_t *priv = NULL;
+ loc_t loc = {0, };
+ marker_local_t *local = NULL;
priv = this->private;
- if (priv->feature_enabled == 0)
- goto wind;
+ if ((dict != NULL) && dict_get (dict, GET_ANCESTRY_DENTRY_KEY)) {
+ STACK_WIND (frame, marker_build_ancestry_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp,
+ fd, size, offset, dict);
+ } else {
+ if (priv->feature_enabled & GF_QUOTA) {
+ local = mem_get0 (this->local_pool);
- if ((priv->feature_enabled & GF_QUOTA) && dict)
- mq_req_xattr (this, NULL, dict);
+ MARKER_INIT_LOCAL (frame, local);
-wind:
- STACK_WIND (frame, marker_readdirp_cbk,
- FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp,
- fd, size, offset, dict);
+ loc.parent = local->loc.inode = inode_ref (fd->inode);
+
+ if (dict == NULL)
+ dict = dict_new ();
+
+ mq_req_xattr (this, &loc, dict);
+ }
+
+ STACK_WIND (frame, marker_readdirp_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp,
+ fd, size, offset, dict);
+ }
return 0;
}
@@ -2443,7 +2933,7 @@ out:
int32_t
reconfigure (xlator_t *this, dict_t *options)
{
- int32_t ret = -1;
+ int32_t ret = 0;
data_t *data = NULL;
gf_boolean_t flag = _gf_false;
marker_conf_t *priv = NULL;
@@ -2484,11 +2974,17 @@ reconfigure (xlator_t *this, dict_t *options)
"xtime updation will fail");
} else {
priv->feature_enabled |= GF_XTIME;
+ data = dict_get (options, "gsync-force-xtime");
+ if (!data)
+ goto out;
+ ret = gf_string2boolean (data->data, &flag);
+ if (ret == 0 && flag)
+ priv->feature_enabled |= GF_XTIME_GSYNC_FORCE;
}
}
}
out:
- return 0;
+ return ret;
}
@@ -2544,9 +3040,16 @@ init (xlator_t *this)
goto err;
priv->feature_enabled |= GF_XTIME;
+ data = dict_get (options, "gsync-force-xtime");
+ if (!data)
+ goto cont;
+ ret = gf_string2boolean (data->data, &flag);
+ if (ret == 0 && flag)
+ priv->feature_enabled |= GF_XTIME_GSYNC_FORCE;
}
}
+ cont:
this->local_pool = mem_pool_new (marker_local_t, 128);
if (!this->local_pool) {
gf_log (this->name, GF_LOG_ERROR,
@@ -2608,6 +3111,9 @@ struct xlator_fops fops = {
.removexattr = marker_removexattr,
.getxattr = marker_getxattr,
.readdirp = marker_readdirp,
+ .fallocate = marker_fallocate,
+ .discard = marker_discard,
+ .zerofill = marker_zerofill,
};
struct xlator_cbks cbks = {
@@ -2619,5 +3125,6 @@ struct volume_options options[] = {
{.key = {"timestamp-file"}},
{.key = {"quota"}},
{.key = {"xtime"}},
+ {.key = {"gsync-force-xtime"}},
{.key = {NULL}}
};
diff --git a/xlators/features/marker/src/marker.h b/xlators/features/marker/src/marker.h
index 63491ab37..23d1580f0 100644
--- a/xlators/features/marker/src/marker.h
+++ b/xlators/features/marker/src/marker.h
@@ -28,8 +28,9 @@
#define TIMESTAMP_FILE "timestamp-file"
enum {
- GF_QUOTA=1,
- GF_XTIME=2
+ GF_QUOTA = 1,
+ GF_XTIME = 2,
+ GF_XTIME_GSYNC_FORCE = 4,
};
/*initialize the local variable*/
@@ -110,6 +111,8 @@ struct marker_local{
inode_contribution_t *contri;
int xflag;
+ dict_t *xdata;
+ gf_boolean_t skip_txn;
};
typedef struct marker_local marker_local_t;
diff --git a/xlators/features/marker/utils/Makefile.am b/xlators/features/marker/utils/Makefile.am
deleted file mode 100644
index 556951d9f..000000000
--- a/xlators/features/marker/utils/Makefile.am
+++ /dev/null
@@ -1,3 +0,0 @@
-SUBDIRS = syncdaemon src
-
-CLEANFILES =
diff --git a/xlators/features/marker/utils/src/Makefile.am b/xlators/features/marker/utils/src/Makefile.am
deleted file mode 100644
index 9e410cda6..000000000
--- a/xlators/features/marker/utils/src/Makefile.am
+++ /dev/null
@@ -1,26 +0,0 @@
-gsyncddir = $(libexecdir)/glusterfs
-
-gsyncd_PROGRAMS = gsyncd
-
-gsyncd_SOURCES = gsyncd.c procdiggy.c
-
-gsyncd_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
- $(GF_GLUSTERFS_LIBS)
-
-gsyncd_LDFLAGS = $(GF_LDFLAGS)
-
-noinst_HEADERS = procdiggy.h
-
-AM_CPPFLAGS = $(GF_CPPFLAGS) \
- -I$(top_srcdir)/libglusterfs/src\
- -DGSYNCD_PREFIX=\"$(libexecdir)/glusterfs\"\
- -DUSE_LIBGLUSTERFS\
- -DSBIN_DIR=\"$(sbindir)\" -DPYTHON=\"$(PYTHON)\"
-
-AM_CFLAGS = -Wall $(GF_CFLAGS)
-
-
-CLEANFILES =
-
-$(top_builddir)/libglusterfs/src/libglusterfs.la:
- $(MAKE) -C $(top_builddir)/libglusterfs/src/ all
diff --git a/xlators/features/marker/utils/src/gsyncd.c b/xlators/features/marker/utils/src/gsyncd.c
deleted file mode 100644
index a45873a71..000000000
--- a/xlators/features/marker/utils/src/gsyncd.c
+++ /dev/null
@@ -1,363 +0,0 @@
-/*
- Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <string.h>
-#include <sys/param.h> /* for PATH_MAX */
-
-/* NOTE (USE_LIBGLUSTERFS):
- * ------------------------
- * When USE_LIBGLUSTERFS debugging sumbol is passed; perform
- * glusterfs translator like initialization so that glusterfs
- * globals, contexts are valid when glustefs api's are invoked.
- * We unconditionally pass then while building gsyncd binary.
- */
-#ifdef USE_LIBGLUSTERFS
-#include "glusterfs.h"
-#include "globals.h"
-#endif
-
-#include "common-utils.h"
-#include "run.h"
-#include "procdiggy.h"
-
-#define _GLUSTERD_CALLED_ "_GLUSTERD_CALLED_"
-#define _GSYNCD_DISPATCHED_ "_GSYNCD_DISPATCHED_"
-#define GSYNCD_CONF "geo-replication/gsyncd.conf"
-#define GSYNCD_PY "gsyncd.py"
-#define RSYNC "rsync"
-
-int restricted = 0;
-
-static int
-duplexpand (void **buf, size_t tsiz, size_t *len)
-{
- size_t osiz = tsiz * *len;
- char *p = realloc (*buf, osiz << 1);
- if (!p) {
- free(*buf);
- return -1;
- }
-
- memset (p + osiz, 0, osiz);
- *buf = p;
- *len <<= 1;
-
- return 0;
-}
-
-static int
-str2argv (char *str, char ***argv)
-{
- char *p = NULL;
- char *savetok = NULL;
- int argc = 0;
- size_t argv_len = 32;
- int ret = 0;
-
- assert (str);
- str = strdup (str);
- if (!str)
- return -1;
-
- *argv = calloc (argv_len, sizeof (**argv));
- if (!*argv)
- goto error;
-
- while ((p = strtok_r (str, " ", &savetok))) {
- str = NULL;
-
- argc++;
- if (argc == argv_len) {
- ret = duplexpand ((void *)argv,
- sizeof (**argv),
- &argv_len);
- if (ret == -1)
- goto error;
- }
- (*argv)[argc - 1] = p;
- }
-
- return argc;
-
- error:
- fprintf (stderr, "out of memory\n");
- return -1;
-}
-
-static int
-invoke_gsyncd (int argc, char **argv)
-{
- char config_file[PATH_MAX] = {0,};
- size_t gluster_workdir_len = 0;
- runner_t runner = {0,};
- int i = 0;
- int j = 0;
- char *nargv[argc + 4];
-
- if (restricted) {
- size_t len;
- /* in restricted mode we forcibly use the system-wide config */
- runinit (&runner);
- runner_add_args (&runner, SBIN_DIR"/gluster",
- "--log-file=-", "system::", "getwd",
- NULL);
- runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
- if (runner_start (&runner) == 0 &&
- fgets (config_file, PATH_MAX,
- runner_chio (&runner, STDOUT_FILENO)) != NULL &&
- (len = strlen (config_file)) &&
- config_file[len - 1] == '\n' &&
- runner_end (&runner) == 0)
- gluster_workdir_len = len - 1;
-
- if (gluster_workdir_len) {
- if (gluster_workdir_len + 1 + strlen (GSYNCD_CONF) + 1 >
- PATH_MAX)
- goto error;
- config_file[gluster_workdir_len] = '/';
- strcat (config_file, GSYNCD_CONF);
- } else
- goto error;
-
- if (setenv ("_GSYNCD_RESTRICTED_", "1", 1) == -1)
- goto error;
- }
-
- if (chdir ("/") == -1)
- goto error;
-
- j = 0;
- nargv[j++] = PYTHON;
- nargv[j++] = GSYNCD_PREFIX"/python/syncdaemon/"GSYNCD_PY;
- for (i = 1; i < argc; i++)
- nargv[j++] = argv[i];
- if (config_file[0]) {
- nargv[j++] = "-c";
- nargv[j++] = config_file;
- }
- nargv[j++] = NULL;
-
- execvp (PYTHON, nargv);
-
- fprintf (stderr, "exec of "PYTHON" failed\n");
- return 127;
-
- error:
- fprintf (stderr, "gsyncd initializaion failed\n");
- return 1;
-}
-
-
-static int
-find_gsyncd (pid_t pid, pid_t ppid, char *name, void *data)
-{
- char buf[NAME_MAX * 2] = {0,};
- char path[PATH_MAX] = {0,};
- char *p = NULL;
- int zeros = 0;
- int ret = 0;
- int fd = -1;
- pid_t *pida = (pid_t *)data;
-
- if (ppid != pida[0])
- return 0;
-
- sprintf (path, PROC"/%d/cmdline", pid);
- fd = open (path, O_RDONLY);
- if (fd == -1)
- return 0;
- ret = read (fd, buf, sizeof (buf));
- close (fd);
- if (ret == -1)
- return 0;
- for (zeros = 0, p = buf; zeros < 2 && p < buf + ret; p++)
- zeros += !*p;
-
- ret = 0;
- switch (zeros) {
- case 2:
- if ((strcmp (basename (buf), basename (PYTHON)) ||
- strcmp (basename (buf + strlen (buf) + 1), GSYNCD_PY)) == 0) {
- ret = 1;
- break;
- }
- /* fallthrough */
- case 1:
- if (strcmp (basename (buf), GSYNCD_PY) == 0)
- ret = 1;
- }
-
- if (ret == 1) {
- if (pida[1] != -1) {
- fprintf (stderr, GSYNCD_PY" sibling is not unique");
- return -1;
- }
- pida[1] = pid;
- }
-
- return 0;
-}
-
-static int
-invoke_rsync (int argc, char **argv)
-{
- int i = 0;
- char path[PATH_MAX] = {0,};
- pid_t pid = -1;
- pid_t ppid = -1;
- pid_t pida[] = {-1, -1};
- char *name = NULL;
- char buf[PATH_MAX + 1] = {0,};
- int ret = 0;
-
- assert (argv[argc] == NULL);
-
- if (argc < 2 || strcmp (argv[1], "--server") != 0)
- goto error;
-
- for (i = 2; i < argc && argv[i][0] == '-'; i++);
-
- if (!(i == argc - 2 && strcmp (argv[i], ".") == 0 && argv[i + 1][0] == '/')) {
- fprintf (stderr, "need an rsync invocation without protected args\n");
- goto error;
- }
-
- /* look up sshd we are spawned from */
- for (pid = getpid () ;; pid = ppid) {
- ppid = pidinfo (pid, &name);
- if (ppid < 0) {
- fprintf (stderr, "sshd ancestor not found\n");
- goto error;
- }
- if (strcmp (name, "sshd") == 0) {
- GF_FREE (name);
- break;
- }
- GF_FREE (name);
- }
- /* look up "ssh-sibling" gsyncd */
- pida[0] = pid;
- ret = prociter (find_gsyncd, pida);
- if (ret == -1 || pida[1] == -1) {
- fprintf (stderr, "gsyncd sibling not found\n");
- goto error;
- }
- /* check if rsync target matches gsyncd target */
- sprintf (path, PROC"/%d/cwd", pida[1]);
- ret = readlink (path, buf, sizeof (buf));
- if (ret == -1 || ret == sizeof (buf))
- goto error;
- if (strcmp (argv[argc - 1], "/") == 0 /* root dir cannot be a target */ ||
- (strcmp (argv[argc - 1], path) /* match against gluster target */ &&
- strcmp (argv[argc - 1], buf) /* match against file target */) != 0) {
- fprintf (stderr, "rsync target does not match "GEOREP" session\n");
- goto error;
- }
-
- argv[0] = RSYNC;
-
- execvp (RSYNC, argv);
-
- fprintf (stderr, "exec of "RSYNC" failed\n");
- return 127;
-
- error:
- fprintf (stderr, "disallowed "RSYNC" invocation\n");
- return 1;
-}
-
-
-struct invocable {
- char *name;
- int (*invoker) (int argc, char **argv);
-};
-
-struct invocable invocables[] = {
- { "rsync", invoke_rsync },
- { "gsyncd", invoke_gsyncd },
- { NULL, NULL}
-};
-
-int
-main (int argc, char **argv)
-{
- char *evas = NULL;
- struct invocable *i = NULL;
- char *b = NULL;
- char *sargv = NULL;
-
-#ifdef USE_LIBGLUSTERFS
- glusterfs_ctx_t *ctx = NULL;
-
- ctx = glusterfs_ctx_new ();
- if (!ctx)
- return ENOMEM;
-
- if (glusterfs_globals_init (ctx))
- return 1;
-
- THIS->ctx = ctx;
-#endif
-
- evas = getenv (_GLUSTERD_CALLED_);
- if (evas && strcmp (evas, "1") == 0)
- /* OK, we know glusterd called us, no need to look for further config
- * ... altough this conclusion should not inherit to our children
- */
- unsetenv (_GLUSTERD_CALLED_);
- else {
- /* we regard all gsyncd invocations unsafe
- * that do not come from glusterd and
- * therefore restrict it
- */
- restricted = 1;
-
- if (!getenv (_GSYNCD_DISPATCHED_)) {
- evas = getenv ("SSH_ORIGINAL_COMMAND");
- if (evas)
- sargv = evas;
- else {
- evas = getenv ("SHELL");
- if (evas && strcmp (basename (evas), "gsyncd") == 0 &&
- argc == 3 && strcmp (argv[1], "-c") == 0)
- sargv = argv[2];
- }
- }
-
- }
-
- if (!(sargv && restricted))
- return invoke_gsyncd (argc, argv);
-
- argc = str2argv (sargv, &argv);
- if (argc == -1 || setenv (_GSYNCD_DISPATCHED_, "1", 1) == -1) {
- fprintf (stderr, "internal error\n");
- return 1;
- }
-
- b = basename (argv[0]);
- for (i = invocables; i->name; i++) {
- if (strcmp (b, i->name) == 0)
- return i->invoker (argc, argv);
- }
-
- fprintf (stderr, "invoking %s in restricted SSH session is not allowed\n",
- b);
-
- return 1;
-}
diff --git a/xlators/features/marker/utils/src/procdiggy.c b/xlators/features/marker/utils/src/procdiggy.c
deleted file mode 100644
index 1eba414c1..000000000
--- a/xlators/features/marker/utils/src/procdiggy.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- Copyright (c) 2011-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef _CONFIG_H
-#define _CONFIG_H
-#include "config.h"
-#endif
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <string.h>
-#include <ctype.h>
-#include <sys/param.h> /* for PATH_MAX */
-
-#include "common-utils.h"
-#include "procdiggy.h"
-
-pid_t
-pidinfo (pid_t pid, char **name)
-{
- char buf[NAME_MAX * 2] = {0,};
- FILE *f = NULL;
- char path[PATH_MAX] = {0,};
- char *p = NULL;
- int ret = 0;
-
- sprintf (path, PROC"/%d/status", pid);
-
- f = fopen (path, "r");
- if (!f)
- return -1;
-
- if (name)
- *name = NULL;
- for (;;) {
- size_t len;
- memset (buf, 0, sizeof (buf));
- if (fgets (buf, sizeof (buf), f) == NULL ||
- (len = strlen (buf)) == 0 ||
- buf[len - 1] != '\n') {
- pid = -1;
- goto out;
- }
- buf[len - 1] = '\0';
-
- if (name && !*name) {
- p = strtail (buf, "Name:");
- if (p) {
- while (isspace (*++p));
- *name = gf_strdup (p);
- if (!*name) {
- pid = -2;
- goto out;
- }
- continue;
- }
- }
-
- p = strtail (buf, "PPid:");
- if (p)
- break;
- }
-
- while (isspace (*++p));
- ret = gf_string2int (p, &pid);
- if (ret == -1)
- pid = -1;
-
- out:
- fclose (f);
- if (pid == -1 && name && *name)
- GF_FREE (name);
- if (pid == -2)
- fprintf (stderr, "out of memory\n");
- return pid;
-}
-
-int
-prociter (int (*proch) (pid_t pid, pid_t ppid, char *tmpname, void *data),
- void *data)
-{
- char *name = NULL;
- DIR *d = NULL;
- struct dirent *de = NULL;
- pid_t pid = -1;
- pid_t ppid = -1;
- int ret = 0;
-
- d = opendir (PROC);
- if (!d)
- return -1;
- while (errno = 0, de = readdir (d)) {
- if (gf_string2int (de->d_name, &pid) != -1 && pid >= 0) {
- ppid = pidinfo (pid, &name);
- switch (ppid) {
- case -1: continue;
- case -2: ret = -1; break;
- }
- ret = proch (pid, ppid, name, data);
- GF_FREE (name);
- if (ret)
- break;
- }
- }
- closedir (d);
- if (!de && errno) {
- fprintf (stderr, "failed to traverse "PROC" (%s)\n",
- strerror (errno));
- ret = -1;
- }
-
- return ret;
-}
diff --git a/xlators/features/marker/utils/syncdaemon/Makefile.am b/xlators/features/marker/utils/syncdaemon/Makefile.am
deleted file mode 100644
index cc7cee102..000000000
--- a/xlators/features/marker/utils/syncdaemon/Makefile.am
+++ /dev/null
@@ -1,6 +0,0 @@
-syncdaemondir = $(libexecdir)/glusterfs/python/syncdaemon
-
-syncdaemon_PYTHON = gconf.py gsyncd.py __init__.py master.py README.md repce.py resource.py configinterface.py syncdutils.py monitor.py libcxattr.py \
- $(top_builddir)/contrib/ipaddr-py/ipaddr.py
-
-CLEANFILES =
diff --git a/xlators/features/marker/utils/syncdaemon/README.md b/xlators/features/marker/utils/syncdaemon/README.md
deleted file mode 100644
index d45006932..000000000
--- a/xlators/features/marker/utils/syncdaemon/README.md
+++ /dev/null
@@ -1,81 +0,0 @@
-gsycnd, the Gluster Syncdaemon
-==============================
-
-REQUIREMENTS
-------------
-
-_gsyncd_ is a program which can operate either in _master_ or in _slave_ mode.
-Requirements are categorized according to this.
-
-* supported OS is GNU/Linux
-* Python >= 2.5, or 2.4 with Ctypes (see below) (both)
-* OpenSSH >= 4.0 (master) / SSH2 compliant sshd (eg. openssh) (slave)
-* rsync (both)
-* glusterfs with marker support (master); glusterfs (optional on slave)
-* FUSE; for supported versions consult glusterfs
-
-INSTALLATION
-------------
-
-As of now, the supported way of operation is running from the source directory.
-
-If you use Python 2.4.x, you need to install the [Ctypes module](http://python.net/crew/theller/ctypes/).
-
-CONFIGURATION
--------------
-
-gsyncd tunables are a subset of the long command-line options; for listing them,
-type
-
- gsyncd.py --help
-
-and see the long options up to "--config-file". (The leading double dash should be omitted;
-interim underscores and dashes are interchangeable.) The set of options bear some resemblance
-to those of glusterfs and rsync.
-
-The config file format matches the following syntax:
-
- <option1>: <value1>
- <option2>: <value2>
- # comment
-
-By default (unless specified by the option `-c`), gsyncd looks for config file at _conf/gsyncd.conf_
-in the source tree.
-
-USAGE
------
-
-gsyncd is a utilitly for continous mirroring, ie. it mirrors master to slave incrementally.
-Assume we have a gluster volume _pop_ at localhost. We try to set up the following mirrors
-for it with gysncd:
-
-1. _/data/mirror_
-2. local gluster volume _yow_
-3. _/data/far_mirror_ at example.com
-4. gluster volume _moz_ at example.com
-
-The respective gsyncd invocations are (demoing some syntax sugaring):
-
-1.
-
- gsyncd.py gluster://localhost:pop file:///data/mirror
-
- or short form
-
- gsyncd.py :pop /data/mirror
-
-2. `gsyncd :pop :yow`
-3.
-
- gsyncd.py :pop ssh://example.com:/data/far_mirror
-
- or short form
-
- gsyncd.py :pop example.com:/data/far_mirror
-
-4. `gsyncd.py :pop example.com::moz`
-
-gsyncd has to be available on both sides; it's location on the remote side has to be specified
-via the "--remote-gsyncd" option (or "remote-gsyncd" config file parameter). (This option can also be
-used for setting options on the remote side, although the suggested mode of operation is to
-set parameters like log file / pid file in the configuration file.)
diff --git a/xlators/features/marker/utils/syncdaemon/__codecheck.py b/xlators/features/marker/utils/syncdaemon/__codecheck.py
deleted file mode 100644
index e3386afba..000000000
--- a/xlators/features/marker/utils/syncdaemon/__codecheck.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import os
-import os.path
-import sys
-import tempfile
-import shutil
-
-ipd = tempfile.mkdtemp(prefix = 'codecheck-aux')
-
-try:
- # add a fake ipaddr module, we don't want to
- # deal with the real one (just test our code)
- f = open(os.path.join(ipd, 'ipaddr.py'), 'w')
- f.write("""
-class IPAddress(object):
- pass
-class IPNetwork(list):
- pass
-""")
- f.close()
- sys.path.append(ipd)
-
- fl = os.listdir(os.path.dirname(sys.argv[0]) or '.')
- fl.sort()
- for f in fl:
- if f[-3:] != '.py' or f[0] == '_':
- continue
- m = f[:-3]
- sys.stdout.write('importing %s ...' % m)
- __import__(m)
- print(' OK.')
-
- def sys_argv_set(a):
- sys.argv = sys.argv[:1] + a
-
- gsyncd = sys.modules['gsyncd']
- for a in [['--help'], ['--version'], ['--canonicalize-escape-url', '/foo']]:
- print('>>> invoking program with args: %s' % ' '.join(a))
- pid = os.fork()
- if not pid:
- sys_argv_set(a)
- gsyncd.main()
- _, r = os.waitpid(pid, 0)
- if r:
- raise RuntimeError('invocation failed')
-finally:
- shutil.rmtree(ipd)
diff --git a/xlators/features/marker/utils/syncdaemon/__init__.py b/xlators/features/marker/utils/syncdaemon/__init__.py
deleted file mode 100644
index e69de29bb..000000000
--- a/xlators/features/marker/utils/syncdaemon/__init__.py
+++ /dev/null
diff --git a/xlators/features/marker/utils/syncdaemon/configinterface.py b/xlators/features/marker/utils/syncdaemon/configinterface.py
deleted file mode 100644
index e55bec519..000000000
--- a/xlators/features/marker/utils/syncdaemon/configinterface.py
+++ /dev/null
@@ -1,224 +0,0 @@
-try:
- import ConfigParser
-except ImportError:
- # py 3
- import configparser as ConfigParser
-import re
-from string import Template
-
-from syncdutils import escape, unescape, norm, update_file, GsyncdError
-
-SECT_ORD = '__section_order__'
-SECT_META = '__meta__'
-config_version = 2.0
-
-re_type = type(re.compile(''))
-
-
-class MultiDict(object):
- """a virtual dict-like class which functions as the union of underlying dicts"""
-
- def __init__(self, *dd):
- self.dicts = dd
-
- def __getitem__(self, key):
- val = None
- for d in self.dicts:
- if d.get(key):
- val = d[key]
- if not val:
- raise KeyError(key)
- return val
-
-
-class GConffile(object):
- """A high-level interface to ConfigParser which flattens the two-tiered
- config layout by implenting automatic section dispatch based on initial
- parameters.
-
- Also ensure section ordering in terms of their time of addition -- a compat
- hack for Python < 2.7.
- """
-
- def _normconfig(self):
- """normalize config keys by s/-/_/g"""
- for n, s in self.config._sections.items():
- if n.find('__') == 0:
- continue
- s2 = type(s)()
- for k, v in s.items():
- if k.find('__') != 0:
- k = norm(k)
- s2[k] = v
- self.config._sections[n] = s2
-
- def __init__(self, path, peers, *dd):
- """
- - .path: location of config file
- - .config: underlying ConfigParser instance
- - .peers: on behalf of whom we flatten .config
- (master, or master-slave url pair)
- - .auxdicts: template subtituents
- """
- self.peers = peers
- self.path = path
- self.auxdicts = dd
- self.config = ConfigParser.RawConfigParser()
- self.config.read(path)
- self._normconfig()
-
- def section(self, rx=False):
- """get the section name of the section representing .peers in .config"""
- peers = self.peers
- if not peers:
- peers = ['.', '.']
- rx = True
- if rx:
- st = 'peersrx'
- else:
- st = 'peers'
- return ' '.join([st] + [escape(u) for u in peers])
-
- @staticmethod
- def parse_section(section):
- """retrieve peers sequence encoded by section name
- (as urls or regexen, depending on section type)
- """
- sl = section.split()
- st = sl.pop(0)
- sl = [unescape(u) for u in sl]
- if st == 'peersrx':
- sl = [re.compile(u) for u in sl]
- return sl
-
- def ord_sections(self):
- """Return an ordered list of sections.
-
- Ordering happens based on the auxiliary
- SECT_ORD section storing indices for each
- section added through the config API.
-
- To not to go corrupt in case of manually
- written config files, we take care to append
- also those sections which are not registered
- in SECT_ORD.
-
- Needed for python 2.{4,5,6} where ConfigParser
- cannot yet order sections/options internally.
- """
- so = {}
- if self.config.has_section(SECT_ORD):
- so = self.config._sections[SECT_ORD]
- so2 = {}
- for k, v in so.items():
- if k != '__name__':
- so2[k] = int(v)
- tv = 0
- if so2:
- tv = max(so2.values()) + 1
- ss = [s for s in self.config.sections() if s.find('__') != 0]
- for s in ss:
- if s in so.keys():
- continue
- so2[s] = tv
- tv += 1
- def scmp(x, y):
- return cmp(*(so2[s] for s in (x, y)))
- ss.sort(scmp)
- return ss
-
- def update_to(self, dct, allow_unresolved=False):
- """update @dct from key/values of ours.
-
- key/values are collected from .config by filtering the regexp sections
- according to match, and from .section. The values are treated as templates,
- which are substituted from .auxdicts and (in case of regexp sections)
- match groups.
- """
- if not self.peers:
- raise GsyncdError('no peers given, cannot select matching options')
- def update_from_sect(sect, mud):
- for k, v in self.config._sections[sect].items():
- if k == '__name__':
- continue
- if allow_unresolved:
- dct[k] = Template(v).safe_substitute(mud)
- else:
- dct[k] = Template(v).substitute(mud)
- for sect in self.ord_sections():
- sp = self.parse_section(sect)
- if isinstance(sp[0], re_type) and len(sp) == len(self.peers):
- match = True
- mad = {}
- for i in range(len(sp)):
- m = sp[i].search(self.peers[i])
- if not m:
- match = False
- break
- for j in range(len(m.groups())):
- mad['match%d_%d' % (i+1, j+1)] = m.groups()[j]
- if match:
- update_from_sect(sect, MultiDict(dct, mad, *self.auxdicts))
- if self.config.has_section(self.section()):
- update_from_sect(self.section(), MultiDict(dct, *self.auxdicts))
-
- def get(self, opt=None):
- """print the matching key/value pairs from .config,
- or if @opt given, the value for @opt (according to the
- logic described in .update_to)
- """
- d = {}
- self.update_to(d, allow_unresolved = True)
- if opt:
- opt = norm(opt)
- v = d.get(opt)
- if v:
- print(v)
- else:
- for k, v in d.iteritems():
- if k == '__name__':
- continue
- print("%s: %s" % (k, v))
-
- def write(self, trfn, opt, *a, **kw):
- """update on-disk config transactionally
-
- @trfn is the transaction function
- """
- def mergeconf(f):
- self.config = ConfigParser.RawConfigParser()
- self.config.readfp(f)
- self._normconfig()
- if not self.config.has_section(SECT_META):
- self.config.add_section(SECT_META)
- self.config.set(SECT_META, 'version', config_version)
- return trfn(norm(opt), *a, **kw)
- def updateconf(f):
- self.config.write(f)
- update_file(self.path, updateconf, mergeconf)
-
- def _set(self, opt, val, rx=False):
- """set @opt to @val in .section"""
- sect = self.section(rx)
- if not self.config.has_section(sect):
- self.config.add_section(sect)
- # regarding SECT_ORD, cf. ord_sections
- if not self.config.has_section(SECT_ORD):
- self.config.add_section(SECT_ORD)
- self.config.set(SECT_ORD, sect, len(self.config._sections[SECT_ORD]))
- self.config.set(sect, opt, val)
- return True
-
- def set(self, opt, *a, **kw):
- """perform ._set transactionally"""
- self.write(self._set, opt, *a, **kw)
-
- def _delete(self, opt, rx=False):
- """delete @opt from .section"""
- sect = self.section(rx)
- if self.config.has_section(sect):
- return self.config.remove_option(sect, opt)
-
- def delete(self, opt, *a, **kw):
- """perform ._delete transactionally"""
- self.write(self._delete, opt, *a, **kw)
diff --git a/xlators/features/marker/utils/syncdaemon/gconf.py b/xlators/features/marker/utils/syncdaemon/gconf.py
deleted file mode 100644
index 146c72a18..000000000
--- a/xlators/features/marker/utils/syncdaemon/gconf.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import os
-
-class GConf(object):
- """singleton class to store globals
- shared between gsyncd modules"""
-
- ssh_ctl_dir = None
- ssh_ctl_args = None
- cpid = None
- pid_file_owned = False
- log_exit = False
- permanent_handles = []
- log_metadata = {}
-
- @classmethod
- def setup_ssh_ctl(cls, ctld):
- cls.ssh_ctl_dir = ctld
- cls.ssh_ctl_args = ["-oControlMaster=auto", "-S", os.path.join(ctld, "gsycnd-ssh-%r@%h:%p")]
-
-gconf = GConf()
diff --git a/xlators/features/marker/utils/syncdaemon/gsyncd.py b/xlators/features/marker/utils/syncdaemon/gsyncd.py
deleted file mode 100644
index 9e9469469..000000000
--- a/xlators/features/marker/utils/syncdaemon/gsyncd.py
+++ /dev/null
@@ -1,403 +0,0 @@
-#!/usr/bin/env python
-
-import os
-import os.path
-import sys
-import time
-import logging
-import signal
-import optparse
-import fcntl
-import fnmatch
-from optparse import OptionParser, SUPPRESS_HELP
-from logging import Logger
-from errno import EEXIST, ENOENT
-
-from ipaddr import IPAddress, IPNetwork
-
-from gconf import gconf
-from syncdutils import FreeObject, norm, grabpidfile, finalize, log_raise_exception
-from syncdutils import GsyncdError, select, set_term_handler
-from configinterface import GConffile
-import resource
-from monitor import monitor
-
-class GLogger(Logger):
- """Logger customizations for gsyncd.
-
- It implements a log format similar to that of glusterfs.
- """
-
- def makeRecord(self, name, level, *a):
- rv = Logger.makeRecord(self, name, level, *a)
- rv.nsecs = (rv.created - int(rv.created)) * 1000000
- fr = sys._getframe(4)
- callee = fr.f_locals.get('self')
- if callee:
- ctx = str(type(callee)).split("'")[1].split('.')[-1]
- else:
- ctx = '<top>'
- if not hasattr(rv, 'funcName'):
- rv.funcName = fr.f_code.co_name
- rv.lvlnam = logging.getLevelName(level)[0]
- rv.ctx = ctx
- return rv
-
- @classmethod
- def setup(cls, **kw):
- lbl = kw.get('label', "")
- if lbl:
- lbl = '(' + lbl + ')'
- lprm = {'datefmt': "%Y-%m-%d %H:%M:%S",
- 'format': "[%(asctime)s.%(nsecs)d] %(lvlnam)s [%(module)s" + lbl + ":%(lineno)s:%(funcName)s] %(ctx)s: %(message)s"}
- lprm.update(kw)
- lvl = kw.get('level', logging.INFO)
- lprm['level'] = lvl
- logging.root = cls("root", lvl)
- logging.setLoggerClass(cls)
- logging.getLogger().handlers = []
- logging.basicConfig(**lprm)
-
- @classmethod
- def _gsyncd_loginit(cls, **kw):
- lkw = {}
- if gconf.log_level:
- lkw['level'] = gconf.log_level
- if kw.get('log_file'):
- if kw['log_file'] in ('-', '/dev/stderr'):
- lkw['stream'] = sys.stderr
- elif kw['log_file'] == '/dev/stdout':
- lkw['stream'] = sys.stdout
- else:
- lkw['filename'] = kw['log_file']
-
- cls.setup(label=kw.get('label'), **lkw)
-
- lkw.update({'saved_label': kw.get('label')})
- gconf.log_metadata = lkw
- gconf.log_exit = True
-
-def startup(**kw):
- """set up logging, pidfile grabbing, daemonization"""
- if getattr(gconf, 'pid_file', None) and kw.get('go_daemon') != 'postconn':
- if not grabpidfile():
- sys.stderr.write("pidfile is taken, exiting.\n")
- sys.exit(2)
- gconf.pid_file_owned = True
-
- if kw.get('go_daemon') == 'should':
- x, y = os.pipe()
- gconf.cpid = os.fork()
- if gconf.cpid:
- os.close(x)
- sys.exit()
- os.close(y)
- os.setsid()
- dn = os.open(os.devnull, os.O_RDWR)
- for f in (sys.stdin, sys.stdout, sys.stderr):
- os.dup2(dn, f.fileno())
- if getattr(gconf, 'pid_file', None):
- if not grabpidfile(gconf.pid_file + '.tmp'):
- raise GsyncdError("cannot grab temporary pidfile")
- os.rename(gconf.pid_file + '.tmp', gconf.pid_file)
- # wait for parent to terminate
- # so we can start up with
- # no messing from the dirty
- # ol' bustard
- select((x,), (), ())
- os.close(x)
-
- GLogger._gsyncd_loginit(**kw)
-
-def main():
- """main routine, signal/exception handling boilerplates"""
- gconf.starttime = time.time()
- set_term_handler()
- GLogger.setup()
- excont = FreeObject(exval = 0)
- try:
- try:
- main_i()
- except:
- log_raise_exception(excont)
- finally:
- finalize(exval = excont.exval)
-
-def main_i():
- """internal main routine
-
- parse command line, decide what action will be taken;
- we can either:
- - query/manipulate configuration
- - format gsyncd urls using gsyncd's url parsing engine
- - start service in following modes, in given stages:
- - monitor: startup(), monitor()
- - master: startup(), connect_remote(), connect(), service_loop()
- - slave: startup(), connect(), service_loop()
- """
- rconf = {'go_daemon': 'should'}
-
- def store_abs(opt, optstr, val, parser):
- if val and val != '-':
- val = os.path.abspath(val)
- setattr(parser.values, opt.dest, val)
- def store_local(opt, optstr, val, parser):
- rconf[opt.dest] = val
- def store_local_curry(val):
- return lambda o, oo, vx, p: store_local(o, oo, val, p)
- def store_local_obj(op, dmake):
- return lambda o, oo, vx, p: store_local(o, oo, FreeObject(op=op, **dmake(vx)), p)
-
- op = OptionParser(usage="%prog [options...] <master> <slave>", version="%prog 0.0.1")
- op.add_option('--gluster-command-dir', metavar='DIR', default='')
- op.add_option('--gluster-log-file', metavar='LOGF', default=os.devnull, type=str, action='callback', callback=store_abs)
- op.add_option('--gluster-log-level', metavar='LVL')
- op.add_option('--gluster-params', metavar='PRMS', default='')
- op.add_option('--gluster-cli-options', metavar='OPTS', default='--log-file=-')
- op.add_option('--mountbroker', metavar='LABEL')
- op.add_option('-p', '--pid-file', metavar='PIDF', type=str, action='callback', callback=store_abs)
- op.add_option('-l', '--log-file', metavar='LOGF', type=str, action='callback', callback=store_abs)
- op.add_option('--state-file', metavar='STATF', type=str, action='callback', callback=store_abs)
- op.add_option('--ignore-deletes', default=False, action='store_true')
- op.add_option('--use-rsync-xattrs', default=False, action='store_true')
- op.add_option('-L', '--log-level', metavar='LVL')
- op.add_option('-r', '--remote-gsyncd', metavar='CMD', default=os.path.abspath(sys.argv[0]))
- op.add_option('--volume-id', metavar='UUID')
- op.add_option('--session-owner', metavar='ID')
- op.add_option('-s', '--ssh-command', metavar='CMD', default='ssh')
- op.add_option('--rsync-command', metavar='CMD', default='rsync')
- op.add_option('--rsync-options', metavar='OPTS', default='--sparse')
- op.add_option('--rsync-ssh-options', metavar='OPTS', default='--compress')
- op.add_option('--timeout', metavar='SEC', type=int, default=120)
- op.add_option('--connection-timeout', metavar='SEC', type=int, default=60, help=SUPPRESS_HELP)
- op.add_option('--sync-jobs', metavar='N', type=int, default=3)
- op.add_option('--turns', metavar='N', type=int, default=0, help=SUPPRESS_HELP)
- op.add_option('--allow-network', metavar='IPS', default='')
- op.add_option('--state-socket-unencoded', metavar='SOCKF', type=str, action='callback', callback=store_abs)
- op.add_option('--checkpoint', metavar='LABEL', default='')
- # tunables for failover/failback mechanism:
- # None - gsyncd behaves as normal
- # blind - gsyncd works with xtime pairs to identify
- # candidates for synchronization
- # wrapup - same as normal mode but does not assign
- # xtimes to orphaned files
- # see crawl() for usage of the above tunables
- op.add_option('--special-sync-mode', type=str, help=SUPPRESS_HELP)
-
- op.add_option('-c', '--config-file', metavar='CONF', type=str, action='callback', callback=store_local)
- # duh. need to specify dest or value will be mapped to None :S
- op.add_option('--monitor', dest='monitor', action='callback', callback=store_local_curry(True))
- op.add_option('--feedback-fd', dest='feedback_fd', type=int, help=SUPPRESS_HELP, action='callback', callback=store_local)
- op.add_option('--listen', dest='listen', help=SUPPRESS_HELP, action='callback', callback=store_local_curry(True))
- op.add_option('-N', '--no-daemon', dest="go_daemon", action='callback', callback=store_local_curry('dont'))
- op.add_option('--debug', dest="go_daemon", action='callback', callback=lambda *a: (store_local_curry('dont')(*a),
- setattr(a[-1].values, 'log_file', '-'),
- setattr(a[-1].values, 'log_level', 'DEBUG'))),
-
- for a in ('check', 'get'):
- op.add_option('--config-' + a, metavar='OPT', type=str, dest='config', action='callback',
- callback=store_local_obj(a, lambda vx: {'opt': vx}))
- op.add_option('--config-get-all', dest='config', action='callback', callback=store_local_obj('get', lambda vx: {'opt': None}))
- for m in ('', '-rx', '-glob'):
- # call this code 'Pythonic' eh?
- # have to define a one-shot local function to be able to inject (a value depending on the)
- # iteration variable into the inner lambda
- def conf_mod_opt_regex_variant(rx):
- op.add_option('--config-set' + m, metavar='OPT VAL', type=str, nargs=2, dest='config', action='callback',
- callback=store_local_obj('set', lambda vx: {'opt': vx[0], 'val': vx[1], 'rx': rx}))
- op.add_option('--config-del' + m, metavar='OPT', type=str, dest='config', action='callback',
- callback=store_local_obj('del', lambda vx: {'opt': vx, 'rx': rx}))
- conf_mod_opt_regex_variant(m and m[1:] or False)
-
- op.add_option('--normalize-url', dest='url_print', action='callback', callback=store_local_curry('normal'))
- op.add_option('--canonicalize-url', dest='url_print', action='callback', callback=store_local_curry('canon'))
- op.add_option('--canonicalize-escape-url', dest='url_print', action='callback', callback=store_local_curry('canon_esc'))
-
- tunables = [ norm(o.get_opt_string()[2:]) for o in op.option_list if o.callback in (store_abs, 'store_true', None) and o.get_opt_string() not in ('--version', '--help') ]
- remote_tunables = [ 'listen', 'go_daemon', 'timeout', 'session_owner', 'config_file', 'use_rsync_xattrs' ]
- rq_remote_tunables = { 'listen': True }
-
- # precedence for sources of values: 1) commandline, 2) cfg file, 3) defaults
- # -- for this to work out we need to tell apart defaults from explicitly set
- # options... so churn out the defaults here and call the parser with virgin
- # values container.
- defaults = op.get_default_values()
- opts, args = op.parse_args(values=optparse.Values())
- confdata = rconf.get('config')
- if not (len(args) == 2 or \
- (len(args) == 1 and rconf.get('listen')) or \
- (len(args) <= 2 and confdata) or \
- rconf.get('url_print')):
- sys.stderr.write("error: incorrect number of arguments\n\n")
- sys.stderr.write(op.get_usage() + "\n")
- sys.exit(1)
-
- restricted = os.getenv('_GSYNCD_RESTRICTED_')
-
- if restricted:
- allopts = {}
- allopts.update(opts.__dict__)
- allopts.update(rconf)
- bannedtuns = set(allopts.keys()) - set(remote_tunables)
- if bannedtuns:
- raise GsyncdError('following tunables cannot be set with restricted SSH invocaton: ' + \
- ', '.join(bannedtuns))
- for k, v in rq_remote_tunables.items():
- if not k in allopts or allopts[k] != v:
- raise GsyncdError('tunable %s is not set to value %s required for restricted SSH invocaton' % \
- (k, v))
-
- confrx = getattr(confdata, 'rx', None)
- if confrx:
- # peers are regexen, don't try to parse them
- if confrx == 'glob':
- args = [ '\A' + fnmatch.translate(a) for a in args ]
- canon_peers = args
- namedict = {}
- else:
- rscs = [resource.parse_url(u) for u in args]
- dc = rconf.get('url_print')
- if dc:
- for r in rscs:
- print(r.get_url(**{'normal': {},
- 'canon': {'canonical': True},
- 'canon_esc': {'canonical': True, 'escaped': True}}[dc]))
- return
- local = remote = None
- if rscs:
- local = rscs[0]
- if len(rscs) > 1:
- remote = rscs[1]
- if not local.can_connect_to(remote):
- raise GsyncdError("%s cannot work with %s" % (local.path, remote and remote.path))
- pa = ([], [], [])
- urlprms = ({}, {'canonical': True}, {'canonical': True, 'escaped': True})
- for x in rscs:
- for i in range(len(pa)):
- pa[i].append(x.get_url(**urlprms[i]))
- peers, canon_peers, canon_esc_peers = pa
- # creating the namedict, a dict representing various ways of referring to / repreenting
- # peers to be fillable in config templates
- mods = (lambda x: x, lambda x: x[0].upper() + x[1:], lambda x: 'e' + x[0].upper() + x[1:])
- if remote:
- rmap = { local: ('local', 'master'), remote: ('remote', 'slave') }
- else:
- rmap = { local: ('local', 'slave') }
- namedict = {}
- for i in range(len(rscs)):
- x = rscs[i]
- for name in rmap[x]:
- for j in range(3):
- namedict[mods[j](name)] = pa[j][i]
- if x.scheme == 'gluster':
- namedict[name + 'vol'] = x.volume
- if not 'config_file' in rconf:
- rconf['config_file'] = os.path.join(os.path.dirname(sys.argv[0]), "conf/gsyncd.conf")
- gcnf = GConffile(rconf['config_file'], canon_peers, defaults.__dict__, opts.__dict__, namedict)
-
- checkpoint_change = False
- if confdata:
- opt_ok = norm(confdata.opt) in tunables + [None]
- if confdata.op == 'check':
- if opt_ok:
- sys.exit(0)
- else:
- sys.exit(1)
- elif not opt_ok:
- raise GsyncdError("not a valid option: " + confdata.opt)
- if confdata.op == 'get':
- gcnf.get(confdata.opt)
- elif confdata.op == 'set':
- gcnf.set(confdata.opt, confdata.val, confdata.rx)
- elif confdata.op == 'del':
- gcnf.delete(confdata.opt, confdata.rx)
- # when modifying checkpoint, it's important to make a log
- # of that, so in that case we go on to set up logging even
- # if its just config invocation
- if confdata.opt == 'checkpoint' and confdata.op in ('set', 'del') and \
- not confdata.rx:
- checkpoint_change = True
- if not checkpoint_change:
- return
-
- gconf.__dict__.update(defaults.__dict__)
- gcnf.update_to(gconf.__dict__)
- gconf.__dict__.update(opts.__dict__)
- gconf.configinterface = gcnf
-
- if restricted and gconf.allow_network:
- ssh_conn = os.getenv('SSH_CONNECTION')
- if not ssh_conn:
- #legacy env var
- ssh_conn = os.getenv('SSH_CLIENT')
- if ssh_conn:
- allowed_networks = [ IPNetwork(a) for a in gconf.allow_network.split(',') ]
- client_ip = IPAddress(ssh_conn.split()[0])
- allowed = False
- for nw in allowed_networks:
- if client_ip in nw:
- allowed = True
- break
- if not allowed:
- raise GsyncdError("client IP address is not allowed")
-
- ffd = rconf.get('feedback_fd')
- if ffd:
- fcntl.fcntl(ffd, fcntl.F_SETFD, fcntl.FD_CLOEXEC)
-
- #normalize loglevel
- lvl0 = gconf.log_level
- if isinstance(lvl0, str):
- lvl1 = lvl0.upper()
- lvl2 = logging.getLevelName(lvl1)
- # I have _never_ _ever_ seen such an utterly braindead
- # error condition
- if lvl2 == "Level " + lvl1:
- raise GsyncdError('cannot recognize log level "%s"' % lvl0)
- gconf.log_level = lvl2
-
- if checkpoint_change:
- GLogger._gsyncd_loginit(log_file=gconf.log_file, label='conf')
- if confdata.op == 'set':
- logging.info('checkpoint %s set' % confdata.val)
- elif confdata.op == 'del':
- logging.info('checkpoint info was reset')
- return
-
- go_daemon = rconf['go_daemon']
- be_monitor = rconf.get('monitor')
-
- if not be_monitor and isinstance(remote, resource.SSH) and \
- go_daemon == 'should':
- go_daemon = 'postconn'
- log_file = None
- else:
- log_file = gconf.log_file
- if be_monitor:
- label = 'monitor'
- elif remote:
- #master
- label = ''
- else:
- label = 'slave'
- startup(go_daemon=go_daemon, log_file=log_file, label=label)
-
- if be_monitor:
- return monitor()
-
- logging.info("syncing: %s" % " -> ".join(peers))
- resource.Popen.init_errhandler()
- if remote:
- go_daemon = remote.connect_remote(go_daemon=go_daemon)
- if go_daemon:
- startup(go_daemon=go_daemon, log_file=gconf.log_file)
- # complete remote connection in child
- remote.connect_remote(go_daemon='done')
- local.connect()
- if ffd:
- os.close(ffd)
- local.service_loop(*[r for r in [remote] if r])
-
-
-if __name__ == "__main__":
- main()
diff --git a/xlators/features/marker/utils/syncdaemon/libcxattr.py b/xlators/features/marker/utils/syncdaemon/libcxattr.py
deleted file mode 100644
index f0a9d2292..000000000
--- a/xlators/features/marker/utils/syncdaemon/libcxattr.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import os
-from ctypes import *
-from ctypes.util import find_library
-
-class Xattr(object):
- """singleton that wraps the extended attribues system
- interface for python using ctypes
-
- Just implement it to the degree we need it, in particular
- - we need just the l*xattr variants, ie. we never want symlinks to be
- followed
- - don't need size discovery for getxattr, as we always know the exact
- sizes we expect
- """
-
- libc = CDLL(find_library("libc"))
-
- @classmethod
- def geterrno(cls):
- return c_int.in_dll(cls.libc, 'errno').value
-
- @classmethod
- def raise_oserr(cls):
- errn = cls.geterrno()
- raise OSError(errn, os.strerror(errn))
-
- @classmethod
- def _query_xattr(cls, path, siz, syscall, *a):
- if siz:
- buf = create_string_buffer('\0' * siz)
- else:
- buf = None
- ret = getattr(cls.libc, syscall)(*((path,) + a + (buf, siz)))
- if ret == -1:
- cls.raise_oserr()
- if siz:
- return buf.raw[:ret]
- else:
- return ret
-
- @classmethod
- def lgetxattr(cls, path, attr, siz=0):
- return cls._query_xattr( path, siz, 'lgetxattr', attr)
-
- @classmethod
- def llistxattr(cls, path, siz=0):
- ret = cls._query_xattr(path, siz, 'llistxattr')
- if isinstance(ret, str):
- ret = ret.split('\0')
- return ret
-
- @classmethod
- def lsetxattr(cls, path, attr, val):
- ret = cls.libc.lsetxattr(path, attr, val, len(val), 0)
- if ret == -1:
- cls.raise_oserr()
-
- @classmethod
- def lremovexattr(cls, path, attr):
- ret = cls.libc.lremovexattr(path, attr)
- if ret == -1:
- cls.raise_oserr()
-
- @classmethod
- def llistxattr_buf(cls, path):
- """listxattr variant with size discovery"""
- size = cls.llistxattr(path)
- if size == -1:
- cls.raise_oserr()
- if size == 0:
- return []
- return cls.llistxattr(path, size)
diff --git a/xlators/features/marker/utils/syncdaemon/master.py b/xlators/features/marker/utils/syncdaemon/master.py
deleted file mode 100644
index 336f59207..000000000
--- a/xlators/features/marker/utils/syncdaemon/master.py
+++ /dev/null
@@ -1,913 +0,0 @@
-import os
-import sys
-import time
-import stat
-import random
-import signal
-import logging
-import socket
-import errno
-from errno import ENOENT, ENODATA, EPIPE
-from threading import currentThread, Condition, Lock
-from datetime import datetime
-try:
- from hashlib import md5 as md5
-except ImportError:
- # py 2.4
- from md5 import new as md5
-
-from gconf import gconf
-from syncdutils import FreeObject, Thread, GsyncdError, boolify, \
- escape, unescape, select
-
-URXTIME = (-1, 0)
-
-# Utility functions to help us to get to closer proximity
-# of the DRY principle (no, don't look for elevated or
-# perspectivistic things here)
-
-def _xtime_now():
- t = time.time()
- sec = int(t)
- nsec = int((t - sec) * 1000000)
- return (sec, nsec)
-
-def _volinfo_hook_relax_foreign(self):
- volinfo_sys = self.get_sys_volinfo()
- fgn_vi = volinfo_sys[self.KFGN]
- if fgn_vi:
- expiry = fgn_vi['timeout'] - int(time.time()) + 1
- logging.info('foreign volume info found, waiting %d sec for expiry' % \
- expiry)
- time.sleep(expiry)
- volinfo_sys = self.get_sys_volinfo()
- self.volinfo_state, state_change = self.volinfo_state_machine(self.volinfo_state,
- volinfo_sys)
- if self.inter_master:
- raise GsyncdError("cannot be intermediate master in special mode")
- return (volinfo_sys, state_change)
-
-
-# The API!
-
-def gmaster_builder():
- """produce the GMaster class variant corresponding
- to sync mode"""
- this = sys.modules[__name__]
- modemixin = gconf.special_sync_mode
- if not modemixin:
- modemixin = 'normal'
- logging.info('setting up master for %s sync mode' % modemixin)
- modemixin = getattr(this, modemixin.capitalize() + 'Mixin')
- sendmarkmixin = boolify(gconf.use_rsync_xattrs) and SendmarkRsyncMixin or SendmarkNormalMixin
- purgemixin = boolify(gconf.ignore_deletes) and PurgeNoopMixin or PurgeNormalMixin
- class _GMaster(GMasterBase, modemixin, sendmarkmixin, purgemixin):
- pass
- return _GMaster
-
-
-# Mixin classes that implement the data format
-# and logic particularities of the certain
-# sync modes
-
-class NormalMixin(object):
- """normal geo-rep behavior"""
-
- minus_infinity = URXTIME
-
- # following staticmethods ideally would be
- # methods of an xtime object (in particular,
- # implementing the hooks needed for comparison
- # operators), but at this point we don't yet
- # have a dedicated xtime class
-
- @staticmethod
- def serialize_xtime(xt):
- return "%d.%d" % tuple(xt)
-
- @staticmethod
- def deserialize_xtime(xt):
- return tuple(int(x) for x in xt.split("."))
-
- @staticmethod
- def native_xtime(xt):
- return xt
-
- @staticmethod
- def xtime_geq(xt0, xt1):
- return xt0 >= xt1
-
- def make_xtime_opts(self, is_master, opts):
- if not 'create' in opts:
- opts['create'] = is_master and not self.inter_master
- if not 'default_xtime' in opts:
- if is_master and self.inter_master:
- opts['default_xtime'] = ENODATA
- else:
- opts['default_xtime'] = URXTIME
-
- def xtime_low(self, server, path, **opts):
- xt = server.xtime(path, self.uuid)
- if isinstance(xt, int) and xt != ENODATA:
- return xt
- if xt == ENODATA or xt < self.volmark:
- if opts['create']:
- xt = _xtime_now()
- server.set_xtime(path, self.uuid, xt)
- else:
- xt = opts['default_xtime']
- return xt
-
- def keepalive_payload_hook(self, timo, gap):
- # first grab a reference as self.volinfo
- # can be changed in main thread
- vi = self.volinfo
- if vi:
- # then have a private copy which we can mod
- vi = vi.copy()
- vi['timeout'] = int(time.time()) + timo
- else:
- # send keep-alives more frequently to
- # avoid a delay in announcing our volume info
- # to slave if it becomes established in the
- # meantime
- gap = min(10, gap)
- return (vi, gap)
-
- def volinfo_hook(self):
- volinfo_sys = self.get_sys_volinfo()
- self.volinfo_state, state_change = self.volinfo_state_machine(self.volinfo_state,
- volinfo_sys)
- return (volinfo_sys, state_change)
-
- def xtime_reversion_hook(self, path, xtl, xtr):
- if xtr > xtl:
- raise GsyncdError("timestamp corruption for " + path)
-
- def need_sync(self, e, xte, xtrd):
- return xte > xtrd
-
- def set_slave_xtime(self, path, mark):
- self.slave.server.set_xtime(path, self.uuid, mark)
-
-class WrapupMixin(NormalMixin):
- """a variant that differs from normal in terms
- of ignoring non-indexed files"""
-
- @staticmethod
- def make_xtime_opts(is_master, opts):
- if not 'create' in opts:
- opts['create'] = False
- if not 'default_xtime' in opts:
- opts['default_xtime'] = URXTIME
-
- @staticmethod
- def keepalive_payload_hook(timo, gap):
- return (None, gap)
-
- def volinfo_hook(self):
- return _volinfo_hook_relax_foreign(self)
-
-class BlindMixin(object):
- """Geo-rep flavor using vectored xtime.
-
- Coordinates are the master, slave uuid pair;
- in master coordinate behavior is normal,
- in slave coordinate we force synchronization
- on any value difference (these are in disjunctive
- relation, ie. if either orders the entry to be
- synced, it shall be synced.
- """
-
- minus_infinity = (URXTIME, None)
-
- @staticmethod
- def serialize_xtime(xt):
- a = []
- for x in xt:
- if not x:
- x = ('None', '')
- a.extend(x)
- return '.'.join(str(n) for n in a)
-
- @staticmethod
- def deserialize_xtime(xt):
- a = xt.split(".")
- a = (tuple(a[0:2]), tuple(a[3:4]))
- b = []
- for p in a:
- if p[0] == 'None':
- p = None
- else:
- p = tuple(int(x) for x in p)
- b.append(p)
- return tuple(b)
-
- @staticmethod
- def native_xtime(xt):
- return xt[0]
-
- @staticmethod
- def xtime_geq(xt0, xt1):
- return (not xt1[0] or xt0[0] >= xt1[0]) and \
- (not xt1[1] or xt0[1] >= xt1[1])
-
- @property
- def ruuid(self):
- if self.volinfo_r:
- return self.volinfo_r['uuid']
-
- @staticmethod
- def make_xtime_opts(is_master, opts):
- if not 'create' in opts:
- opts['create'] = is_master
- if not 'default_xtime' in opts:
- opts['default_xtime'] = URXTIME
-
- def xtime_low(self, server, path, **opts):
- xtd = server.xtime_vec(path, self.uuid, self.ruuid)
- if isinstance(xtd, int):
- return xtd
- xt = (xtd[self.uuid], xtd[self.ruuid])
- if not xt[1] and (not xt[0] or xt[0] < self.volmark):
- if opts['create']:
- # not expected, but can happen if file originates
- # from interrupted gsyncd transfer
- logging.warn('have to fix up missing xtime on ' + path)
- xt0 = _xtime_now()
- server.set_xtime(path, self.uuid, xt0)
- else:
- xt0 = opts['default_xtime']
- xt = (xt0, xt[1])
- return xt
-
- @staticmethod
- def keepalive_payload_hook(timo, gap):
- return (None, gap)
-
- def volinfo_hook(self):
- res = _volinfo_hook_relax_foreign(self)
- volinfo_r_new = self.slave.server.native_volume_info()
- if volinfo_r_new['retval']:
- raise GsyncdError("slave is corrupt")
- if getattr(self, 'volinfo_r', None):
- if self.volinfo_r['uuid'] != volinfo_r_new['uuid']:
- raise GsyncdError("uuid mismatch on slave")
- self.volinfo_r = volinfo_r_new
- return res
-
- def xtime_reversion_hook(self, path, xtl, xtr):
- if not isinstance(xtr[0], int) and \
- (isinstance(xtl[0], int) or xtr[0] > xtl[0]):
- raise GsyncdError("timestamp corruption for " + path)
-
- def need_sync(self, e, xte, xtrd):
- if xte[0]:
- if not xtrd[0] or xte[0] > xtrd[0]:
- # there is outstanding diff at 0th pos,
- # we can short-cut to true
- return True
- # we arrived to this point by either of these
- # two possiblilites:
- # - no outstanding difference at 0th pos,
- # wanna see 1st pos if he raises veto
- # against "no need to sync" proposal
- # - no data at 0th pos, 1st pos will have
- # to decide (due to xtime assignment,
- # in this case 1st pos does carry data
- # -- iow, if 1st pos did not have data,
- # and 0th neither, 0th would have been
- # force-feeded)
- if not xte[1]:
- # no data, no veto
- return False
- # the hard work: for 1st pos,
- # the conduct is fetch corresponding
- # slave data and do a "blind" comparison
- # (ie. do not care who is newer, we trigger
- # sync on non-identical xitmes)
- xtr = self.xtime(e, self.slave)
- return isinstance(xtr, int) or xte[1] != xtr[1]
-
- def set_slave_xtime(self, path, mark):
- xtd = {}
- for (u, t) in zip((self.uuid, self.ruuid), mark):
- if t:
- xtd[u] = t
- self.slave.server.set_xtime_vec(path, xtd)
-
-
-# Further mixins for certain tunable behaviors
-
-class SendmarkNormalMixin(object):
-
- def sendmark_regular(self, *a, **kw):
- return self.sendmark(*a, **kw)
-
-class SendmarkRsyncMixin(object):
-
- def sendmark_regular(self, *a, **kw):
- pass
-
-
-class PurgeNormalMixin(object):
-
- def purge_missing(self, path, names):
- self.slave.server.purge(path, names)
-
-class PurgeNoopMixin(object):
-
- def purge_missing(self, path, names):
- pass
-
-
-
-class GMasterBase(object):
- """abstract class impementling master role"""
-
- KFGN = 0
- KNAT = 1
-
- def get_sys_volinfo(self):
- """query volume marks on fs root
-
- err out on multiple foreign masters
- """
- fgn_vis, nat_vi = self.master.server.foreign_volume_infos(), \
- self.master.server.native_volume_info()
- fgn_vi = None
- if fgn_vis:
- if len(fgn_vis) > 1:
- raise GsyncdError("cannot work with multiple foreign masters")
- fgn_vi = fgn_vis[0]
- return fgn_vi, nat_vi
-
- @property
- def uuid(self):
- if self.volinfo:
- return self.volinfo['uuid']
-
- @property
- def volmark(self):
- if self.volinfo:
- return self.volinfo['volume_mark']
-
- @property
- def inter_master(self):
- """decide if we are an intermediate master
- in a cascading setup
- """
- return self.volinfo_state[self.KFGN] and True or False
-
- def xtime(self, path, *a, **opts):
- """get amended xtime
-
- as of amending, we can create missing xtime, or
- determine a valid value if what we get is expired
- (as of the volume mark expiry); way of amendig
- depends on @opts and on subject of query (master
- or slave).
- """
- if a:
- rsc = a[0]
- else:
- rsc = self.master
- self.make_xtime_opts(rsc == self.master, opts)
- return self.xtime_low(rsc.server, path, **opts)
-
- def __init__(self, master, slave):
- self.master = master
- self.slave = slave
- self.jobtab = {}
- self.syncer = Syncer(slave)
- # crawls vs. turns:
- # - self.crawls is simply the number of crawl() invocations on root
- # - one turn is a maximal consecutive sequence of crawls so that each
- # crawl in it detects a change to be synced
- # - self.turns is the number of turns since start
- # - self.total_turns is a limit so that if self.turns reaches it, then
- # we exit (for diagnostic purposes)
- # so, eg., if the master fs changes unceasingly, self.turns will remain 0.
- self.crawls = 0
- self.turns = 0
- self.total_turns = int(gconf.turns)
- self.lastreport = {'crawls': 0, 'turns': 0}
- self.start = None
- self.change_seen = None
- # the authoritative (foreign, native) volinfo pair
- # which lets us deduce what to do when we refetch
- # the volinfos from system
- uuid_preset = getattr(gconf, 'volume_id', None)
- self.volinfo_state = (uuid_preset and {'uuid': uuid_preset}, None)
- # the actual volinfo we make use of
- self.volinfo = None
- self.terminate = False
- self.checkpoint_thread = None
-
- @classmethod
- def _checkpt_param(cls, chkpt, prm, xtimish=True):
- """use config backend to lookup a parameter belonging to
- checkpoint @chkpt"""
- cprm = getattr(gconf, 'checkpoint_' + prm, None)
- if not cprm:
- return
- chkpt_mapped, val = cprm.split(':', 1)
- if unescape(chkpt_mapped) != chkpt:
- return
- if xtimish:
- val = cls.deserialize_xtime(val)
- return val
-
- @classmethod
- def _set_checkpt_param(cls, chkpt, prm, val, xtimish=True):
- """use config backend to store a parameter associated
- with checkpoint @chkpt"""
- if xtimish:
- val = cls.serialize_xtime(val)
- gconf.configinterface.set('checkpoint_' + prm, "%s:%s" % (escape(chkpt), val))
-
- @staticmethod
- def humantime(*tpair):
- """format xtime-like (sec, nsec) pair to human readable format"""
- ts = datetime.fromtimestamp(float('.'.join(str(n) for n in tpair))).\
- strftime("%Y-%m-%d %H:%M:%S")
- if len(tpair) > 1:
- ts += '.' + str(tpair[1])
- return ts
-
- def checkpt_service(self, chan, chkpt, tgt):
- """checkpoint service loop
-
- monitor and verify checkpoint status for @chkpt, and listen
- for incoming requests for whom we serve a pretty-formatted
- status report"""
- if not chkpt:
- # dummy loop for the case when there is no checkpt set
- while True:
- select([chan], [], [])
- conn, _ = chan.accept()
- conn.send('\0')
- conn.close()
- completed = self._checkpt_param(chkpt, 'completed', xtimish=False)
- if completed:
- completed = tuple(int(x) for x in completed.split('.'))
- while True:
- s,_,_ = select([chan], [], [], (not completed) and 5 or None)
- # either request made and we re-check to not
- # give back stale data, or we still hunting for completion
- if self.native_xtime(tgt) and self.native_xtime(tgt) < self.volmark:
- # indexing has been reset since setting the checkpoint
- status = "is invalid"
- else:
- xtr = self.xtime('.', self.slave)
- if isinstance(xtr, int):
- raise GsyncdError("slave root directory is unaccessible (%s)",
- os.strerror(xtr))
- ncompleted = self.xtime_geq(xtr, tgt)
- if completed and not ncompleted: # stale data
- logging.warn("completion time %s for checkpoint %s became stale" % \
- (self.humantime(*completed), chkpt))
- completed = None
- gconf.confdata.delete('checkpoint-completed')
- if ncompleted and not completed: # just reaching completion
- completed = "%.6f" % time.time()
- self._set_checkpt_param(chkpt, 'completed', completed, xtimish=False)
- completed = tuple(int(x) for x in completed.split('.'))
- logging.info("checkpoint %s completed" % chkpt)
- status = completed and \
- "completed at " + self.humantime(completed[0]) or \
- "not reached yet"
- if s:
- conn = None
- try:
- conn, _ = chan.accept()
- try:
- conn.send(" | checkpoint %s %s\0" % (chkpt, status))
- except:
- exc = sys.exc_info()[1]
- if (isinstance(exc, OSError) or isinstance(exc, IOError)) and \
- exc.errno == EPIPE:
- logging.debug('checkpoint client disconnected')
- else:
- raise
- finally:
- if conn:
- conn.close()
-
- def start_checkpoint_thread(self):
- """prepare and start checkpoint service"""
- if self.checkpoint_thread or not getattr(gconf, 'state_socket_unencoded', None):
- return
- chan = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
- state_socket = "/tmp/%s.socket" % md5(gconf.state_socket_unencoded).hexdigest()
- try:
- os.unlink(state_socket)
- except:
- if sys.exc_info()[0] == OSError:
- pass
- chan.bind(state_socket)
- chan.listen(1)
- checkpt_tgt = None
- if gconf.checkpoint:
- checkpt_tgt = self._checkpt_param(gconf.checkpoint, 'target')
- if not checkpt_tgt:
- checkpt_tgt = self.xtime('.')
- if isinstance(checkpt_tgt, int):
- raise GsyncdError("master root directory is unaccessible (%s)",
- os.strerror(checkpt_tgt))
- self._set_checkpt_param(gconf.checkpoint, 'target', checkpt_tgt)
- logging.debug("checkpoint target %s has been determined for checkpoint %s" % \
- (repr(checkpt_tgt), gconf.checkpoint))
- t = Thread(target=self.checkpt_service, args=(chan, gconf.checkpoint, checkpt_tgt))
- t.start()
- self.checkpoint_thread = t
-
- def crawl_loop(self):
- """start the keep-alive thread and iterate .crawl"""
- timo = int(gconf.timeout or 0)
- if timo > 0:
- def keep_alive():
- while True:
- vi, gap = self.keepalive_payload_hook(timo, timo * 0.5)
- self.slave.server.keep_alive(vi)
- time.sleep(gap)
- t = Thread(target=keep_alive)
- t.start()
- self.lastreport['time'] = time.time()
- while not self.terminate:
- self.crawl()
-
- def add_job(self, path, label, job, *a, **kw):
- """insert @job function to job table at @path with @label"""
- if self.jobtab.get(path) == None:
- self.jobtab[path] = []
- self.jobtab[path].append((label, a, lambda : job(*a, **kw)))
-
- def add_failjob(self, path, label):
- """invoke .add_job with a job that does nothing just fails"""
- logging.debug('salvaged: ' + label)
- self.add_job(path, label, lambda: False)
-
- def wait(self, path, *args):
- """perform jobs registered for @path
-
- Reset jobtab entry for @path,
- determine success as the conjuction of
- success of all the jobs. In case of
- success, call .sendmark on @path
- """
- jobs = self.jobtab.pop(path, [])
- succeed = True
- for j in jobs:
- ret = j[-1]()
- if not ret:
- succeed = False
- if succeed:
- self.sendmark(path, *args)
- return succeed
-
- def sendmark(self, path, mark, adct=None):
- """update slave side xtime for @path to master side xtime
-
- also can send a setattr payload (see Server.setattr).
- """
- if adct:
- self.slave.server.setattr(path, adct)
- self.set_slave_xtime(path, mark)
-
- @staticmethod
- def volinfo_state_machine(volinfo_state, volinfo_sys):
- """compute new volinfo_state from old one and incoming
- as of current system state, also indicating if there was a
- change regarding which volume mark is the authoritative one
-
- @volinfo_state, @volinfo_sys are pairs of volume mark dicts
- (foreign, native).
-
- Note this method is marked as static, ie. the computation is
- pure, without reliance on any excess implicit state. State
- transitions which are deemed as ambiguous or banned will raise
- an exception.
-
- """
- # store the value below "boxed" to emulate proper closures
- # (variables of the enclosing scope are available inner functions
- # provided they are no reassigned; mutation is OK).
- param = FreeObject(relax_mismatch = False, state_change = None, index=-1)
- def select_vi(vi0, vi):
- param.index += 1
- if vi and (not vi0 or vi0['uuid'] == vi['uuid']):
- if not vi0 and not param.relax_mismatch:
- param.state_change = param.index
- # valid new value found; for the rest, we are graceful about
- # uuid mismatch
- param.relax_mismatch = True
- return vi
- if vi0 and vi and vi0['uuid'] != vi['uuid'] and not param.relax_mismatch:
- # uuid mismatch for master candidate, bail out
- raise GsyncdError("aborting on uuid change from %s to %s" % \
- (vi0['uuid'], vi['uuid']))
- # fall back to old
- return vi0
- newstate = tuple(select_vi(*vip) for vip in zip(volinfo_state, volinfo_sys))
- srep = lambda vi: vi and vi['uuid'][0:8]
- logging.debug('(%s, %s) << (%s, %s) -> (%s, %s)' % \
- tuple(srep(vi) for vi in volinfo_state + volinfo_sys + newstate))
- return newstate, param.state_change
-
- def crawl(self, path='.', xtl=None):
- """crawling...
-
- Standing around
- All the right people
- Crawling
- Tennis on Tuesday
- The ladder is long
- It is your nature
- You've gotta suntan
- Football on Sunday
- Society boy
-
- Recursively walk the master side tree and check if updates are
- needed due to xtime differences. One invocation of crawl checks
- children of @path and do a recursive enter only on
- those directory children where there is an update needed.
-
- Way of updates depend on file type:
- - for symlinks, sync them directy and synchronously
- - for regular children, register jobs for @path (cf. .add_job) to start
- and wait on their rsync
- - for directory children, register a job for @path which waits (.wait)
- on jobs for the given child
- (other kind of filesystem nodes are not considered)
-
- Those slave side children which do not exist on master are simply
- purged (see Server.purge).
-
- Behavior is fault tolerant, synchronization is adaptive: if some action fails,
- just go on relentlessly, adding a fail job (see .add_failjob) which will prevent
- the .sendmark on @path, so when the next crawl will arrive to @path it will not
- see it as up-to-date and will try to sync it again. While this semantics can be
- supported by funky design principles (http://c2.com/cgi/wiki?LazinessImpatienceHubris),
- the ultimate reason which excludes other possibilities is simply transience: we cannot
- assert that the file systems (master / slave) underneath do not change and actions
- taken upon some condition will not lose their context by the time they are performed.
- """
- if path == '.':
- if self.start:
- self.crawls += 1
- logging.debug("... crawl #%d done, took %.6f seconds" % \
- (self.crawls, time.time() - self.start))
- time.sleep(1)
- self.start = time.time()
- should_display_info = self.start - self.lastreport['time'] >= 60
- if should_display_info:
- logging.info("completed %d crawls, %d turns",
- self.crawls - self.lastreport['crawls'],
- self.turns - self.lastreport['turns'])
- self.lastreport.update(crawls = self.crawls,
- turns = self.turns,
- time = self.start)
- volinfo_sys, state_change = self.volinfo_hook()
- if self.inter_master:
- self.volinfo = volinfo_sys[self.KFGN]
- else:
- self.volinfo = volinfo_sys[self.KNAT]
- if state_change == self.KFGN or (state_change == self.KNAT and not self.inter_master):
- logging.info('new master is %s', self.uuid)
- if self.volinfo:
- logging.info("%s master with volume id %s ..." % \
- (self.inter_master and "intermediate" or "primary",
- self.uuid))
- if state_change == self.KFGN:
- gconf.configinterface.set('volume_id', self.uuid)
- if self.volinfo:
- if self.volinfo['retval']:
- raise GsyncdError ("master is corrupt")
- self.start_checkpoint_thread()
- else:
- if should_display_info or self.crawls == 0:
- if self.inter_master:
- logging.info("waiting for being synced from %s ..." % \
- self.volinfo_state[self.KFGN]['uuid'])
- else:
- logging.info("waiting for volume info ...")
- return
- logging.debug("entering " + path)
- if not xtl:
- xtl = self.xtime(path)
- if isinstance(xtl, int):
- self.add_failjob(path, 'no-local-node')
- return
- xtr = self.xtime(path, self.slave)
- if isinstance(xtr, int):
- if xtr != ENOENT:
- self.slave.server.purge(path)
- try:
- self.slave.server.mkdir(path)
- except OSError:
- self.add_failjob(path, 'no-remote-node')
- return
- xtr = self.minus_infinity
- else:
- self.xtime_reversion_hook(path, xtl, xtr)
- if xtl == xtr:
- if path == '.' and self.change_seen:
- self.turns += 1
- self.change_seen = False
- if self.total_turns:
- logging.info("finished turn #%s/%s" % \
- (self.turns, self.total_turns))
- if self.turns == self.total_turns:
- logging.info("reached turn limit")
- self.terminate = True
- return
- if path == '.':
- self.change_seen = True
- try:
- dem = self.master.server.entries(path)
- except OSError:
- self.add_failjob(path, 'local-entries-fail')
- return
- random.shuffle(dem)
- try:
- des = self.slave.server.entries(path)
- except OSError:
- self.slave.server.purge(path)
- try:
- self.slave.server.mkdir(path)
- des = self.slave.server.entries(path)
- except OSError:
- self.add_failjob(path, 'remote-entries-fail')
- return
- dd = set(des) - set(dem)
- if dd:
- self.purge_missing(path, dd)
- chld = []
- for e in dem:
- e = os.path.join(path, e)
- xte = self.xtime(e)
- if isinstance(xte, int):
- logging.warn("irregular xtime for %s: %s" % (e, errno.errorcode[xte]))
- elif self.need_sync(e, xte, xtr):
- chld.append((e, xte))
- def indulgently(e, fnc, blame=None):
- if not blame:
- blame = path
- try:
- return fnc(e)
- except (IOError, OSError):
- ex = sys.exc_info()[1]
- if ex.errno == ENOENT:
- logging.warn("salvaged ENOENT for " + e)
- self.add_failjob(blame, 'by-indulgently')
- return False
- else:
- raise
- for e, xte in chld:
- st = indulgently(e, lambda e: os.lstat(e))
- if st == False:
- continue
- mo = st.st_mode
- adct = {'own': (st.st_uid, st.st_gid)}
- if stat.S_ISLNK(mo):
- if indulgently(e, lambda e: self.slave.server.symlink(os.readlink(e), e)) == False:
- continue
- self.sendmark(e, xte, adct)
- elif stat.S_ISREG(mo):
- logging.debug("syncing %s ..." % e)
- pb = self.syncer.add(e)
- def regjob(e, xte, pb):
- if pb.wait():
- logging.debug("synced " + e)
- self.sendmark_regular(e, xte)
- return True
- else:
- logging.warn("failed to sync " + e)
- self.add_job(path, 'reg', regjob, e, xte, pb)
- elif stat.S_ISDIR(mo):
- adct['mode'] = mo
- if indulgently(e, lambda e: (self.add_job(path, 'cwait', self.wait, e, xte, adct),
- self.crawl(e, xte),
- True)[-1], blame=e) == False:
- continue
- else:
- # ignore fifos, sockets and special files
- pass
- if path == '.':
- self.wait(path, xtl)
-
-class BoxClosedErr(Exception):
- pass
-
-class PostBox(list):
- """synchronized collection for storing things thought of as "requests" """
-
- def __init__(self, *a):
- list.__init__(self, *a)
- # too bad Python stdlib does not have read/write locks...
- # it would suffivce to grab the lock in .append as reader, in .close as writer
- self.lever = Condition()
- self.open = True
- self.done = False
-
- def wait(self):
- """wait on requests to be processed"""
- self.lever.acquire()
- if not self.done:
- self.lever.wait()
- self.lever.release()
- return self.result
-
- def wakeup(self, data):
- """wake up requestors with the result"""
- self.result = data
- self.lever.acquire()
- self.done = True
- self.lever.notifyAll()
- self.lever.release()
-
- def append(self, e):
- """post a request"""
- self.lever.acquire()
- if not self.open:
- raise BoxClosedErr
- list.append(self, e)
- self.lever.release()
-
- def close(self):
- """prohibit the posting of further requests"""
- self.lever.acquire()
- self.open = False
- self.lever.release()
-
-class Syncer(object):
- """a staged queue to relay rsync requests to rsync workers
-
- By "staged queue" its meant that when a consumer comes to the
- queue, it takes _all_ entries, leaving the queue empty.
- (I don't know if there is an official term for this pattern.)
-
- The queue uses a PostBox to accumulate incoming items.
- When a consumer (rsync worker) comes, a new PostBox is
- set up and the old one is passed on to the consumer.
-
- Instead of the simplistic scheme of having one big lock
- which synchronizes both the addition of new items and
- PostBox exchanges, use a separate lock to arbitrate consumers,
- and rely on PostBox's synchronization mechanisms take
- care about additions.
-
- There is a corner case racy situation, producers vs. consumers,
- which is not handled by this scheme: namely, when the PostBox
- exchange occurs in between being passed to the producer for posting
- and the post placement. But that's what Postbox.close is for:
- such a posting will find the PostBox closed, in which case
- the producer can re-try posting against the actual PostBox of
- the queue.
-
- To aid accumlation of items in the PostBoxen before grabbed
- by an rsync worker, the worker goes to sleep a bit after
- each completed syncjob.
- """
-
- def __init__(self, slave):
- """spawn worker threads"""
- self.slave = slave
- self.lock = Lock()
- self.pb = PostBox()
- for i in range(int(gconf.sync_jobs)):
- t = Thread(target=self.syncjob)
- t.start()
-
- def syncjob(self):
- """the life of a worker"""
- while True:
- pb = None
- while True:
- self.lock.acquire()
- if self.pb:
- pb, self.pb = self.pb, PostBox()
- self.lock.release()
- if pb:
- break
- time.sleep(0.5)
- pb.close()
- po = self.slave.rsync(pb)
- if po.returncode == 0:
- ret = True
- elif po.returncode in (23, 24):
- # partial transfer (cf. rsync(1)), that's normal
- ret = False
- else:
- po.errfail()
- pb.wakeup(ret)
-
- def add(self, e):
- while True:
- pb = self.pb
- try:
- pb.append(e)
- return pb
- except BoxClosedErr:
- pass
diff --git a/xlators/features/marker/utils/syncdaemon/monitor.py b/xlators/features/marker/utils/syncdaemon/monitor.py
deleted file mode 100644
index b8956dcc2..000000000
--- a/xlators/features/marker/utils/syncdaemon/monitor.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import os
-import sys
-import time
-import signal
-import logging
-from gconf import gconf
-from syncdutils import update_file, select, waitpid, set_term_handler
-
-class Monitor(object):
- """class which spawns and manages gsyncd workers"""
-
- def __init__(self):
- self.state = None
-
- def set_state(self, state):
- """set the state that can be used by external agents
- like glusterd for status reporting"""
- if state == self.state:
- return
- self.state = state
- logging.info('new state: %s' % state)
- if getattr(gconf, 'state_file', None):
- update_file(gconf.state_file, lambda f: f.write(state + '\n'))
-
- def monitor(self):
- """the monitor loop
-
- Basic logic is a blantantly simple blunt heuristics:
- if spawned client survives 60 secs, it's considered OK.
- This servers us pretty well as it's not vulneralbe to
- any kind of irregular behavior of the child...
-
- ... well, except for one: if children is hung up on
- waiting for some event, it can survive aeons, still
- will be defunct. So we tweak the above logic to
- expect the worker to send us a signal within 60 secs
- (in the form of closing its end of a pipe). The worker
- does this when it's done with the setup stage
- ready to enter the service loop (note it's the setup
- stage which is vulnerable to hangs -- the full
- blown worker blows up on EPIPE if the net goes down,
- due to the keep-alive thread)
- """
- def sigcont_handler(*a):
- """
- Re-init logging and send group kill signal
- """
- md = gconf.log_metadata
- logging.shutdown()
- lcls = logging.getLoggerClass()
- lcls.setup(label=md.get('saved_label'), **md)
- pid = os.getpid()
- os.kill(-pid, signal.SIGUSR1)
- signal.signal(signal.SIGUSR1, lambda *a: ())
- signal.signal(signal.SIGCONT, sigcont_handler)
-
- argv = sys.argv[:]
- for o in ('-N', '--no-daemon', '--monitor'):
- while o in argv:
- argv.remove(o)
- argv.extend(('-N', '-p', ''))
- argv.insert(0, os.path.basename(sys.executable))
-
- self.set_state('starting...')
- ret = 0
- def nwait(p, o=0):
- p2, r = waitpid(p, o)
- if not p2:
- return
- return r
- def exit_signalled(s):
- """ child teminated due to receipt of SIGUSR1 """
- return (os.WIFSIGNALED(s) and (os.WTERMSIG(s) == signal.SIGUSR1))
- def exit_status(s):
- if os.WIFEXITED(s):
- return os.WEXITSTATUS(s)
- return 1
- conn_timeout = int(gconf.connection_timeout)
- while ret in (0, 1):
- logging.info('-' * conn_timeout)
- logging.info('starting gsyncd worker')
- pr, pw = os.pipe()
- cpid = os.fork()
- if cpid == 0:
- os.close(pr)
- os.execv(sys.executable, argv + ['--feedback-fd', str(pw)])
- os.close(pw)
- t0 = time.time()
- so = select((pr,), (), (), conn_timeout)[0]
- os.close(pr)
- if so:
- ret = nwait(cpid, os.WNOHANG)
- if ret != None:
- logging.debug("worker died before establishing connection")
- else:
- logging.debug("worker seems to be connected (?? racy check)")
- while time.time() < t0 + conn_timeout:
- ret = nwait(cpid, os.WNOHANG)
- if ret != None:
- logging.debug("worker died in startup phase")
- break
- time.sleep(1)
- else:
- logging.debug("worker not confirmed in %d sec, aborting it" % \
- conn_timeout)
- # relax one SIGTERM by setting a handler that sets back
- # standard handler
- set_term_handler(lambda *a: set_term_handler())
- # give a chance to graceful exit
- os.kill(-os.getpid(), signal.SIGTERM)
- time.sleep(1)
- os.kill(cpid, signal.SIGKILL)
- ret = nwait(cpid)
- if ret == None:
- self.set_state('OK')
- ret = nwait(cpid)
- if exit_signalled(ret):
- ret = 0
- else:
- ret = exit_status(ret)
- if ret in (0,1):
- self.set_state('faulty')
- time.sleep(10)
- self.set_state('inconsistent')
- return ret
-
-def monitor():
- """oh yeah, actually Monitor is used as singleton, too"""
- return Monitor().monitor()
diff --git a/xlators/features/marker/utils/syncdaemon/repce.py b/xlators/features/marker/utils/syncdaemon/repce.py
deleted file mode 100644
index 755fb61df..000000000
--- a/xlators/features/marker/utils/syncdaemon/repce.py
+++ /dev/null
@@ -1,225 +0,0 @@
-import os
-import sys
-import time
-import logging
-from threading import Condition
-try:
- import thread
-except ImportError:
- # py 3
- import _thread as thread
-try:
- from Queue import Queue
-except ImportError:
- # py 3
- from queue import Queue
-try:
- import cPickle as pickle
-except ImportError:
- # py 3
- import pickle
-
-from syncdutils import Thread, select
-
-pickle_proto = -1
-repce_version = 1.0
-
-def ioparse(i, o):
- if isinstance(i, int):
- i = os.fdopen(i)
- # rely on duck typing for recognizing
- # streams as that works uniformly
- # in py2 and py3
- if hasattr(o, 'fileno'):
- o = o.fileno()
- return (i, o)
-
-def send(out, *args):
- """pickle args and write out wholly in one syscall
-
- ie. not use the ability of pickle to dump directly to
- a stream, as that would potentially mess up messages
- by interleaving them
- """
- os.write(out, pickle.dumps(args, pickle_proto))
-
-def recv(inf):
- """load an object from input stream"""
- return pickle.load(inf)
-
-
-class RepceServer(object):
- """RePCe is Hungarian for canola, http://hu.wikipedia.org/wiki/Repce
-
- ... also our homebrewed RPC backend where the transport layer is
- reduced to a pair of filehandles.
-
- This is the server component.
- """
-
- def __init__(self, obj, i, o, wnum=6):
- """register a backend object .obj to which incoming messages
- are dispatched, also incoming/outcoming streams
- """
- self.obj = obj
- self.inf, self.out = ioparse(i, o)
- self.wnum = wnum
- self.q = Queue()
-
- def service_loop(self):
- """fire up worker threads, get messages and dispatch among them"""
- for i in range(self.wnum):
- t = Thread(target=self.worker)
- t.start()
- try:
- while True:
- self.q.put(recv(self.inf))
- except EOFError:
- logging.info("terminating on reaching EOF.")
-
- def worker(self):
- """life of a worker
-
- Get message, extract its id, method name and arguments
- (kwargs not supported), call method on .obj.
- Send back message id + return value.
- If method call throws an exception, rescue it, and send
- back the exception as result (with flag marking it as
- exception).
- """
- while True:
- in_data = self.q.get(True)
- rid = in_data[0]
- rmeth = in_data[1]
- exc = False
- if rmeth == '__repce_version__':
- res = repce_version
- else:
- try:
- res = getattr(self.obj, rmeth)(*in_data[2:])
- except:
- res = sys.exc_info()[1]
- exc = True
- logging.exception("call failed: ")
- send(self.out, rid, exc, res)
-
-
-class RepceJob(object):
- """class representing message status we can use
- for waiting on reply"""
-
- def __init__(self, cbk):
- """
- - .rid: (process-wise) unique id
- - .cbk: what we do upon receiving reply
- """
- self.rid = (os.getpid(), thread.get_ident(), time.time())
- self.cbk = cbk
- self.lever = Condition()
- self.done = False
-
- def __repr__(self):
- return ':'.join([str(x) for x in self.rid])
-
- def wait(self):
- self.lever.acquire()
- if not self.done:
- self.lever.wait()
- self.lever.release()
- return self.result
-
- def wakeup(self, data):
- self.result = data
- self.lever.acquire()
- self.done = True
- self.lever.notify()
- self.lever.release()
-
-
-class RepceClient(object):
- """RePCe is Hungarian for canola, http://hu.wikipedia.org/wiki/Repce
-
- ... also our homebrewed RPC backend where the transport layer is
- reduced to a pair of filehandles.
-
- This is the client component.
- """
-
- def __init__(self, i, o):
- self.inf, self.out = ioparse(i, o)
- self.jtab = {}
- t = Thread(target = self.listen)
- t.start()
-
- def listen(self):
- while True:
- select((self.inf,), (), ())
- rid, exc, res = recv(self.inf)
- rjob = self.jtab.pop(rid)
- if rjob.cbk:
- rjob.cbk(rjob, [exc, res])
-
- def push(self, meth, *args, **kw):
- """wrap arguments in a RepceJob, send them to server
- and return the RepceJob
-
- @cbk to pass on RepceJob can be given as kwarg.
- """
- cbk = kw.get('cbk')
- if not cbk:
- def cbk(rj, res):
- if res[0]:
- raise res[1]
- rjob = RepceJob(cbk)
- self.jtab[rjob.rid] = rjob
- logging.debug("call %s %s%s ..." % (repr(rjob), meth, repr(args)))
- send(self.out, rjob.rid, meth, *args)
- return rjob
-
- def __call__(self, meth, *args):
- """RePCe client is callabe, calling it implements a synchronous remote call
-
- We do a .push with a cbk which does a wakeup upon receiving anwser, then wait
- on the RepceJob.
- """
- rjob = self.push(meth, *args, **{'cbk': lambda rj, res: rj.wakeup(res)})
- exc, res = rjob.wait()
- if exc:
- logging.error('call %s (%s) failed on peer with %s' % (repr(rjob), meth, str(type(res).__name__)))
- raise res
- logging.debug("call %s %s -> %s" % (repr(rjob), meth, repr(res)))
- return res
-
- class mprx(object):
- """method proxy, standard trick to implement rubyesque method_missing
- in Python
-
- A class is a closure factory, you know what I mean, or go read some SICP.
- """
- def __init__(self, ins, meth):
- self.ins = ins
- self.meth = meth
-
- def __call__(self, *a):
- return self.ins(self.meth, *a)
-
- def __getattr__(self, meth):
- """this implements transparent method dispatch to remote object,
- so that you don't need to call the RepceClient instance like
-
- rclient('how_old_are_you_if_born_in', 1979)
-
- but you can make it into an ordinary method call like
-
- rclient.how_old_are_you_if_born_in(1979)
- """
- return self.mprx(self, meth)
-
- def __version__(self):
- """used in handshake to verify compatibility"""
- d = {'proto': self('__repce_version__')}
- try:
- d['object'] = self('version')
- except AttributeError:
- pass
- return d
diff --git a/xlators/features/marker/utils/syncdaemon/resource.py b/xlators/features/marker/utils/syncdaemon/resource.py
deleted file mode 100644
index 7eced825c..000000000
--- a/xlators/features/marker/utils/syncdaemon/resource.py
+++ /dev/null
@@ -1,970 +0,0 @@
-import re
-import os
-import sys
-import stat
-import time
-import fcntl
-import errno
-import struct
-import socket
-import logging
-import tempfile
-import threading
-import subprocess
-from errno import EEXIST, ENOENT, ENODATA, ENOTDIR, ELOOP, EISDIR
-from select import error as selecterror
-
-from gconf import gconf
-import repce
-from repce import RepceServer, RepceClient
-from master import gmaster_builder
-import syncdutils
-from syncdutils import GsyncdError, select, privileged, boolify
-
-UrlRX = re.compile('\A(\w+)://([^ *?[]*)\Z')
-HostRX = re.compile('[a-z\d](?:[a-z\d.-]*[a-z\d])?', re.I)
-UserRX = re.compile("[\w!\#$%&'*+-\/=?^_`{|}~]+")
-
-def sup(x, *a, **kw):
- """a rubyesque "super" for python ;)
-
- invoke caller method in parent class with given args.
- """
- return getattr(super(type(x), x), sys._getframe(1).f_code.co_name)(*a, **kw)
-
-def desugar(ustr):
- """transform sugared url strings to standard <scheme>://<urlbody> form
-
- parsing logic enforces the constraint that sugared forms should contatin
- a ':' or a '/', which ensures that sugared urls do not conflict with
- gluster volume names.
- """
- m = re.match('([^:]*):(.*)', ustr)
- if m:
- if not m.groups()[0]:
- return "gluster://localhost" + ustr
- elif '@' in m.groups()[0] or re.search('[:/]', m.groups()[1]):
- return "ssh://" + ustr
- else:
- return "gluster://" + ustr
- else:
- if ustr[0] != '/':
- raise GsyncdError("cannot resolve sugared url '%s'" % ustr)
- ap = os.path.normpath(ustr)
- if ap.startswith('//'):
- ap = ap[1:]
- return "file://" + ap
-
-def gethostbyname(hnam):
- """gethostbyname wrapper"""
- try:
- return socket.gethostbyname(hnam)
- except socket.gaierror:
- ex = sys.exc_info()[1]
- raise GsyncdError("failed to resolve %s: %s" % \
- (hnam, ex.strerror))
-
-def parse_url(ustr):
- """instantiate an url object by scheme-to-class dispatch
-
- The url classes taken into consideration are the ones in
- this module whose names are full-caps.
- """
- m = UrlRX.match(ustr)
- if not m:
- ustr = desugar(ustr)
- m = UrlRX.match(ustr)
- if not m:
- raise GsyncdError("malformed url")
- sch, path = m.groups()
- this = sys.modules[__name__]
- if not hasattr(this, sch.upper()):
- raise GsyncdError("unknown url scheme " + sch)
- return getattr(this, sch.upper())(path)
-
-
-class _MetaXattr(object):
- """singleton class, a lazy wrapper around the
- libcxattr module
-
- libcxattr (a heavy import due to ctypes) is
- loaded only when when the single
- instance is tried to be used.
-
- This reduces runtime for those invocations
- which do not need filesystem manipulation
- (eg. for config, url parsing)
- """
-
- def __getattr__(self, meth):
- from libcxattr import Xattr as LXattr
- xmeth = [ m for m in dir(LXattr) if m[0] != '_' ]
- if not meth in xmeth:
- return
- for m in xmeth:
- setattr(self, m, getattr(LXattr, m))
- return getattr(self, meth)
-
-Xattr = _MetaXattr()
-
-
-class Popen(subprocess.Popen):
- """customized subclass of subprocess.Popen with a ring
- buffer for children error output"""
-
- @classmethod
- def init_errhandler(cls):
- """start the thread which handles children's error output"""
- cls.errstore = {}
- def tailer():
- while True:
- errstore = cls.errstore.copy()
- try:
- poe, _ ,_ = select([po.stderr for po in errstore], [], [], 1)
- except ValueError, selecterror:
- continue
- for po in errstore:
- if po.stderr not in poe:
- continue
- po.lock.acquire()
- try:
- if po.on_death_row:
- continue
- la = errstore[po]
- try:
- fd = po.stderr.fileno()
- except ValueError: # file is already closed
- continue
- l = os.read(fd, 1024)
- if not l:
- continue
- tots = len(l)
- for lx in la:
- tots += len(lx)
- while tots > 1<<20 and la:
- tots -= len(la.pop(0))
- la.append(l)
- finally:
- po.lock.release()
- t = syncdutils.Thread(target = tailer)
- t.start()
- cls.errhandler = t
-
- @classmethod
- def fork(cls):
- """fork wrapper that restarts errhandler thread in child"""
- pid = os.fork()
- if not pid:
- cls.init_errhandler()
- return pid
-
- def __init__(self, args, *a, **kw):
- """customizations for subprocess.Popen instantiation
-
- - 'close_fds' is taken to be the default
- - if child's stderr is chosen to be managed,
- register it with the error handler thread
- """
- self.args = args
- if 'close_fds' not in kw:
- kw['close_fds'] = True
- self.lock = threading.Lock()
- self.on_death_row = False
- try:
- sup(self, args, *a, **kw)
- except:
- ex = sys.exc_info()[1]
- if not isinstance(ex, OSError):
- raise
- raise GsyncdError("""execution of "%s" failed with %s (%s)""" % \
- (args[0], errno.errorcode[ex.errno], os.strerror(ex.errno)))
- if kw.get('stderr') == subprocess.PIPE:
- assert(getattr(self, 'errhandler', None))
- self.errstore[self] = []
-
- def errlog(self):
- """make a log about child's failure event"""
- filling = ""
- if self.elines:
- filling = ", saying:"
- logging.error("""command "%s" returned with %s%s""" % \
- (" ".join(self.args), repr(self.returncode), filling))
- lp = ''
- def logerr(l):
- logging.error(self.args[0] + "> " + l)
- for l in self.elines:
- ls = l.split('\n')
- ls[0] = lp + ls[0]
- lp = ls.pop()
- for ll in ls:
- logerr(ll)
- if lp:
- logerr(lp)
-
- def errfail(self):
- """fail nicely if child did not terminate with success"""
- self.errlog()
- syncdutils.finalize(exval = 1)
-
- def terminate_geterr(self, fail_on_err = True):
- """kill child, finalize stderr harvesting (unregister
- from errhandler, set up .elines), fail on error if
- asked for
- """
- self.lock.acquire()
- try:
- self.on_death_row = True
- finally:
- self.lock.release()
- elines = self.errstore.pop(self)
- if self.poll() == None:
- self.terminate()
- if self.poll() == None:
- time.sleep(0.1)
- self.kill()
- self.wait()
- while True:
- if not select([self.stderr],[],[],0.1)[0]:
- break
- b = os.read(self.stderr.fileno(), 1024)
- if b:
- elines.append(b)
- else:
- break
- self.stderr.close()
- self.elines = elines
- if fail_on_err and self.returncode != 0:
- self.errfail()
-
-
-class Server(object):
- """singleton implemening those filesystem access primitives
- which are needed for geo-replication functionality
-
- (Singleton in the sense it's a class which has only static
- and classmethods and is used directly, without instantiation.)
- """
-
- GX_NSPACE = (privileged() and "trusted" or "system") + ".glusterfs"
- NTV_FMTSTR = "!" + "B"*19 + "II"
- FRGN_XTRA_FMT = "I"
- FRGN_FMTSTR = NTV_FMTSTR + FRGN_XTRA_FMT
-
- def _pathguard(f):
- """decorator method that checks
- the path argument of the decorated
- functions to make sure it does not
- point out of the managed tree
- """
-
- fc = getattr(f, 'func_code', None)
- if not fc:
- # python 3
- fc = f.__code__
- pi = list(fc.co_varnames).index('path')
- def ff(*a):
- path = a[pi]
- ps = path.split('/')
- if path[0] == '/' or '..' in ps:
- raise ValueError('unsafe path')
- return f(*a)
- return ff
-
- @staticmethod
- @_pathguard
- def entries(path):
- """directory entries in an array"""
- # prevent symlinks being followed
- if not stat.S_ISDIR(os.lstat(path).st_mode):
- raise OSError(ENOTDIR, os.strerror(ENOTDIR))
- return os.listdir(path)
-
- @classmethod
- @_pathguard
- def purge(cls, path, entries=None):
- """force-delete subtrees
-
- If @entries is not specified, delete
- the whole subtree under @path (including
- @path).
-
- Otherwise, @entries should be a
- a sequence of children of @path, and
- the effect is identical with a joint
- @entries-less purge on them, ie.
-
- for e in entries:
- cls.purge(os.path.join(path, e))
- """
- me_also = entries == None
- if not entries:
- try:
- # if it's a symlink, prevent
- # following it
- try:
- os.unlink(path)
- return
- except OSError:
- ex = sys.exc_info()[1]
- if ex.errno == EISDIR:
- entries = os.listdir(path)
- else:
- raise
- except OSError:
- ex = sys.exc_info()[1]
- if ex.errno in (ENOTDIR, ENOENT, ELOOP):
- try:
- os.unlink(path)
- return
- except OSError:
- ex = sys.exc_info()[1]
- if ex.errno == ENOENT:
- return
- raise
- else:
- raise
- for e in entries:
- cls.purge(os.path.join(path, e))
- if me_also:
- os.rmdir(path)
-
- @classmethod
- @_pathguard
- def _create(cls, path, ctor):
- """path creation backend routine"""
- try:
- ctor(path)
- except OSError:
- ex = sys.exc_info()[1]
- if ex.errno == EEXIST:
- cls.purge(path)
- return ctor(path)
- raise
-
- @classmethod
- @_pathguard
- def mkdir(cls, path):
- cls._create(path, os.mkdir)
-
- @classmethod
- @_pathguard
- def symlink(cls, lnk, path):
- cls._create(path, lambda p: os.symlink(lnk, p))
-
- @classmethod
- @_pathguard
- def xtime(cls, path, uuid):
- """query xtime extended attribute
-
- Return xtime of @path for @uuid as a pair of integers.
- "Normal" errors due to non-existent @path or extended attribute
- are tolerated and errno is returned in such a case.
- """
-
- try:
- return struct.unpack('!II', Xattr.lgetxattr(path, '.'.join([cls.GX_NSPACE, uuid, 'xtime']), 8))
- except OSError:
- ex = sys.exc_info()[1]
- if ex.errno in (ENOENT, ENODATA, ENOTDIR):
- return ex.errno
- else:
- raise
-
- @classmethod
- def xtime_vec(cls, path, *uuids):
- """vectored version of @xtime
-
- accepts a list of uuids and returns a dictionary
- with uuid as key(s) and xtime as value(s)
- """
- xt = {}
- for uuid in uuids:
- xtu = cls.xtime(path, uuid)
- if xtu == ENODATA:
- xtu = None
- if isinstance(xtu, int):
- return xtu
- xt[uuid] = xtu
- return xt
-
- @classmethod
- @_pathguard
- def set_xtime(cls, path, uuid, mark):
- """set @mark as xtime for @uuid on @path"""
- Xattr.lsetxattr(path, '.'.join([cls.GX_NSPACE, uuid, 'xtime']), struct.pack('!II', *mark))
-
- @classmethod
- def set_xtime_vec(cls, path, mark_dct):
- """vectored (or dictered) version of set_xtime
-
- ignore values that match @ignore
- """
- for u,t in mark_dct.items():
- cls.set_xtime(path, u, t)
-
- @staticmethod
- @_pathguard
- def setattr(path, adct):
- """set file attributes
-
- @adct is a dict, where 'own', 'mode' and 'times'
- keys are looked for and values used to perform
- chown, chmod or utimes on @path.
- """
- own = adct.get('own')
- if own:
- os.lchown(path, *own)
- mode = adct.get('mode')
- if mode:
- os.chmod(path, stat.S_IMODE(mode))
- times = adct.get('times')
- if times:
- os.utime(path, times)
-
- @staticmethod
- def pid():
- return os.getpid()
-
- last_keep_alive = 0
- @classmethod
- def keep_alive(cls, dct):
- """process keepalive messages.
-
- Return keep-alive counter (number of received keep-alive
- messages).
-
- Now the "keep-alive" message can also have a payload which is
- used to set a foreign volume-mark on the underlying file system.
- """
- if dct:
- key = '.'.join([cls.GX_NSPACE, 'volume-mark', dct['uuid']])
- val = struct.pack(cls.FRGN_FMTSTR,
- *(dct['version'] +
- tuple(int(x,16) for x in re.findall('(?:[\da-f]){2}', dct['uuid'])) +
- (dct['retval'],) + dct['volume_mark'][0:2] + (dct['timeout'],)))
- Xattr.lsetxattr('.', key, val)
- cls.last_keep_alive += 1
- return cls.last_keep_alive
-
- @staticmethod
- def version():
- """version used in handshake"""
- return 1.0
-
-
-class SlaveLocal(object):
- """mix-in class to implement some factes of a slave server
-
- ("mix-in" is sort of like "abstract class", ie. it's not
- instantiated just included in the ancesty DAG. I use "mix-in"
- to indicate that it's not used as an abstract base class,
- rather just taken in to implement additional functionality
- on the basis of the assumed availability of certain interfaces.)
- """
-
- def can_connect_to(self, remote):
- """determine our position in the connectibility matrix"""
- return not remote
-
- def service_loop(self):
- """start a RePCe server serving self's server
-
- stop servicing if a timeout is configured and got no
- keep-alime in that inteval
- """
-
- if boolify(gconf.use_rsync_xattrs) and not privileged():
- raise GsyncdError("using rsync for extended attributes is not supported")
-
- repce = RepceServer(self.server, sys.stdin, sys.stdout, int(gconf.sync_jobs))
- t = syncdutils.Thread(target=lambda: (repce.service_loop(),
- syncdutils.finalize()))
- t.start()
- logging.info("slave listening")
- if gconf.timeout and int(gconf.timeout) > 0:
- while True:
- lp = self.server.last_keep_alive
- time.sleep(int(gconf.timeout))
- if lp == self.server.last_keep_alive:
- logging.info("connection inactive for %d seconds, stopping" % int(gconf.timeout))
- break
- else:
- select((), (), ())
-
-class SlaveRemote(object):
- """mix-in class to implement an interface to a remote slave"""
-
- def connect_remote(self, rargs=[], **opts):
- """connects to a remote slave
-
- Invoke an auxiliary utility (slave gsyncd, possibly wrapped)
- which sets up the connection and set up a RePCe client to
- communicate throuh its stdio.
- """
- slave = opts.get('slave', self.url)
- extra_opts = []
- so = getattr(gconf, 'session_owner', None)
- if so:
- extra_opts += ['--session-owner', so]
- if boolify(gconf.use_rsync_xattrs):
- extra_opts.append('--use-rsync-xattrs')
- po = Popen(rargs + gconf.remote_gsyncd.split() + extra_opts + \
- ['-N', '--listen', '--timeout', str(gconf.timeout), slave],
- stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- gconf.transport = po
- return self.start_fd_client(po.stdout, po.stdin, **opts)
-
- def start_fd_client(self, i, o, **opts):
- """set up RePCe client, handshake with server
-
- It's cut out as a separate method to let
- subclasses hook into client startup
- """
- self.server = RepceClient(i, o)
- rv = self.server.__version__()
- exrv = {'proto': repce.repce_version, 'object': Server.version()}
- da0 = (rv, exrv)
- da1 = ({}, {})
- for i in range(2):
- for k, v in da0[i].iteritems():
- da1[i][k] = int(v)
- if da1[0] != da1[1]:
- raise GsyncdError("RePCe major version mismatch: local %s, remote %s" % (exrv, rv))
-
- def rsync(self, files, *args):
- """invoke rsync"""
- if not files:
- raise GsyncdError("no files to sync")
- logging.debug("files: " + ", ".join(files))
- argv = gconf.rsync_command.split() + \
- ['-aR0', '--files-from=-', '--super', '--numeric-ids', '--no-implied-dirs'] + \
- gconf.rsync_options.split() + (boolify(gconf.use_rsync_xattrs) and ['--xattrs'] or []) + \
- ['.'] + list(args)
- po = Popen(argv, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
- for f in files:
- po.stdin.write(f)
- po.stdin.write('\0')
- po.stdin.close()
- po.wait()
- po.terminate_geterr(fail_on_err = False)
- return po
-
-
-class AbstractUrl(object):
- """abstract base class for url scheme classes"""
-
- def __init__(self, path, pattern):
- m = re.search(pattern, path)
- if not m:
- raise GsyncdError("malformed path")
- self.path = path
- return m.groups()
-
- @property
- def scheme(self):
- return type(self).__name__.lower()
-
- def canonical_path(self):
- return self.path
-
- def get_url(self, canonical=False, escaped=False):
- """format self's url in various styles"""
- if canonical:
- pa = self.canonical_path()
- else:
- pa = self.path
- u = "://".join((self.scheme, pa))
- if escaped:
- u = syncdutils.escape(u)
- return u
-
- @property
- def url(self):
- return self.get_url()
-
-
- ### Concrete resource classes ###
-
-
-class FILE(AbstractUrl, SlaveLocal, SlaveRemote):
- """scheme class for file:// urls
-
- can be used to represent a file slave server
- on slave side, or interface to a remote file
- file server on master side
- """
-
- class FILEServer(Server):
- """included server flavor"""
- pass
-
- server = FILEServer
-
- def __init__(self, path):
- sup(self, path, '^/')
-
- def connect(self):
- """inhibit the resource beyond"""
- os.chdir(self.path)
-
- def rsync(self, files):
- return sup(self, files, self.path)
-
-
-class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote):
- """scheme class for gluster:// urls
-
- can be used to represent a gluster slave server
- on slave side, or interface to a remote gluster
- slave on master side, or to represent master
- (slave-ish features come from the mixins, master
- functionality is outsourced to GMaster from master)
- """
-
- class GLUSTERServer(Server):
- "server enhancements for a glusterfs backend"""
-
- @classmethod
- def _attr_unpack_dict(cls, xattr, extra_fields = ''):
- """generic volume mark fetching/parsing backed"""
- fmt_string = cls.NTV_FMTSTR + extra_fields
- buf = Xattr.lgetxattr('.', xattr, struct.calcsize(fmt_string))
- vm = struct.unpack(fmt_string, buf)
- m = re.match('(.{8})(.{4})(.{4})(.{4})(.{12})', "".join(['%02x' % x for x in vm[2:18]]))
- uuid = '-'.join(m.groups())
- volinfo = { 'version': vm[0:2],
- 'uuid' : uuid,
- 'retval' : vm[18],
- 'volume_mark': vm[19:21],
- }
- if extra_fields:
- return volinfo, vm[-len(extra_fields):]
- else:
- return volinfo
-
- @classmethod
- def foreign_volume_infos(cls):
- """return list of valid (not expired) foreign volume marks"""
- dict_list = []
- xattr_list = Xattr.llistxattr_buf('.')
- for ele in xattr_list:
- if ele.find('.'.join([cls.GX_NSPACE, 'volume-mark', ''])) == 0:
- d, x = cls._attr_unpack_dict(ele, cls.FRGN_XTRA_FMT)
- now = int(time.time())
- if x[0] > now:
- logging.debug("volinfo[%s] expires: %d (%d sec later)" % \
- (d['uuid'], x[0], x[0] - now))
- d['timeout'] = x[0]
- dict_list.append(d)
- else:
- try:
- Xattr.lremovexattr('.', ele)
- except OSError:
- pass
- return dict_list
-
- @classmethod
- def native_volume_info(cls):
- """get the native volume mark of the underlying gluster volume"""
- try:
- return cls._attr_unpack_dict('.'.join([cls.GX_NSPACE, 'volume-mark']))
- except OSError:
- ex = sys.exc_info()[1]
- if ex.errno != ENODATA:
- raise
-
- server = GLUSTERServer
-
- def __init__(self, path):
- self.host, self.volume = sup(self, path, '^(%s):(.+)' % HostRX.pattern)
-
- def canonical_path(self):
- return ':'.join([gethostbyname(self.host), self.volume])
-
- def can_connect_to(self, remote):
- """determine our position in the connectibility matrix"""
- return True
-
- class Mounter(object):
- """Abstract base class for mounter backends"""
-
- def __init__(self, params):
- self.params = params
- self.mntpt = None
-
- @classmethod
- def get_glusterprog(cls):
- return os.path.join(gconf.gluster_command_dir, cls.glusterprog)
-
- def umount_l(self, d):
- """perform lazy umount"""
- po = Popen(self.make_umount_argv(d), stderr=subprocess.PIPE)
- po.wait()
- return po
-
- @classmethod
- def make_umount_argv(cls, d):
- raise NotImplementedError
-
- def make_mount_argv(self, *a):
- raise NotImplementedError
-
- def cleanup_mntpt(self, *a):
- pass
-
- def handle_mounter(self, po):
- po.wait()
-
- def inhibit(self, *a):
- """inhibit a gluster filesystem
-
- Mount glusterfs over a temporary mountpoint,
- change into the mount, and lazy unmount the
- filesystem.
- """
-
- mpi, mpo = os.pipe()
- mh = Popen.fork()
- if mh:
- os.close(mpi)
- fcntl.fcntl(mpo, fcntl.F_SETFD, fcntl.FD_CLOEXEC)
- d = None
- margv = self.make_mount_argv(*a)
- if self.mntpt:
- # mntpt is determined pre-mount
- d = self.mntpt
- os.write(mpo, d + '\0')
- po = Popen(margv, **self.mountkw)
- self.handle_mounter(po)
- po.terminate_geterr()
- logging.debug('auxiliary glusterfs mount in place')
- if not d:
- # mntpt is determined during mount
- d = self.mntpt
- os.write(mpo, d + '\0')
- os.write(mpo, 'M')
- t = syncdutils.Thread(target=lambda: os.chdir(d))
- t.start()
- tlim = gconf.starttime + int(gconf.connection_timeout)
- while True:
- if not t.isAlive():
- break
- if time.time() >= tlim:
- syncdutils.finalize(exval = 1)
- time.sleep(1)
- os.close(mpo)
- _, rv = syncdutils.waitpid(mh, 0)
- if rv:
- rv = (os.WIFEXITED(rv) and os.WEXITSTATUS(rv) or 0) - \
- (os.WIFSIGNALED(rv) and os.WTERMSIG(rv) or 0)
- logging.warn('stale mount possibly left behind on ' + d)
- raise GsyncdError("cleaning up temp mountpoint %s failed with status %d" % \
- (d, rv))
- else:
- rv = 0
- try:
- os.setsid()
- os.close(mpo)
- mntdata = ''
- while True:
- c = os.read(mpi, 1)
- if not c:
- break
- mntdata += c
- if mntdata:
- mounted = False
- if mntdata[-1] == 'M':
- mntdata = mntdata[:-1]
- assert(mntdata)
- mounted = True
- assert(mntdata[-1] == '\0')
- mntpt = mntdata[:-1]
- assert(mntpt)
- if mounted:
- po = self.umount_l(mntpt)
- po.terminate_geterr(fail_on_err = False)
- if po.returncode != 0:
- po.errlog()
- rv = po.returncode
- self.cleanup_mntpt(mntpt)
- except:
- logging.exception('mount cleanup failure:')
- rv = 200
- os._exit(rv)
- logging.debug('auxiliary glusterfs mount prepared')
-
- class DirectMounter(Mounter):
- """mounter backend which calls mount(8), umount(8) directly"""
-
- mountkw = {'stderr': subprocess.PIPE}
- glusterprog = 'glusterfs'
-
- @staticmethod
- def make_umount_argv(d):
- return ['umount', '-l', d]
-
- def make_mount_argv(self):
- self.mntpt = tempfile.mkdtemp(prefix = 'gsyncd-aux-mount-')
- return [self.get_glusterprog()] + ['--' + p for p in self.params] + [self.mntpt]
-
- def cleanup_mntpt(self, mntpt = None):
- if not mntpt:
- mntpt = self.mntpt
- os.rmdir(mntpt)
-
- class MountbrokerMounter(Mounter):
- """mounter backend using the mountbroker gluster service"""
-
- mountkw = {'stderr': subprocess.PIPE, 'stdout': subprocess.PIPE}
- glusterprog = 'gluster'
-
- @classmethod
- def make_cli_argv(cls):
- return [cls.get_glusterprog()] + gconf.gluster_cli_options.split() + ['system::']
-
- @classmethod
- def make_umount_argv(cls, d):
- return cls.make_cli_argv() + ['umount', d, 'lazy']
-
- def make_mount_argv(self, label):
- return self.make_cli_argv() + \
- ['mount', label, 'user-map-root=' + syncdutils.getusername()] + self.params
-
- def handle_mounter(self, po):
- self.mntpt = po.stdout.readline()[:-1]
- po.stdout.close()
- sup(self, po)
- if po.returncode != 0:
- # if cli terminated with error due to being
- # refused by glusterd, what it put
- # out on stdout is a diagnostic message
- logging.error('glusterd answered: %s' % self.mntpt)
-
- def connect(self):
- """inhibit the resource beyond
-
- Choose mounting backend (direct or mountbroker),
- set up glusterfs parameters and perform the mount
- with given backend
- """
-
- label = getattr(gconf, 'mountbroker', None)
- if not label and not privileged():
- label = syncdutils.getusername()
- mounter = label and self.MountbrokerMounter or self.DirectMounter
- params = gconf.gluster_params.split() + \
- (gconf.gluster_log_level and ['log-level=' + gconf.gluster_log_level] or []) + \
- ['log-file=' + gconf.gluster_log_file, 'volfile-server=' + self.host,
- 'volfile-id=' + self.volume, 'client-pid=-1']
- mounter(params).inhibit(*[l for l in [label] if l])
-
- def connect_remote(self, *a, **kw):
- sup(self, *a, **kw)
- self.slavedir = "/proc/%d/cwd" % self.server.pid()
-
- def service_loop(self, *args):
- """enter service loop
-
- - if slave given, instantiate GMaster and
- pass control to that instance, which implements
- master behavior
- - else do that's what's inherited
- """
- if args:
- gmaster_builder()(self, args[0]).crawl_loop()
- else:
- sup(self, *args)
-
- def rsync(self, files):
- return sup(self, files, self.slavedir)
-
-
-class SSH(AbstractUrl, SlaveRemote):
- """scheme class for ssh:// urls
-
- interface to remote slave on master side
- implementing an ssh based proxy
- """
-
- def __init__(self, path):
- self.remote_addr, inner_url = sup(self, path,
- '^((?:%s@)?%s):(.+)' % tuple([ r.pattern for r in (UserRX, HostRX) ]))
- self.inner_rsc = parse_url(inner_url)
-
- def canonical_path(self):
- m = re.match('([^@]+)@(.+)', self.remote_addr)
- if m:
- u, h = m.groups()
- else:
- u, h = syncdutils.getusername(), self.remote_addr
- remote_addr = '@'.join([u, gethostbyname(h)])
- return ':'.join([remote_addr, self.inner_rsc.get_url(canonical=True)])
-
- def can_connect_to(self, remote):
- """determine our position in the connectibility matrix"""
- return False
-
- def start_fd_client(self, *a, **opts):
- """customizations for client startup
-
- - be a no-op if we are to daemonize (client startup is deferred
- to post-daemon stage)
- - determine target url for rsync after consulting server
- """
- if opts.get('deferred'):
- return a
- sup(self, *a)
- ityp = type(self.inner_rsc)
- if ityp == FILE:
- slavepath = self.inner_rsc.path
- elif ityp == GLUSTER:
- slavepath = "/proc/%d/cwd" % self.server.pid()
- else:
- raise NotImplementedError
- self.slaveurl = ':'.join([self.remote_addr, slavepath])
-
- def connect_remote(self, go_daemon=None):
- """connect to inner slave url through outer ssh url
-
- Wrap the connecting utility in ssh.
-
- Much care is put into daemonizing: in that case
- ssh is started before daemonization, but
- RePCe client is to be created after that (as ssh
- interactive password auth would be defeated by
- a daemonized ssh, while client should be present
- only in the final process). In that case the action
- is taken apart to two parts, this method is ivoked
- once pre-daemon, once post-daemon. Use @go_daemon
- to deiced what part to perform.
-
- [NB. ATM gluster product does not makes use of interactive
- authentication.]
- """
- if go_daemon == 'done':
- return self.start_fd_client(*self.fd_pair)
- gconf.setup_ssh_ctl(tempfile.mkdtemp(prefix='gsyncd-aux-ssh-'))
- deferred = go_daemon == 'postconn'
- ret = sup(self, gconf.ssh_command.split() + gconf.ssh_ctl_args + [self.remote_addr], slave=self.inner_rsc.url, deferred=deferred)
- if deferred:
- # send a message to peer so that we can wait for
- # the answer from which we know connection is
- # established and we can proceed with daemonization
- # (doing that too early robs the ssh passwd prompt...)
- # However, we'd better not start the RepceClient
- # before daemonization (that's not preserved properly
- # in daemon), we just do a an ad-hoc linear put/get.
- i, o = ret
- inf = os.fdopen(i)
- repce.send(o, None, '__repce_version__')
- select((inf,), (), ())
- repce.recv(inf)
- # hack hack hack: store a global reference to the file
- # to save it from getting GC'd which implies closing it
- gconf.permanent_handles.append(inf)
- self.fd_pair = (i, o)
- return 'should'
-
- def rsync(self, files):
- return sup(self, files, '-e', " ".join(gconf.ssh_command.split() + gconf.ssh_ctl_args),
- *(gconf.rsync_ssh_options.split() + [self.slaveurl]))
diff --git a/xlators/features/marker/utils/syncdaemon/syncdutils.py b/xlators/features/marker/utils/syncdaemon/syncdutils.py
deleted file mode 100644
index 1d4eb2003..000000000
--- a/xlators/features/marker/utils/syncdaemon/syncdutils.py
+++ /dev/null
@@ -1,282 +0,0 @@
-import os
-import sys
-import pwd
-import time
-import fcntl
-import shutil
-import logging
-from threading import Lock, Thread as baseThread
-from errno import EACCES, EAGAIN, EPIPE, ENOTCONN, ECONNABORTED, EINTR, errorcode
-from signal import signal, SIGTERM, SIGKILL
-from time import sleep
-import select as oselect
-from os import waitpid as owaitpid
-try:
- from cPickle import PickleError
-except ImportError:
- # py 3
- from pickle import PickleError
-
-from gconf import gconf
-
-try:
- # py 3
- from urllib import parse as urllib
-except ImportError:
- import urllib
-
-def escape(s):
- """the chosen flavor of string escaping, used all over
- to turn whatever data to creatable representation"""
- return urllib.quote_plus(s)
-
-def unescape(s):
- """inverse of .escape"""
- return urllib.unquote_plus(s)
-
-def norm(s):
- if s:
- return s.replace('-', '_')
-
-def update_file(path, updater, merger = lambda f: True):
- """update a file in a transaction-like manner"""
-
- fr = fw = None
- try:
- fd = os.open(path, os.O_CREAT|os.O_RDWR)
- try:
- fr = os.fdopen(fd, 'r+b')
- except:
- os.close(fd)
- raise
- fcntl.lockf(fr, fcntl.LOCK_EX)
- if not merger(fr):
- return
-
- tmpp = path + '.tmp.' + str(os.getpid())
- fd = os.open(tmpp, os.O_CREAT|os.O_EXCL|os.O_WRONLY)
- try:
- fw = os.fdopen(fd, 'wb', 0)
- except:
- os.close(fd)
- raise
- updater(fw)
- os.fsync(fd)
- os.rename(tmpp, path)
- finally:
- for fx in (fr, fw):
- if fx:
- fx.close()
-
-def grabfile(fname, content=None):
- """open @fname + contest for its fcntl lock
-
- @content: if given, set the file content to it
- """
- # damn those messy open() mode codes
- fd = os.open(fname, os.O_CREAT|os.O_RDWR)
- f = os.fdopen(fd, 'r+b', 0)
- try:
- fcntl.lockf(f, fcntl.LOCK_EX|fcntl.LOCK_NB)
- except:
- ex = sys.exc_info()[1]
- f.close()
- if isinstance(ex, IOError) and ex.errno in (EACCES, EAGAIN):
- # cannot grab, it's taken
- return
- raise
- if content:
- try:
- f.truncate()
- f.write(content)
- except:
- f.close()
- raise
- gconf.permanent_handles.append(f)
- return f
-
-def grabpidfile(fname=None, setpid=True):
- """.grabfile customization for pid files"""
- if not fname:
- fname = gconf.pid_file
- content = None
- if setpid:
- content = str(os.getpid()) + '\n'
- return grabfile(fname, content=content)
-
-final_lock = Lock()
-
-def finalize(*a, **kw):
- """all those messy final steps we go trough upon termination
-
- Do away with pidfile, ssh control dir and logging.
- """
- final_lock.acquire()
- if getattr(gconf, 'pid_file', None):
- rm_pidf = gconf.pid_file_owned
- if gconf.cpid:
- # exit path from parent branch of daemonization
- rm_pidf = False
- while True:
- f = grabpidfile(setpid=False)
- if not f:
- # child has already taken over pidfile
- break
- if os.waitpid(gconf.cpid, os.WNOHANG)[0] == gconf.cpid:
- # child has terminated
- rm_pidf = True
- break;
- time.sleep(0.1)
- if rm_pidf:
- try:
- os.unlink(gconf.pid_file)
- except:
- ex = sys.exc_info()[1]
- if ex.errno == ENOENT:
- pass
- else:
- raise
- if gconf.ssh_ctl_dir and not gconf.cpid:
- shutil.rmtree(gconf.ssh_ctl_dir)
- if getattr(gconf, 'state_socket', None):
- try:
- os.unlink(gconf.state_socket)
- except:
- if sys.exc_info()[0] == OSError:
- pass
- if gconf.log_exit:
- logging.info("exiting.")
- sys.stdout.flush()
- sys.stderr.flush()
- os._exit(kw.get('exval', 0))
-
-def log_raise_exception(excont):
- """top-level exception handler
-
- Try to some fancy things to cover up we face with an error.
- Translate some weird sounding but well understood exceptions
- into human-friendly lingo
- """
- is_filelog = False
- for h in logging.getLogger().handlers:
- fno = getattr(getattr(h, 'stream', None), 'fileno', None)
- if fno and not os.isatty(fno()):
- is_filelog = True
-
- exc = sys.exc_info()[1]
- if isinstance(exc, SystemExit):
- excont.exval = exc.code or 0
- raise
- else:
- logtag = None
- if isinstance(exc, GsyncdError):
- if is_filelog:
- logging.error(exc.message)
- sys.stderr.write('failure: ' + exc.message + "\n")
- elif isinstance(exc, PickleError) or isinstance(exc, EOFError) or \
- ((isinstance(exc, OSError) or isinstance(exc, IOError)) and \
- exc.errno == EPIPE):
- logging.error('connection to peer is broken')
- if hasattr(gconf, 'transport'):
- gconf.transport.wait()
- gconf.transport.terminate_geterr()
- elif isinstance(exc, OSError) and exc.errno in (ENOTCONN, ECONNABORTED):
- logging.error('glusterfs session went down [%s]', errorcode[exc.errno])
- else:
- logtag = "FAIL"
- if not logtag and logging.getLogger().isEnabledFor(logging.DEBUG):
- logtag = "FULL EXCEPTION TRACE"
- if logtag:
- logging.exception(logtag + ": ")
- sys.stderr.write("failed with %s.\n" % type(exc).__name__)
- excont.exval = 1
- sys.exit(excont.exval)
-
-
-class FreeObject(object):
- """wildcard class for which any attribute can be set"""
-
- def __init__(self, **kw):
- for k,v in kw.items():
- setattr(self, k, v)
-
-class Thread(baseThread):
- """thread class flavor for gsyncd
-
- - always a daemon thread
- - force exit for whole program if thread
- function coughs up an exception
- """
- def __init__(self, *a, **kw):
- tf = kw.get('target')
- if tf:
- def twrap(*aa):
- excont = FreeObject(exval = 0)
- try:
- tf(*aa)
- except:
- try:
- log_raise_exception(excont)
- finally:
- finalize(exval = excont.exval)
- kw['target'] = twrap
- baseThread.__init__(self, *a, **kw)
- self.setDaemon(True)
-
-class GsyncdError(Exception):
- pass
-
-def getusername(uid = None):
- if uid == None:
- uid = os.geteuid()
- return pwd.getpwuid(uid).pw_name
-
-def privileged():
- return os.geteuid() == 0
-
-def boolify(s):
- """
- Generic string to boolean converter
-
- return
- - Quick return if string 's' is of type bool
- - True if it's in true_list
- - False if it's in false_list
- - Warn if it's not present in either and return False
- """
- true_list = ['true', 'yes', '1', 'on']
- false_list = ['false', 'no', '0', 'off']
-
- if isinstance(s, bool):
- return s
-
- rv = False
- lstr = s.lower()
- if lstr in true_list:
- rv = True
- elif not lstr in false_list:
- logging.warn("Unknown string (%s) in string to boolean conversion defaulting to False\n" % (s))
-
- return rv
-
-def eintr_wrap(func, exc, *a):
- """
- wrapper around syscalls resilient to interrupt caused
- by signals
- """
- while True:
- try:
- return func(*a)
- except exc:
- ex = sys.exc_info()[1]
- if not ex.args[0] == EINTR:
- raise
-
-def select(*a):
- return eintr_wrap(oselect.select, oselect.error, *a)
-
-def waitpid (*a):
- return eintr_wrap(owaitpid, OSError, *a)
-
-def set_term_handler(hook=lambda *a: finalize(*a, **{'exval': 1})):
- signal(SIGTERM, hook)
diff --git a/xlators/features/path-convertor/src/Makefile.am b/xlators/features/path-convertor/src/Makefile.am
index f1112756b..393a7bd08 100644
--- a/xlators/features/path-convertor/src/Makefile.am
+++ b/xlators/features/path-convertor/src/Makefile.am
@@ -2,7 +2,7 @@
xlator_LTLIBRARIES = path-converter.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/features
-path_converter_la_LDFLAGS = -module -avoidversion
+path_converter_la_LDFLAGS = -module -avoid-version
path_converter_la_SOURCES = path.c
path_converter_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/features/protect/Makefile.am b/xlators/features/protect/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/features/protect/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/protect/src/Makefile.am b/xlators/features/protect/src/Makefile.am
new file mode 100644
index 000000000..968e88c45
--- /dev/null
+++ b/xlators/features/protect/src/Makefile.am
@@ -0,0 +1,21 @@
+xlator_LTLIBRARIES = prot_dht.la prot_client.la prot_server.la
+
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+prot_dht_la_LDFLAGS = -module -avoid-version
+prot_dht_la_SOURCES = prot_dht.c
+prot_dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+prot_client_la_LDFLAGS = -module -avoid-version
+prot_client_la_SOURCES = prot_client.c
+prot_client_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+prot_server_la_LDFLAGS = -module -avoid-version
+prot_server_la_SOURCES = prot_server.c
+prot_server_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
+
diff --git a/xlators/features/protect/src/prot_client.c b/xlators/features/protect/src/prot_client.c
new file mode 100644
index 000000000..d09715067
--- /dev/null
+++ b/xlators/features/protect/src/prot_client.c
@@ -0,0 +1,217 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "defaults.h"
+
+#ifndef __NetBSD__
+#include <execinfo.h>
+#endif
+
+#define NUM_FRAMES 20
+
+static char PROTECT_KEY[] = "trusted.glusterfs.protect";
+
+enum {
+ PROT_ACT_NONE = 0,
+ PROT_ACT_LOG,
+ PROT_ACT_REJECT,
+};
+
+void
+pcli_print_trace (char *name, call_frame_t *frame)
+{
+ void *frames[NUM_FRAMES];
+ char **symbols;
+ int size;
+ int i;
+
+ gf_log (name, GF_LOG_INFO, "Translator stack:");
+ while (frame) {
+ gf_log (name, GF_LOG_INFO, "%s (%s)",
+ frame->wind_from, frame->this->name);
+ frame = frame->next;
+ }
+
+ size = backtrace(frames,NUM_FRAMES);
+ if (size <= 0) {
+ return;
+ }
+ symbols = backtrace_symbols(frames,size);
+ if (!symbols) {
+ return;
+ }
+
+ gf_log(name, GF_LOG_INFO, "Processor stack:");
+ for (i = 0; i < size; ++i) {
+ gf_log (name, GF_LOG_INFO, "%s", symbols[i]);
+ }
+ free(symbols);
+}
+
+int32_t
+pcli_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
+ loc_t *newloc, dict_t *xdata)
+{
+ uint64_t value;
+
+ if (newloc->parent == oldloc->parent) {
+ gf_log (this->name, GF_LOG_DEBUG, "rename in same directory");
+ goto simple_unwind;
+ }
+ if (!oldloc->parent) {
+ goto simple_unwind;
+ }
+ if (inode_ctx_get(oldloc->parent,this,&value) != 0) {
+ goto simple_unwind;
+ }
+
+ if (value != PROT_ACT_NONE) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "got rename for protected %s", oldloc->path);
+ pcli_print_trace(this->name,frame->next);
+ if (value == PROT_ACT_REJECT) {
+ STACK_UNWIND_STRICT (rename, frame, -1, EPERM,
+ NULL, NULL, NULL, NULL, NULL,
+ xdata);
+ return 0;
+ }
+ }
+
+simple_unwind:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rename, oldloc, newloc,
+ xdata);
+ return 0;
+}
+
+int32_t
+pcli_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ data_t *data;
+ uint64_t value;
+
+ /*
+ * We can't use dict_get_str and strcmp here, because the value comes
+ * directly from the user and might not be NUL-terminated (it would
+ * be if we had set it ourselves.
+ */
+
+ data = dict_get(dict,PROTECT_KEY);
+ if (!data) {
+ goto simple_wind;
+ }
+
+ if (dict->count > 1) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "attempted to mix %s with other keys", PROTECT_KEY);
+ goto simple_wind;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG, "got %s request", PROTECT_KEY);
+ if (!strncmp(data->data,"log",data->len)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "logging removals on %s", loc->path);
+ value = PROT_ACT_LOG;
+ }
+ else if (!strncmp(data->data,"reject",data->len)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "rejecting removals on %s", loc->path);
+ value = PROT_ACT_REJECT;
+ }
+ else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "removing protection on %s", loc->path);
+ value = PROT_ACT_NONE;
+ }
+ /* Right now the value doesn't matter - just the presence. */
+ if (inode_ctx_set(loc->inode,this,&value) != 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to set protection status for %s", loc->path);
+ }
+ STACK_UNWIND_STRICT (setxattr, frame, 0, 0, NULL);
+ return 0;
+
+simple_wind:
+ STACK_WIND_TAIL (frame,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr,
+ loc, dict, flags, xdata);
+ return 0;
+}
+
+int32_t
+pcli_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
+{
+ uint64_t value;
+
+ if (!loc->parent || (inode_ctx_get(loc->parent,this,&value) != 0)) {
+ goto simple_unwind;
+ }
+
+ if (value != PROT_ACT_NONE) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "got unlink for protected %s", loc->path);
+ pcli_print_trace(this->name,frame->next);
+ if (value == PROT_ACT_REJECT) {
+ STACK_UNWIND_STRICT (unlink, frame, -1, EPERM,
+ NULL, NULL, NULL);
+ return 0;
+ }
+ }
+
+simple_unwind:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+ return 0;
+}
+
+int32_t
+init (xlator_t *this)
+{
+ if (!this->children || this->children->next) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "translator not configured with exactly one child");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
+ }
+
+ return 0;
+}
+
+
+void
+fini (xlator_t *this)
+{
+ return;
+}
+
+
+struct xlator_fops fops = {
+ .rename = pcli_rename,
+ .setxattr = pcli_setxattr,
+ .unlink = pcli_unlink,
+};
+
+struct xlator_cbks cbks = {
+};
+
+struct volume_options options[] = {
+ { .key = {NULL} },
+};
diff --git a/xlators/features/protect/src/prot_dht.c b/xlators/features/protect/src/prot_dht.c
new file mode 100644
index 000000000..feec6ffd6
--- /dev/null
+++ b/xlators/features/protect/src/prot_dht.c
@@ -0,0 +1,168 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "defaults.h"
+
+enum gf_pdht_mem_types_ {
+ gf_pdht_mt_coord_t = gf_common_mt_end + 1,
+ gf_pdht_mt_end
+};
+
+typedef struct {
+ pthread_mutex_t lock;
+ uint16_t refs;
+ int32_t op_ret;
+ int32_t op_errno;
+ dict_t *xdata;
+} pdht_coord_t;
+
+static char PROTECT_KEY[] = "trusted.glusterfs.protect";
+
+void
+pdht_unref_and_unlock (call_frame_t *frame, xlator_t *this,
+ pdht_coord_t *coord)
+{
+ gf_boolean_t should_unwind;
+
+ should_unwind = (--(coord->refs) == 0);
+ pthread_mutex_unlock(&coord->lock);
+
+ if (should_unwind) {
+ STACK_UNWIND_STRICT (setxattr, frame,
+ coord->op_ret, coord->op_errno,
+ coord->xdata);
+ if (coord->xdata) {
+ dict_unref(coord->xdata);
+ }
+ GF_FREE(coord);
+ }
+}
+
+int32_t
+pdht_recurse_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ pdht_coord_t *coord = cookie;
+
+ pthread_mutex_lock(&coord->lock);
+ if (op_ret) {
+ coord->op_ret = op_ret;
+ coord->op_errno = op_errno;
+ }
+ if (xdata) {
+ if (coord->xdata) {
+ dict_unref(coord->xdata);
+ }
+ coord->xdata = dict_ref(xdata);
+ }
+ pdht_unref_and_unlock(frame,this,coord);
+
+ return 0;
+}
+
+void
+pdht_recurse (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata, xlator_t *xl, pdht_coord_t *coord)
+{
+ xlator_list_t *iter;
+
+ if (!strcmp(xl->type,"features/prot_client")) {
+ pthread_mutex_lock(&coord->lock);
+ ++(coord->refs);
+ pthread_mutex_unlock(&coord->lock);
+ STACK_WIND_COOKIE (frame, pdht_recurse_cbk, coord, xl,
+ xl->fops->setxattr, loc, dict, flags, xdata);
+ }
+
+ else for (iter = xl->children; iter; iter = iter->next) {
+ pdht_recurse (frame, this, loc, dict, flags, xdata,
+ iter->xlator, coord);
+ }
+}
+
+int32_t
+pdht_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
+{
+ pdht_coord_t *coord;
+
+ if (!dict_get(dict,PROTECT_KEY)) {
+ goto simple_wind;
+ }
+
+ if (dict->count > 1) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "attempted to mix %s with other keys", PROTECT_KEY);
+ goto simple_wind;
+ }
+
+ coord = GF_CALLOC(1,sizeof(*coord),gf_pdht_mt_coord_t);
+ if (!coord) {
+ gf_log (this->name, GF_LOG_WARNING, "allocation failed");
+ goto simple_wind;
+ }
+
+ pthread_mutex_init(&coord->lock,NULL);
+ coord->refs = 1;
+ coord->op_ret = 0;
+ coord->xdata = NULL;
+
+ pdht_recurse(frame,this,loc,dict,flags,xdata,this,coord);
+ pthread_mutex_lock(&coord->lock);
+ pdht_unref_and_unlock(frame,this,coord);
+
+ return 0;
+
+simple_wind:
+ STACK_WIND_TAIL (frame,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr,
+ loc, dict, flags, xdata);
+ return 0;
+}
+
+int32_t
+init (xlator_t *this)
+{
+ if (!this->children || this->children->next) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "translator not configured with exactly one child");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
+ }
+
+ return 0;
+}
+
+
+void
+fini (xlator_t *this)
+{
+ return;
+}
+
+struct xlator_fops fops = {
+ .setxattr = pdht_setxattr,
+};
+
+struct xlator_cbks cbks = {
+};
+
+struct volume_options options[] = {
+ { .key = {NULL} },
+};
diff --git a/xlators/features/protect/src/prot_server.c b/xlators/features/protect/src/prot_server.c
new file mode 100644
index 000000000..beaee0889
--- /dev/null
+++ b/xlators/features/protect/src/prot_server.c
@@ -0,0 +1,51 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "defaults.h"
+
+int32_t
+init (xlator_t *this)
+{
+ if (!this->children || this->children->next) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "translator not configured with exactly one child");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
+ }
+
+ return 0;
+}
+
+
+void
+fini (xlator_t *this)
+{
+ return;
+}
+
+
+struct xlator_fops fops = {
+};
+
+struct xlator_cbks cbks = {
+};
+
+struct volume_options options[] = {
+ { .key = {NULL} },
+};
diff --git a/xlators/features/qemu-block/Makefile.am b/xlators/features/qemu-block/Makefile.am
new file mode 100644
index 000000000..af437a64d
--- /dev/null
+++ b/xlators/features/qemu-block/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src
diff --git a/xlators/features/qemu-block/src/Makefile.am b/xlators/features/qemu-block/src/Makefile.am
new file mode 100644
index 000000000..08a7b62a0
--- /dev/null
+++ b/xlators/features/qemu-block/src/Makefile.am
@@ -0,0 +1,155 @@
+if ENABLE_QEMU_BLOCK
+xlator_LTLIBRARIES = qemu-block.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+qemu_block_la_LDFLAGS = -module -avoid-version
+qemu_block_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(GLIB_LIBS) -lz -lrt
+
+qemu_block_la_SOURCES_qemu = \
+ $(CONTRIBDIR)/qemu/qemu-coroutine.c \
+ $(CONTRIBDIR)/qemu/qemu-coroutine-lock.c \
+ $(CONTRIBDIR)/qemu/qemu-coroutine-sleep.c \
+ $(CONTRIBDIR)/qemu/coroutine-ucontext.c \
+ $(CONTRIBDIR)/qemu/block.c \
+ $(CONTRIBDIR)/qemu/nop-symbols.c
+
+qemu_block_la_SOURCES_qemu_util = \
+ $(CONTRIBDIR)/qemu/util/aes.c \
+ $(CONTRIBDIR)/qemu/util/bitmap.c \
+ $(CONTRIBDIR)/qemu/util/bitops.c \
+ $(CONTRIBDIR)/qemu/util/cutils.c \
+ $(CONTRIBDIR)/qemu/util/error.c \
+ $(CONTRIBDIR)/qemu/util/hbitmap.c \
+ $(CONTRIBDIR)/qemu/util/iov.c \
+ $(CONTRIBDIR)/qemu/util/module.c \
+ $(CONTRIBDIR)/qemu/util/oslib-posix.c \
+ $(CONTRIBDIR)/qemu/util/qemu-option.c \
+ $(CONTRIBDIR)/qemu/util/qemu-error.c \
+ $(CONTRIBDIR)/qemu/util/qemu-thread-posix.c \
+ $(CONTRIBDIR)/qemu/util/unicode.c \
+ $(CONTRIBDIR)/qemu/util/hexdump.c
+
+qemu_block_la_SOURCES_qemu_block = \
+ $(CONTRIBDIR)/qemu/block/snapshot.c \
+ $(CONTRIBDIR)/qemu/block/qcow2-cache.c \
+ $(CONTRIBDIR)/qemu/block/qcow2-cluster.c \
+ $(CONTRIBDIR)/qemu/block/qcow2-refcount.c \
+ $(CONTRIBDIR)/qemu/block/qcow2-snapshot.c \
+ $(CONTRIBDIR)/qemu/block/qcow2.c \
+ $(CONTRIBDIR)/qemu/block/qed-check.c \
+ $(CONTRIBDIR)/qemu/block/qed-cluster.c \
+ $(CONTRIBDIR)/qemu/block/qed-gencb.c \
+ $(CONTRIBDIR)/qemu/block/qed-l2-cache.c \
+ $(CONTRIBDIR)/qemu/block/qed-table.c \
+ $(CONTRIBDIR)/qemu/block/qed.c
+
+qemu_block_la_SOURCES_qemu_qobject = \
+ $(CONTRIBDIR)/qemu/qobject/json-lexer.c \
+ $(CONTRIBDIR)/qemu/qobject/json-parser.c \
+ $(CONTRIBDIR)/qemu/qobject/json-streamer.c \
+ $(CONTRIBDIR)/qemu/qobject/qbool.c \
+ $(CONTRIBDIR)/qemu/qobject/qdict.c \
+ $(CONTRIBDIR)/qemu/qobject/qerror.c \
+ $(CONTRIBDIR)/qemu/qobject/qfloat.c \
+ $(CONTRIBDIR)/qemu/qobject/qint.c \
+ $(CONTRIBDIR)/qemu/qobject/qjson.c \
+ $(CONTRIBDIR)/qemu/qobject/qlist.c \
+ $(CONTRIBDIR)/qemu/qobject/qstring.c
+
+qemu_block_la_SOURCES = \
+ $(qemu_block_la_SOURCES_qemu) \
+ $(qemu_block_la_SOURCES_qemu_util) \
+ $(qemu_block_la_SOURCES_qemu_block) \
+ $(qemu_block_la_SOURCES_qemu_qobject) \
+ bdrv-xlator.c \
+ coroutine-synctask.c \
+ bh-syncop.c \
+ monitor-logging.c \
+ clock-timer.c \
+ qemu-block.c \
+ qb-coroutines.c
+
+noinst_HEADERS_qemu = \
+ $(CONTRIBDIR)/qemu/config-host.h \
+ $(CONTRIBDIR)/qemu/qapi-types.h \
+ $(CONTRIBDIR)/qemu/qmp-commands.h \
+ $(CONTRIBDIR)/qemu/trace/generated-tracers.h \
+ $(CONTRIBDIR)/qemu/include/config.h \
+ $(CONTRIBDIR)/qemu/include/glib-compat.h \
+ $(CONTRIBDIR)/qemu/include/qemu-common.h \
+ $(CONTRIBDIR)/qemu/include/trace.h \
+ $(CONTRIBDIR)/qemu/include/block/coroutine.h \
+ $(CONTRIBDIR)/qemu/include/block/aio.h \
+ $(CONTRIBDIR)/qemu/include/block/block.h \
+ $(CONTRIBDIR)/qemu/include/block/block_int.h \
+ $(CONTRIBDIR)/qemu/include/block/blockjob.h \
+ $(CONTRIBDIR)/qemu/include/block/coroutine.h \
+ $(CONTRIBDIR)/qemu/include/block/coroutine_int.h \
+ $(CONTRIBDIR)/qemu/include/block/snapshot.h \
+ $(CONTRIBDIR)/qemu/include/exec/cpu-common.h \
+ $(CONTRIBDIR)/qemu/include/exec/hwaddr.h \
+ $(CONTRIBDIR)/qemu/include/exec/poison.h \
+ $(CONTRIBDIR)/qemu/include/fpu/softfloat.h \
+ $(CONTRIBDIR)/qemu/include/migration/migration.h \
+ $(CONTRIBDIR)/qemu/include/migration/qemu-file.h \
+ $(CONTRIBDIR)/qemu/include/migration/vmstate.h \
+ $(CONTRIBDIR)/qemu/include/monitor/monitor.h \
+ $(CONTRIBDIR)/qemu/include/monitor/readline.h \
+ $(CONTRIBDIR)/qemu/include/qapi/error.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/json-lexer.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/json-parser.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/json-streamer.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qbool.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qdict.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qerror.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qfloat.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qint.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qjson.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qlist.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qobject.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/qstring.h \
+ $(CONTRIBDIR)/qemu/include/qapi/qmp/types.h \
+ $(CONTRIBDIR)/qemu/include/qemu/aes.h \
+ $(CONTRIBDIR)/qemu/include/qemu/atomic.h \
+ $(CONTRIBDIR)/qemu/include/qemu/bitmap.h \
+ $(CONTRIBDIR)/qemu/include/qemu/bitops.h \
+ $(CONTRIBDIR)/qemu/include/qemu/bswap.h \
+ $(CONTRIBDIR)/qemu/include/qemu/compiler.h \
+ $(CONTRIBDIR)/qemu/include/qemu/error-report.h \
+ $(CONTRIBDIR)/qemu/include/qemu/event_notifier.h \
+ $(CONTRIBDIR)/qemu/include/qemu/hbitmap.h \
+ $(CONTRIBDIR)/qemu/include/qemu/host-utils.h \
+ $(CONTRIBDIR)/qemu/include/qemu/iov.h \
+ $(CONTRIBDIR)/qemu/include/qemu/main-loop.h \
+ $(CONTRIBDIR)/qemu/include/qemu/module.h \
+ $(CONTRIBDIR)/qemu/include/qemu/notify.h \
+ $(CONTRIBDIR)/qemu/include/qemu/option.h \
+ $(CONTRIBDIR)/qemu/include/qemu/option_int.h \
+ $(CONTRIBDIR)/qemu/include/qemu/osdep.h \
+ $(CONTRIBDIR)/qemu/include/qemu/queue.h \
+ $(CONTRIBDIR)/qemu/include/qemu/sockets.h \
+ $(CONTRIBDIR)/qemu/include/qemu/thread-posix.h \
+ $(CONTRIBDIR)/qemu/include/qemu/thread.h \
+ $(CONTRIBDIR)/qemu/include/qemu/timer.h \
+ $(CONTRIBDIR)/qemu/include/qemu/typedefs.h \
+ $(CONTRIBDIR)/qemu/include/sysemu/sysemu.h \
+ $(CONTRIBDIR)/qemu/include/sysemu/os-posix.h \
+ $(CONTRIBDIR)/qemu/block/qcow2.h \
+ $(CONTRIBDIR)/qemu/block/qed.h
+
+noinst_HEADERS = \
+ $(noinst_HEADERS_qemu) \
+ qemu-block.h \
+ qemu-block-memory-types.h \
+ qb-coroutines.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(CONTRIBDIR)/qemu \
+ -I$(CONTRIBDIR)/qemu/include \
+ -DGLUSTER_XLATOR
+
+AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS) $(GLIB_CFLAGS)
+
+CLEANFILES =
+
+endif
diff --git a/xlators/features/qemu-block/src/bdrv-xlator.c b/xlators/features/qemu-block/src/bdrv-xlator.c
new file mode 100644
index 000000000..1e55b5fb7
--- /dev/null
+++ b/xlators/features/qemu-block/src/bdrv-xlator.c
@@ -0,0 +1,389 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "inode.h"
+#include "syncop.h"
+#include "qemu-block.h"
+#include "block/block_int.h"
+
+typedef struct BDRVGlusterState {
+ inode_t *inode;
+} BDRVGlusterState;
+
+static QemuOptsList runtime_opts = {
+ .name = "gluster",
+ .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+ .desc = {
+ {
+ .name = "filename",
+ .type = QEMU_OPT_STRING,
+ .help = "GFID of file",
+ },
+ { /* end of list */ }
+ },
+};
+
+inode_t *
+qb_inode_from_filename (const char *filename)
+{
+ const char *iptr = NULL;
+ inode_t *inode = NULL;
+
+ iptr = filename + 17;
+ sscanf (iptr, "%p", &inode);
+
+ return inode;
+}
+
+
+int
+qb_inode_to_filename (inode_t *inode, char *filename, int size)
+{
+ return snprintf (filename, size, "gluster://inodep:%p", inode);
+}
+
+
+static fd_t *
+fd_from_bs (BlockDriverState *bs)
+{
+ BDRVGlusterState *s = bs->opaque;
+
+ return fd_anonymous (s->inode);
+}
+
+
+static int
+qemu_gluster_open (BlockDriverState *bs, QDict *options, int bdrv_flags)
+{
+ inode_t *inode = NULL;
+ BDRVGlusterState *s = bs->opaque;
+ QemuOpts *opts = NULL;
+ Error *local_err = NULL;
+ const char *filename = NULL;
+ char gfid_str[128];
+ int ret;
+ qb_conf_t *conf = THIS->private;
+
+ opts = qemu_opts_create_nofail(&runtime_opts);
+ qemu_opts_absorb_qdict(opts, options, &local_err);
+ if (error_is_set(&local_err)) {
+ qerror_report_err(local_err);
+ error_free(local_err);
+ return -EINVAL;
+ }
+
+ filename = qemu_opt_get(opts, "filename");
+
+ /*
+ * gfid:<gfid> format means we're opening a backing image.
+ */
+ ret = sscanf(filename, "gluster://gfid:%s", gfid_str);
+ if (ret) {
+ loc_t loc = {0,};
+ struct iatt buf = {0,};
+ uuid_t gfid;
+
+ uuid_parse(gfid_str, gfid);
+
+ loc.inode = inode_find(conf->root_inode->table, gfid);
+ if (!loc.inode) {
+ loc.inode = inode_new(conf->root_inode->table);
+ uuid_copy(loc.inode->gfid, gfid);
+ }
+
+ uuid_copy(loc.gfid, loc.inode->gfid);
+ ret = syncop_lookup(FIRST_CHILD(THIS), &loc, NULL, &buf, NULL,
+ NULL);
+ if (ret) {
+ loc_wipe(&loc);
+ return ret;
+ }
+
+ s->inode = inode_ref(loc.inode);
+ loc_wipe(&loc);
+ } else {
+ inode = qb_inode_from_filename (filename);
+ if (!inode)
+ return -EINVAL;
+
+ s->inode = inode_ref(inode);
+ }
+
+ return 0;
+}
+
+
+static int
+qemu_gluster_create (const char *filename, QEMUOptionParameter *options)
+{
+ uint64_t total_size = 0;
+ inode_t *inode = NULL;
+ fd_t *fd = NULL;
+ struct iatt stat = {0, };
+ int ret = 0;
+
+ inode = qb_inode_from_filename (filename);
+ if (!inode)
+ return -EINVAL;
+
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ total_size = options->value.n / BDRV_SECTOR_SIZE;
+ }
+ options++;
+ }
+
+ fd = fd_anonymous (inode);
+ if (!fd)
+ return -ENOMEM;
+
+ ret = syncop_fstat (FIRST_CHILD(THIS), fd, &stat);
+ if (ret) {
+ fd_unref (fd);
+ return ret;
+ }
+
+ if (stat.ia_size) {
+ /* format ONLY if the filesize is 0 bytes */
+ fd_unref (fd);
+ return -EFBIG;
+ }
+
+ if (total_size) {
+ ret = syncop_ftruncate (FIRST_CHILD(THIS), fd, total_size);
+ if (ret) {
+ fd_unref (fd);
+ return ret;
+ }
+ }
+
+ fd_unref (fd);
+ return 0;
+}
+
+
+static int
+qemu_gluster_co_readv (BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+ QEMUIOVector *qiov)
+{
+ fd_t *fd = NULL;
+ off_t offset = 0;
+ size_t size = 0;
+ struct iovec *iov = NULL;
+ int count = 0;
+ struct iobref *iobref = NULL;
+ int ret = 0;
+
+ fd = fd_from_bs (bs);
+ if (!fd)
+ return -EIO;
+
+ offset = sector_num * BDRV_SECTOR_SIZE;
+ size = nb_sectors * BDRV_SECTOR_SIZE;
+
+ ret = syncop_readv (FIRST_CHILD(THIS), fd, size, offset, 0,
+ &iov, &count, &iobref);
+ if (ret < 0)
+ goto out;
+
+ iov_copy (qiov->iov, qiov->niov, iov, count); /* *choke!* */
+
+out:
+ GF_FREE (iov);
+ if (iobref)
+ iobref_unref (iobref);
+ fd_unref (fd);
+ return ret;
+}
+
+
+static int
+qemu_gluster_co_writev (BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+ QEMUIOVector *qiov)
+{
+ fd_t *fd = NULL;
+ off_t offset = 0;
+ size_t size = 0;
+ struct iobref *iobref = NULL;
+ struct iobuf *iobuf = NULL;
+ struct iovec iov = {0, };
+ int ret = -ENOMEM;
+
+ fd = fd_from_bs (bs);
+ if (!fd)
+ return -EIO;
+
+ offset = sector_num * BDRV_SECTOR_SIZE;
+ size = nb_sectors * BDRV_SECTOR_SIZE;
+
+ iobuf = iobuf_get2 (THIS->ctx->iobuf_pool, size);
+ if (!iobuf)
+ goto out;
+
+ iobref = iobref_new ();
+ if (!iobref) {
+ goto out;
+ }
+
+ iobref_add (iobref, iobuf);
+
+ iov_unload (iobuf_ptr (iobuf), qiov->iov, qiov->niov); /* *choke!* */
+
+ iov.iov_base = iobuf_ptr (iobuf);
+ iov.iov_len = size;
+
+ ret = syncop_writev (FIRST_CHILD(THIS), fd, &iov, 1, offset, iobref, 0);
+
+out:
+ if (iobuf)
+ iobuf_unref (iobuf);
+ if (iobref)
+ iobref_unref (iobref);
+ fd_unref (fd);
+ return ret;
+}
+
+
+static int
+qemu_gluster_co_flush (BlockDriverState *bs)
+{
+ fd_t *fd = NULL;
+ int ret = 0;
+
+ fd = fd_from_bs (bs);
+
+ ret = syncop_flush (FIRST_CHILD(THIS), fd);
+
+ fd_unref (fd);
+
+ return ret;
+}
+
+
+static int
+qemu_gluster_co_fsync (BlockDriverState *bs)
+{
+ fd_t *fd = NULL;
+ int ret = 0;
+
+ fd = fd_from_bs (bs);
+
+ ret = syncop_fsync (FIRST_CHILD(THIS), fd, 0);
+
+ fd_unref (fd);
+
+ return ret;
+}
+
+
+static int
+qemu_gluster_truncate (BlockDriverState *bs, int64_t offset)
+{
+ fd_t *fd = NULL;
+ int ret = 0;
+
+ fd = fd_from_bs (bs);
+
+ ret = syncop_ftruncate (FIRST_CHILD(THIS), fd, offset);
+
+ fd_unref (fd);
+
+ return ret;
+}
+
+
+static int64_t
+qemu_gluster_getlength (BlockDriverState *bs)
+{
+ fd_t *fd = NULL;
+ int ret = 0;
+ struct iatt iatt = {0, };
+
+ fd = fd_from_bs (bs);
+
+ ret = syncop_fstat (FIRST_CHILD(THIS), fd, &iatt);
+ if (ret < 0)
+ return -1;
+
+ return iatt.ia_size;
+}
+
+
+static int64_t
+qemu_gluster_allocated_file_size (BlockDriverState *bs)
+{
+ fd_t *fd = NULL;
+ int ret = 0;
+ struct iatt iatt = {0, };
+
+ fd = fd_from_bs (bs);
+
+ ret = syncop_fstat (FIRST_CHILD(THIS), fd, &iatt);
+ if (ret < 0)
+ return -1;
+
+ return iatt.ia_blocks * 512;
+}
+
+
+static void
+qemu_gluster_close (BlockDriverState *bs)
+{
+ BDRVGlusterState *s = NULL;
+
+ s = bs->opaque;
+
+ inode_unref (s->inode);
+
+ return;
+}
+
+
+static QEMUOptionParameter qemu_gluster_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"
+ },
+ { NULL }
+};
+
+
+static BlockDriver bdrv_gluster = {
+ .format_name = "gluster",
+ .protocol_name = "gluster",
+ .instance_size = sizeof(BDRVGlusterState),
+ .bdrv_file_open = qemu_gluster_open,
+ .bdrv_close = qemu_gluster_close,
+ .bdrv_create = qemu_gluster_create,
+ .bdrv_getlength = qemu_gluster_getlength,
+ .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+ .bdrv_co_readv = qemu_gluster_co_readv,
+ .bdrv_co_writev = qemu_gluster_co_writev,
+ .bdrv_co_flush_to_os = qemu_gluster_co_flush,
+ .bdrv_co_flush_to_disk = qemu_gluster_co_fsync,
+ .bdrv_truncate = qemu_gluster_truncate,
+ .create_options = qemu_gluster_create_options,
+};
+
+
+static void bdrv_gluster_init(void)
+{
+ bdrv_register(&bdrv_gluster);
+}
+
+
+block_init(bdrv_gluster_init);
diff --git a/xlators/features/qemu-block/src/bh-syncop.c b/xlators/features/qemu-block/src/bh-syncop.c
new file mode 100644
index 000000000..e8686f6d4
--- /dev/null
+++ b/xlators/features/qemu-block/src/bh-syncop.c
@@ -0,0 +1,48 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "syncop.h"
+#include "qemu-block-memory-types.h"
+
+#include "block/aio.h"
+
+void
+qemu_bh_schedule (QEMUBH *bh)
+{
+ return;
+}
+
+void
+qemu_bh_cancel (QEMUBH *bh)
+{
+ return;
+}
+
+void
+qemu_bh_delete (QEMUBH *bh)
+{
+
+}
+
+QEMUBH *
+qemu_bh_new (QEMUBHFunc *cb, void *opaque)
+{
+ return NULL;
+}
diff --git a/xlators/features/qemu-block/src/clock-timer.c b/xlators/features/qemu-block/src/clock-timer.c
new file mode 100644
index 000000000..fcbec6ad1
--- /dev/null
+++ b/xlators/features/qemu-block/src/clock-timer.c
@@ -0,0 +1,60 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "syncop.h"
+#include "qemu-block-memory-types.h"
+
+#include "qemu/timer.h"
+
+QEMUClock *vm_clock;
+int use_rt_clock = 0;
+
+QEMUTimer *qemu_new_timer (QEMUClock *clock, int scale,
+ QEMUTimerCB *cb, void *opaque)
+{
+ return NULL;
+}
+
+int64_t qemu_get_clock_ns (QEMUClock *clock)
+{
+ return 0;
+}
+
+void qemu_mod_timer (QEMUTimer *ts, int64_t expire_time)
+{
+ return;
+}
+
+void qemu_free_timer (QEMUTimer *ts)
+{
+
+}
+
+void qemu_del_timer (QEMUTimer *ts)
+{
+
+}
+
+bool qemu_aio_wait()
+{
+ synctask_wake (synctask_get());
+ synctask_yield (synctask_get());
+ return 0;
+}
diff --git a/xlators/features/qemu-block/src/coroutine-synctask.c b/xlators/features/qemu-block/src/coroutine-synctask.c
new file mode 100644
index 000000000..e43988a95
--- /dev/null
+++ b/xlators/features/qemu-block/src/coroutine-synctask.c
@@ -0,0 +1,116 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "syncop.h"
+#include "qemu-block-memory-types.h"
+
+#include "qemu-block.h"
+
+/*
+ * This code serves as the bridge from the main glusterfs context to the qemu
+ * coroutine context via synctask. We create a single threaded syncenv with a
+ * single synctask responsible for processing a queue of coroutines. The qemu
+ * code invoked from within the synctask function handlers uses the ucontext
+ * coroutine implementation and scheduling logic internal to qemu. This
+ * effectively donates a thread of execution to qemu and its internal coroutine
+ * management.
+ *
+ * NOTE: The existence of concurrent synctasks has proven quite racy with regard
+ * to qemu coroutine management, particularly related to the lifecycle
+ * differences with top-level synctasks and internally created coroutines and
+ * interactions with qemu-internal queues (and locks, in turn). We explicitly
+ * disallow this scenario, via the queue, until it is more well supported.
+ */
+
+static struct {
+ struct list_head queue;
+ gf_lock_t lock;
+ struct synctask *task;
+} qb_co;
+
+static void
+init_qbco()
+{
+ INIT_LIST_HEAD(&qb_co.queue);
+ LOCK_INIT(&qb_co.lock);
+}
+
+static int
+synctask_nop_cbk (int ret, call_frame_t *frame, void *opaque)
+{
+ return 0;
+}
+
+static int
+qb_synctask_wrap (void *opaque)
+{
+ qb_local_t *qb_local, *tmp;
+
+ LOCK(&qb_co.lock);
+
+ while (!list_empty(&qb_co.queue)) {
+ list_for_each_entry_safe(qb_local, tmp, &qb_co.queue, list) {
+ list_del_init(&qb_local->list);
+ break;
+ }
+
+ UNLOCK(&qb_co.lock);
+
+ qb_local->synctask_fn(qb_local);
+ /* qb_local is now unwound and gone! */
+
+ LOCK(&qb_co.lock);
+ }
+
+ qb_co.task = NULL;
+
+ UNLOCK(&qb_co.lock);
+
+ return 0;
+}
+
+int
+qb_coroutine (call_frame_t *frame, synctask_fn_t fn)
+{
+ qb_local_t *qb_local = NULL;
+ qb_conf_t *qb_conf = NULL;
+ static int init = 0;
+
+ qb_local = frame->local;
+ qb_local->synctask_fn = fn;
+ qb_conf = frame->this->private;
+
+ if (!init) {
+ init = 1;
+ init_qbco();
+ }
+
+ LOCK(&qb_co.lock);
+
+ if (!qb_co.task)
+ qb_co.task = synctask_create(qb_conf->env, qb_synctask_wrap,
+ synctask_nop_cbk, frame, NULL);
+
+ list_add_tail(&qb_local->list, &qb_co.queue);
+
+ UNLOCK(&qb_co.lock);
+
+ return 0;
+}
diff --git a/xlators/features/qemu-block/src/monitor-logging.c b/xlators/features/qemu-block/src/monitor-logging.c
new file mode 100644
index 000000000..d37c37f0f
--- /dev/null
+++ b/xlators/features/qemu-block/src/monitor-logging.c
@@ -0,0 +1,50 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "qemu-block-memory-types.h"
+
+#include "block/block_int.h"
+
+Monitor *cur_mon;
+
+int
+monitor_cur_is_qmp()
+{
+ /* No QMP support here */
+ return 0;
+}
+
+void
+monitor_set_error (Monitor *mon, QError *qerror)
+{
+ /* NOP here */
+ return;
+}
+
+
+void
+monitor_vprintf(Monitor *mon, const char *fmt, va_list ap)
+{
+ char buf[4096];
+
+ vsnprintf(buf, sizeof(buf), fmt, ap);
+
+ gf_log (THIS->name, GF_LOG_ERROR, "%s", buf);
+}
diff --git a/xlators/features/qemu-block/src/qb-coroutines.c b/xlators/features/qemu-block/src/qb-coroutines.c
new file mode 100644
index 000000000..974312f12
--- /dev/null
+++ b/xlators/features/qemu-block/src/qb-coroutines.c
@@ -0,0 +1,667 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "inode.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "qemu-block-memory-types.h"
+#include "qemu-block.h"
+#include "qb-coroutines.h"
+
+
+int
+qb_format_and_resume (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ char filename[64];
+ char base_filename[128];
+ int use_base = 0;
+ qb_inode_t *qb_inode = NULL;
+ Error *local_err = NULL;
+ fd_t *fd = NULL;
+ dict_t *xattr = NULL;
+ qb_conf_t *qb_conf = NULL;
+ int ret = -1;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+ qb_conf = frame->this->private;
+
+ qb_inode_to_filename (inode, filename, 64);
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+
+ /*
+ * See if the caller specified a backing image.
+ */
+ if (!uuid_is_null(qb_inode->backing_gfid) || qb_inode->backing_fname) {
+ loc_t loc = {0,};
+ char gfid_str[64];
+ struct iatt buf;
+
+ if (!uuid_is_null(qb_inode->backing_gfid)) {
+ loc.inode = inode_find(qb_conf->root_inode->table,
+ qb_inode->backing_gfid);
+ if (!loc.inode) {
+ loc.inode = inode_new(qb_conf->root_inode->table);
+ uuid_copy(loc.inode->gfid,
+ qb_inode->backing_gfid);
+ }
+ uuid_copy(loc.gfid, loc.inode->gfid);
+ } else if (qb_inode->backing_fname) {
+ loc.inode = inode_new(qb_conf->root_inode->table);
+ loc.name = qb_inode->backing_fname;
+ loc.parent = inode_parent(inode, NULL, NULL);
+ loc_path(&loc, loc.name);
+ }
+
+ /*
+ * Lookup the backing image. Verify existence and/or get the
+ * gfid if we don't already have it.
+ */
+ ret = syncop_lookup(FIRST_CHILD(frame->this), &loc, NULL, &buf,
+ NULL, NULL);
+ GF_FREE(qb_inode->backing_fname);
+ if (ret) {
+ loc_wipe(&loc);
+ ret = -ret;
+ goto err;
+ }
+
+ uuid_copy(qb_inode->backing_gfid, buf.ia_gfid);
+ loc_wipe(&loc);
+
+ /*
+ * We pass the filename of the backing image into the qemu block
+ * subsystem as the associated gfid. This is embedded into the
+ * clone image and passed along to the gluster bdrv backend when
+ * the block subsystem needs to operate on the backing image on
+ * behalf of the clone.
+ */
+ uuid_unparse(qb_inode->backing_gfid, gfid_str);
+ snprintf(base_filename, sizeof(base_filename),
+ "gluster://gfid:%s", gfid_str);
+ use_base = 1;
+ }
+
+ bdrv_img_create (filename, qb_inode->fmt,
+ use_base ? base_filename : NULL, 0, 0, qb_inode->size,
+ 0, &local_err, true);
+
+ if (error_is_set (&local_err)) {
+ gf_log (frame->this->name, GF_LOG_ERROR, "%s",
+ error_get_pretty (local_err));
+ error_free (local_err);
+ QB_STUB_UNWIND (stub, -1, EIO);
+ return 0;
+ }
+
+ fd = fd_anonymous (inode);
+ if (!fd) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "could not create anonymous fd for %s",
+ uuid_utoa (inode->gfid));
+ QB_STUB_UNWIND (stub, -1, ENOMEM);
+ return 0;
+ }
+
+ xattr = dict_new ();
+ if (!xattr) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "could not allocate xattr dict for %s",
+ uuid_utoa (inode->gfid));
+ QB_STUB_UNWIND (stub, -1, ENOMEM);
+ fd_unref (fd);
+ return 0;
+ }
+
+ ret = dict_set_str (xattr, qb_conf->qb_xattr_key, local->fmt);
+ if (ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "could not dict_set for %s",
+ uuid_utoa (inode->gfid));
+ QB_STUB_UNWIND (stub, -1, ENOMEM);
+ fd_unref (fd);
+ dict_unref (xattr);
+ return 0;
+ }
+
+ ret = syncop_fsetxattr (FIRST_CHILD(THIS), fd, xattr, 0);
+ if (ret) {
+ gf_log (frame->this->name, GF_LOG_ERROR,
+ "failed to setxattr for %s",
+ uuid_utoa (inode->gfid));
+ QB_STUB_UNWIND (stub, -1, -ret);
+ fd_unref (fd);
+ dict_unref (xattr);
+ return 0;
+ }
+
+ fd_unref (fd);
+ dict_unref (xattr);
+
+ QB_STUB_UNWIND (stub, 0, 0);
+
+ return 0;
+
+err:
+ QB_STUB_UNWIND(stub, -1, ret);
+ return 0;
+}
+
+
+static BlockDriverState *
+qb_bs_create (inode_t *inode, const char *fmt)
+{
+ char filename[64];
+ BlockDriverState *bs = NULL;
+ BlockDriver *drv = NULL;
+ int op_errno = 0;
+ int ret = 0;
+
+ bs = bdrv_new (uuid_utoa (inode->gfid));
+ if (!bs) {
+ op_errno = ENOMEM;
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "could not allocate @bdrv for gfid:%s",
+ uuid_utoa (inode->gfid));
+ goto err;
+ }
+
+ drv = bdrv_find_format (fmt);
+ if (!drv) {
+ op_errno = EINVAL;
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Unknown file format: %s for gfid:%s",
+ fmt, uuid_utoa (inode->gfid));
+ goto err;
+ }
+
+ qb_inode_to_filename (inode, filename, 64);
+
+ ret = bdrv_open (bs, filename, NULL, BDRV_O_RDWR, drv);
+ if (ret < 0) {
+ op_errno = -ret;
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Unable to bdrv_open() gfid:%s (%s)",
+ uuid_utoa (inode->gfid), strerror (op_errno));
+ goto err;
+ }
+
+ return bs;
+err:
+ errno = op_errno;
+ return NULL;
+}
+
+
+int
+qb_co_open (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+ qb_inode->refcnt++;
+
+ QB_STUB_RESUME (stub);
+
+ return 0;
+}
+
+
+int
+qb_co_writev (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ QEMUIOVector qiov = {0, };
+ int ret = 0;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+
+ qemu_iovec_init_external (&qiov, stub->args.vector, stub->args.count);
+
+ ret = bdrv_pwritev (qb_inode->bs, stub->args.offset, &qiov);
+
+ if (ret < 0) {
+ QB_STUB_UNWIND (stub, -1, -ret);
+ } else {
+ QB_STUB_UNWIND (stub, ret, 0);
+ }
+
+ return 0;
+}
+
+
+int
+qb_co_readv (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ struct iobuf *iobuf = NULL;
+ struct iobref *iobref = NULL;
+ struct iovec iov = {0, };
+ int ret = 0;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+
+ if (stub->args.offset >= qb_inode->size) {
+ QB_STUB_UNWIND (stub, 0, 0);
+ return 0;
+ }
+
+ iobuf = iobuf_get2 (frame->this->ctx->iobuf_pool, stub->args.size);
+ if (!iobuf) {
+ QB_STUB_UNWIND (stub, -1, ENOMEM);
+ return 0;
+ }
+
+ iobref = iobref_new ();
+ if (!iobref) {
+ QB_STUB_UNWIND (stub, -1, ENOMEM);
+ iobuf_unref (iobuf);
+ return 0;
+ }
+
+ if (iobref_add (iobref, iobuf) < 0) {
+ iobuf_unref (iobuf);
+ iobref_unref (iobref);
+ QB_STUB_UNWIND (stub, -1, ENOMEM);
+ return 0;
+ }
+
+ ret = bdrv_pread (qb_inode->bs, stub->args.offset, iobuf_ptr (iobuf),
+ stub->args.size);
+
+ if (ret < 0) {
+ QB_STUB_UNWIND (stub, -1, -ret);
+ iobref_unref (iobref);
+ return 0;
+ }
+
+ iov.iov_base = iobuf_ptr (iobuf);
+ iov.iov_len = ret;
+
+ stub->args_cbk.vector = iov_dup (&iov, 1);
+ stub->args_cbk.count = 1;
+ stub->args_cbk.iobref = iobref;
+
+ QB_STUB_UNWIND (stub, ret, 0);
+
+ return 0;
+}
+
+
+int
+qb_co_fsync (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ int ret = 0;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+
+ ret = bdrv_flush (qb_inode->bs);
+
+ if (ret < 0) {
+ QB_STUB_UNWIND (stub, -1, -ret);
+ } else {
+ QB_STUB_UNWIND (stub, ret, 0);
+ }
+
+ return 0;
+}
+
+
+static void
+qb_update_size_xattr (xlator_t *this, fd_t *fd, const char *fmt, off_t offset)
+{
+ char val[QB_XATTR_VAL_MAX];
+ qb_conf_t *qb_conf = NULL;
+ dict_t *xattr = NULL;
+
+ qb_conf = this->private;
+
+ snprintf (val, QB_XATTR_VAL_MAX, "%s:%llu",
+ fmt, (long long unsigned) offset);
+
+ xattr = dict_new ();
+ if (!xattr)
+ return;
+
+ if (dict_set_str (xattr, qb_conf->qb_xattr_key, val) != 0) {
+ dict_unref (xattr);
+ return;
+ }
+
+ syncop_fsetxattr (FIRST_CHILD(this), fd, xattr, 0);
+ dict_unref (xattr);
+}
+
+
+int
+qb_co_truncate (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ int ret = 0;
+ off_t offset = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+
+ ret = syncop_fstat (FIRST_CHILD(this), local->fd,
+ &stub->args_cbk.prestat);
+ if (ret < 0)
+ goto out;
+ stub->args_cbk.prestat.ia_size = qb_inode->size;
+
+ ret = bdrv_truncate (qb_inode->bs, stub->args.offset);
+ if (ret < 0)
+ goto out;
+
+ offset = bdrv_getlength (qb_inode->bs);
+
+ qb_inode->size = offset;
+
+ ret = syncop_fstat (FIRST_CHILD(this), local->fd,
+ &stub->args_cbk.poststat);
+ if (ret < 0)
+ goto out;
+ stub->args_cbk.poststat.ia_size = qb_inode->size;
+
+ qb_update_size_xattr (this, local->fd, qb_inode->fmt, qb_inode->size);
+
+out:
+ if (ret < 0) {
+ QB_STUB_UNWIND (stub, -1, -ret);
+ } else {
+ QB_STUB_UNWIND (stub, ret, 0);
+ }
+
+ return 0;
+}
+
+
+int
+qb_co_close (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ BlockDriverState *bs = NULL;
+
+ local = opaque;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (THIS, inode);
+
+ if (!--qb_inode->refcnt) {
+ bs = qb_inode->bs;
+ qb_inode->bs = NULL;
+ bdrv_delete (bs);
+ }
+
+ frame = local->frame;
+ frame->local = NULL;
+ qb_local_free (THIS, local);
+ STACK_DESTROY (frame->root);
+
+ return 0;
+}
+
+
+int
+qb_snapshot_create (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ QEMUSnapshotInfo sn;
+ struct timeval tv = {0, };
+ int ret = 0;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+
+ memset (&sn, 0, sizeof (sn));
+ pstrcpy (sn.name, sizeof(sn.name), local->name);
+ gettimeofday (&tv, NULL);
+ sn.date_sec = tv.tv_sec;
+ sn.date_nsec = tv.tv_usec * 1000;
+
+ ret = bdrv_snapshot_create (qb_inode->bs, &sn);
+ if (ret < 0) {
+ QB_STUB_UNWIND (stub, -1, -ret);
+ } else {
+ QB_STUB_UNWIND (stub, ret, 0);
+ }
+
+ return 0;
+}
+
+
+int
+qb_snapshot_delete (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ int ret = 0;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+
+ ret = bdrv_snapshot_delete (qb_inode->bs, local->name);
+
+ if (ret < 0) {
+ QB_STUB_UNWIND (stub, -1, -ret);
+ } else {
+ QB_STUB_UNWIND (stub, ret, 0);
+ }
+
+ return 0;
+}
+
+
+int
+qb_snapshot_goto (void *opaque)
+{
+ qb_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ call_stub_t *stub = NULL;
+ inode_t *inode = NULL;
+ qb_inode_t *qb_inode = NULL;
+ int ret = 0;
+
+ local = opaque;
+ frame = local->frame;
+ stub = local->stub;
+ inode = local->inode;
+
+ qb_inode = qb_inode_ctx_get (frame->this, inode);
+ if (!qb_inode->bs) {
+ /* FIXME: we need locks around this when
+ enabling multithreaded syncop/coroutine
+ for qemu-block
+ */
+
+ qb_inode->bs = qb_bs_create (inode, qb_inode->fmt);
+ if (!qb_inode->bs) {
+ QB_STUB_UNWIND (stub, -1, errno);
+ return 0;
+ }
+ }
+
+ ret = bdrv_snapshot_goto (qb_inode->bs, local->name);
+
+ if (ret < 0) {
+ QB_STUB_UNWIND (stub, -1, -ret);
+ } else {
+ QB_STUB_UNWIND (stub, ret, 0);
+ }
+
+ return 0;
+}
diff --git a/xlators/features/qemu-block/src/qb-coroutines.h b/xlators/features/qemu-block/src/qb-coroutines.h
new file mode 100644
index 000000000..583319f3b
--- /dev/null
+++ b/xlators/features/qemu-block/src/qb-coroutines.h
@@ -0,0 +1,30 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __QB_COROUTINES_H
+#define __QB_COROUTINES_H
+
+#include "syncop.h"
+#include "call-stub.h"
+#include "block/block_int.h"
+#include "monitor/monitor.h"
+
+int qb_format_and_resume (void *opaque);
+int qb_snapshot_create (void *opaque);
+int qb_snapshot_delete (void *opaque);
+int qb_snapshot_goto (void *opaque);
+int qb_co_open (void *opaque);
+int qb_co_close (void *opaque);
+int qb_co_writev (void *opaque);
+int qb_co_readv (void *opaque);
+int qb_co_fsync (void *opaque);
+int qb_co_truncate (void *opaque);
+
+#endif /* __QB_COROUTINES_H */
diff --git a/xlators/features/qemu-block/src/qemu-block-memory-types.h b/xlators/features/qemu-block/src/qemu-block-memory-types.h
new file mode 100644
index 000000000..267b3893f
--- /dev/null
+++ b/xlators/features/qemu-block/src/qemu-block-memory-types.h
@@ -0,0 +1,25 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef __QB_MEM_TYPES_H__
+#define __QB_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+enum gf_qb_mem_types_ {
+ gf_qb_mt_qb_conf_t = gf_common_mt_end + 1,
+ gf_qb_mt_qb_inode_t,
+ gf_qb_mt_qb_local_t,
+ gf_qb_mt_coroutinesynctask_t,
+ gf_qb_mt_end
+};
+#endif
+
diff --git a/xlators/features/qemu-block/src/qemu-block.c b/xlators/features/qemu-block/src/qemu-block.c
new file mode 100644
index 000000000..48bbf3140
--- /dev/null
+++ b/xlators/features/qemu-block/src/qemu-block.c
@@ -0,0 +1,1140 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "inode.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "qemu-block-memory-types.h"
+#include "qemu-block.h"
+#include "qb-coroutines.h"
+
+
+qb_inode_t *
+__qb_inode_ctx_get (xlator_t *this, inode_t *inode)
+{
+ uint64_t value = 0;
+ qb_inode_t *qb_inode = NULL;
+
+ __inode_ctx_get (inode, this, &value);
+ qb_inode = (qb_inode_t *)(unsigned long) value;
+
+ return qb_inode;
+}
+
+
+qb_inode_t *
+qb_inode_ctx_get (xlator_t *this, inode_t *inode)
+{
+ qb_inode_t *qb_inode = NULL;
+
+ LOCK (&inode->lock);
+ {
+ qb_inode = __qb_inode_ctx_get (this, inode);
+ }
+ UNLOCK (&inode->lock);
+
+ return qb_inode;
+}
+
+
+qb_inode_t *
+qb_inode_ctx_del (xlator_t *this, inode_t *inode)
+{
+ uint64_t value = 0;
+ qb_inode_t *qb_inode = NULL;
+
+ inode_ctx_del (inode, this, &value);
+ qb_inode = (qb_inode_t *)(unsigned long) value;
+
+ return qb_inode;
+}
+
+
+int
+qb_inode_cleanup (xlator_t *this, inode_t *inode, int warn)
+{
+ qb_inode_t *qb_inode = NULL;
+
+ qb_inode = qb_inode_ctx_del (this, inode);
+
+ if (!qb_inode)
+ return 0;
+
+ if (warn)
+ gf_log (this->name, GF_LOG_WARNING,
+ "inode %s no longer block formatted",
+ uuid_utoa (inode->gfid));
+
+ /* free (qb_inode->bs); */
+
+ GF_FREE (qb_inode);
+
+ return 0;
+}
+
+
+int
+qb_iatt_fixup (xlator_t *this, inode_t *inode, struct iatt *iatt)
+{
+ qb_inode_t *qb_inode = NULL;
+
+ qb_inode = qb_inode_ctx_get (this, inode);
+ if (!qb_inode)
+ return 0;
+
+ iatt->ia_size = qb_inode->size;
+
+ return 0;
+}
+
+
+int
+qb_format_extract (xlator_t *this, char *format, inode_t *inode)
+{
+ char *s, *save;
+ uint64_t size = 0;
+ char fmt[QB_XATTR_VAL_MAX+1] = {0, };
+ qb_inode_t *qb_inode = NULL;
+ char *formatstr = NULL;
+ uuid_t gfid = {0,};
+ char gfid_str[64] = {0,};
+ int ret;
+
+ strncpy(fmt, format, QB_XATTR_VAL_MAX);
+
+ s = strtok_r(fmt, ":", &save);
+ if (!s)
+ goto invalid;
+ formatstr = gf_strdup(s);
+
+ s = strtok_r(NULL, ":", &save);
+ if (!s)
+ goto invalid;
+ if (gf_string2bytesize (s, &size))
+ goto invalid;
+ if (!size)
+ goto invalid;
+
+ s = strtok_r(NULL, "\0", &save);
+ if (s && !strncmp(s, "<gfid:", strlen("<gfid:"))) {
+ /*
+ * Check for valid gfid backing image specifier.
+ */
+ if (strlen(s) + 1 > sizeof(gfid_str))
+ goto invalid;
+ ret = sscanf(s, "<gfid:%[^>]s", gfid_str);
+ if (ret == 1) {
+ ret = uuid_parse(gfid_str, gfid);
+ if (ret < 0)
+ goto invalid;
+ }
+ }
+
+ qb_inode = qb_inode_ctx_get (this, inode);
+ if (!qb_inode)
+ qb_inode = GF_CALLOC (1, sizeof (*qb_inode),
+ gf_qb_mt_qb_inode_t);
+ if (!qb_inode) {
+ GF_FREE(formatstr);
+ return ENOMEM;
+ }
+
+ strncpy(qb_inode->fmt, formatstr, QB_XATTR_VAL_MAX);
+ qb_inode->size = size;
+
+ /*
+ * If a backing gfid was not specified, interpret any remaining bytes
+ * associated with a backing image as a filename local to the parent
+ * directory. The format processing will validate further.
+ */
+ if (!uuid_is_null(gfid))
+ uuid_copy(qb_inode->backing_gfid, gfid);
+ else if (s)
+ qb_inode->backing_fname = gf_strdup(s);
+
+ inode_ctx_set (inode, this, (void *)&qb_inode);
+
+ GF_FREE(formatstr);
+
+ return 0;
+
+invalid:
+ GF_FREE(formatstr);
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "invalid format '%s' in inode %s", format,
+ uuid_utoa (inode->gfid));
+ return EINVAL;
+}
+
+
+void
+qb_local_free (xlator_t *this, qb_local_t *local)
+{
+ if (local->inode)
+ inode_unref (local->inode);
+ if (local->fd)
+ fd_unref (local->fd);
+ GF_FREE (local);
+}
+
+
+int
+qb_local_init (call_frame_t *frame)
+{
+ qb_local_t *qb_local = NULL;
+
+ qb_local = GF_CALLOC (1, sizeof (*qb_local), gf_qb_mt_qb_local_t);
+ if (!qb_local)
+ return -1;
+ INIT_LIST_HEAD(&qb_local->list);
+
+ qb_local->frame = frame;
+ frame->local = qb_local;
+
+ return 0;
+}
+
+
+int
+qb_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *buf,
+ dict_t *xdata, struct iatt *postparent)
+{
+ char *format = NULL;
+ qb_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (op_ret == -1)
+ goto out;
+
+ /*
+ * Cache the root inode for dealing with backing images. The format
+ * coroutine and the gluster qemu backend driver both use the root inode
+ * table to verify and/or redirect I/O to the backing image via
+ * anonymous fd's.
+ */
+ if (!conf->root_inode && __is_root_gfid(inode->gfid))
+ conf->root_inode = inode_ref(inode);
+
+ if (!xdata)
+ goto out;
+
+ if (dict_get_str (xdata, conf->qb_xattr_key, &format))
+ goto out;
+
+ if (!format) {
+ qb_inode_cleanup (this, inode, 1);
+ goto out;
+ }
+
+ op_errno = qb_format_extract (this, format, inode);
+ if (op_errno)
+ op_ret = -1;
+
+ qb_iatt_fixup (this, inode, buf);
+out:
+ QB_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf,
+ xdata, postparent);
+ return 0;
+}
+
+
+int
+qb_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ qb_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ xdata = xdata ? dict_ref (xdata) : dict_new ();
+
+ if (!xdata)
+ goto enomem;
+
+ if (dict_set_int32 (xdata, conf->qb_xattr_key, 0))
+ goto enomem;
+
+ STACK_WIND (frame, qb_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
+ dict_unref (xdata);
+ return 0;
+enomem:
+ QB_STACK_UNWIND (lookup, frame, -1, ENOMEM, 0, 0, 0, 0);
+ if (xdata)
+ dict_unref (xdata);
+ return 0;
+}
+
+
+int
+qb_setxattr_format (call_frame_t *frame, xlator_t *this, call_stub_t *stub,
+ dict_t *xattr, inode_t *inode)
+{
+ char *format = NULL;
+ int op_errno = 0;
+ qb_local_t *qb_local = NULL;
+ data_t *data = NULL;
+ qb_inode_t *qb_inode;
+
+ if (!(data = dict_get (xattr, "trusted.glusterfs.block-format"))) {
+ QB_STUB_RESUME (stub);
+ return 0;
+ }
+
+ format = alloca (data->len + 1);
+ memcpy (format, data->data, data->len);
+ format[data->len] = 0;
+
+ op_errno = qb_format_extract (this, format, inode);
+ if (op_errno) {
+ QB_STUB_UNWIND (stub, -1, op_errno);
+ return 0;
+ }
+ qb_inode = qb_inode_ctx_get(this, inode);
+
+ qb_local = frame->local;
+
+ qb_local->stub = stub;
+ qb_local->inode = inode_ref (inode);
+
+ snprintf(qb_local->fmt, QB_XATTR_VAL_MAX, "%s:%lu", qb_inode->fmt,
+ qb_inode->size);
+
+ qb_coroutine (frame, qb_format_and_resume);
+
+ return 0;
+}
+
+
+int
+qb_setxattr_snapshot_create (call_frame_t *frame, xlator_t *this,
+ call_stub_t *stub, dict_t *xattr, inode_t *inode)
+{
+ qb_local_t *qb_local = NULL;
+ char *name = NULL;
+ data_t *data = NULL;
+
+ if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-create"))) {
+ QB_STUB_RESUME (stub);
+ return 0;
+ }
+
+ name = alloca (data->len + 1);
+ memcpy (name, data->data, data->len);
+ name[data->len] = 0;
+
+ qb_local = frame->local;
+
+ qb_local->stub = stub;
+ qb_local->inode = inode_ref (inode);
+ strncpy (qb_local->name, name, 128);
+
+ qb_coroutine (frame, qb_snapshot_create);
+
+ return 0;
+}
+
+
+int
+qb_setxattr_snapshot_delete (call_frame_t *frame, xlator_t *this,
+ call_stub_t *stub, dict_t *xattr, inode_t *inode)
+{
+ qb_local_t *qb_local = NULL;
+ char *name = NULL;
+ data_t *data = NULL;
+
+ if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-delete"))) {
+ QB_STUB_RESUME (stub);
+ return 0;
+ }
+
+ name = alloca (data->len + 1);
+ memcpy (name, data->data, data->len);
+ name[data->len] = 0;
+
+ qb_local = frame->local;
+
+ qb_local->stub = stub;
+ qb_local->inode = inode_ref (inode);
+ strncpy (qb_local->name, name, 128);
+
+ qb_coroutine (frame, qb_snapshot_delete);
+
+ return 0;
+}
+
+int
+qb_setxattr_snapshot_goto (call_frame_t *frame, xlator_t *this,
+ call_stub_t *stub, dict_t *xattr, inode_t *inode)
+{
+ qb_local_t *qb_local = NULL;
+ char *name = NULL;
+ data_t *data = NULL;
+
+ if (!(data = dict_get (xattr, "trusted.glusterfs.block-snapshot-goto"))) {
+ QB_STUB_RESUME (stub);
+ return 0;
+ }
+
+ name = alloca (data->len + 1);
+ memcpy (name, data->data, data->len);
+ name[data->len] = 0;
+
+ qb_local = frame->local;
+
+ qb_local->stub = stub;
+ qb_local->inode = inode_ref (inode);
+ strncpy (qb_local->name, name, 128);
+
+ qb_coroutine (frame, qb_snapshot_goto);
+
+ return 0;
+}
+
+
+int
+qb_setxattr_common (call_frame_t *frame, xlator_t *this, call_stub_t *stub,
+ dict_t *xattr, inode_t *inode)
+{
+ data_t *data = NULL;
+
+ if ((data = dict_get (xattr, "trusted.glusterfs.block-format"))) {
+ qb_setxattr_format (frame, this, stub, xattr, inode);
+ return 0;
+ }
+
+ if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-create"))) {
+ qb_setxattr_snapshot_create (frame, this, stub, xattr, inode);
+ return 0;
+ }
+
+ if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-delete"))) {
+ qb_setxattr_snapshot_delete (frame, this, stub, xattr, inode);
+ return 0;
+ }
+
+ if ((data = dict_get (xattr, "trusted.glusterfs.block-snapshot-goto"))) {
+ qb_setxattr_snapshot_goto (frame, this, stub, xattr, inode);
+ return 0;
+ }
+
+ QB_STUB_RESUME (stub);
+
+ return 0;
+}
+
+
+int
+qb_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr,
+ int flags, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ stub = fop_setxattr_stub (frame, default_setxattr_resume, loc, xattr,
+ flags, xdata);
+ if (!stub)
+ goto enomem;
+
+ qb_setxattr_common (frame, this, stub, xattr, loc->inode);
+
+ return 0;
+enomem:
+ QB_STACK_UNWIND (setxattr, frame, -1, ENOMEM, 0);
+ return 0;
+}
+
+
+int
+qb_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr,
+ int flags, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ stub = fop_fsetxattr_stub (frame, default_fsetxattr_resume, fd, xattr,
+ flags, xdata);
+ if (!stub)
+ goto enomem;
+
+ qb_setxattr_common (frame, this, stub, xattr, fd->inode);
+
+ return 0;
+enomem:
+ QB_STACK_UNWIND (fsetxattr, frame, -1, ENOMEM, 0);
+ return 0;
+}
+
+
+int
+qb_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+ qb_local_t *qb_local = NULL;
+
+ qb_local = frame->local;
+
+ if (op_ret < 0)
+ goto unwind;
+
+ if (!qb_inode_ctx_get (this, qb_local->inode))
+ goto unwind;
+
+ stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd, xdata);
+ if (!stub) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ qb_local->stub = stub;
+
+ qb_coroutine (frame, qb_co_open);
+
+ return 0;
+unwind:
+ QB_STACK_UNWIND (open, frame, op_ret, op_errno, fd, xdata);
+ return 0;
+}
+
+
+int
+qb_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ fd_t *fd, dict_t *xdata)
+{
+ qb_local_t *qb_local = NULL;
+ qb_inode_t *qb_inode = NULL;
+
+ qb_inode = qb_inode_ctx_get (this, loc->inode);
+ if (!qb_inode) {
+ STACK_WIND (frame, default_open_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open, loc, flags, fd,
+ xdata);
+ return 0;
+ }
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ qb_local = frame->local;
+
+ qb_local->inode = inode_ref (loc->inode);
+ qb_local->fd = fd_ref (fd);
+
+ STACK_WIND (frame, qb_open_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+ return 0;
+enomem:
+ QB_STACK_UNWIND (open, frame, -1, ENOMEM, 0, 0);
+ return 0;
+}
+
+
+int
+qb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
+{
+ qb_local_t *qb_local = NULL;
+ qb_inode_t *qb_inode = NULL;
+
+ qb_inode = qb_inode_ctx_get (this, fd->inode);
+ if (!qb_inode) {
+ STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count,
+ offset, flags, iobref, xdata);
+ return 0;
+ }
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ qb_local = frame->local;
+
+ qb_local->inode = inode_ref (fd->inode);
+ qb_local->fd = fd_ref (fd);
+
+ qb_local->stub = fop_writev_stub (frame, NULL, fd, vector, count,
+ offset, flags, iobref, xdata);
+ if (!qb_local->stub)
+ goto enomem;
+
+ qb_coroutine (frame, qb_co_writev);
+
+ return 0;
+enomem:
+ QB_STACK_UNWIND (writev, frame, -1, ENOMEM, 0, 0, 0);
+ return 0;
+}
+
+
+int
+qb_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ qb_local_t *qb_local = NULL;
+ qb_inode_t *qb_inode = NULL;
+
+ qb_inode = qb_inode_ctx_get (this, fd->inode);
+ if (!qb_inode) {
+ STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readv, fd, size, offset,
+ flags, xdata);
+ return 0;
+ }
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ qb_local = frame->local;
+
+ qb_local->inode = inode_ref (fd->inode);
+ qb_local->fd = fd_ref (fd);
+
+ qb_local->stub = fop_readv_stub (frame, NULL, fd, size, offset,
+ flags, xdata);
+ if (!qb_local->stub)
+ goto enomem;
+
+ qb_coroutine (frame, qb_co_readv);
+
+ return 0;
+enomem:
+ QB_STACK_UNWIND (readv, frame, -1, ENOMEM, 0, 0, 0, 0, 0);
+ return 0;
+}
+
+
+int
+qb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int dsync,
+ dict_t *xdata)
+{
+ qb_local_t *qb_local = NULL;
+ qb_inode_t *qb_inode = NULL;
+
+ qb_inode = qb_inode_ctx_get (this, fd->inode);
+ if (!qb_inode) {
+ STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsync, fd, dsync, xdata);
+ return 0;
+ }
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ qb_local = frame->local;
+
+ qb_local->inode = inode_ref (fd->inode);
+ qb_local->fd = fd_ref (fd);
+
+ qb_local->stub = fop_fsync_stub (frame, NULL, fd, dsync, xdata);
+
+ if (!qb_local->stub)
+ goto enomem;
+
+ qb_coroutine (frame, qb_co_fsync);
+
+ return 0;
+enomem:
+ QB_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0);
+ return 0;
+}
+
+
+int
+qb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ qb_local_t *qb_local = NULL;
+ qb_inode_t *qb_inode = NULL;
+
+ qb_inode = qb_inode_ctx_get (this, fd->inode);
+ if (!qb_inode) {
+ STACK_WIND (frame, default_flush_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->flush, fd, xdata);
+ return 0;
+ }
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ qb_local = frame->local;
+
+ qb_local->inode = inode_ref (fd->inode);
+ qb_local->fd = fd_ref (fd);
+
+ qb_local->stub = fop_flush_stub (frame, NULL, fd, xdata);
+
+ if (!qb_local->stub)
+ goto enomem;
+
+ qb_coroutine (frame, qb_co_fsync);
+
+ return 0;
+enomem:
+ QB_STACK_UNWIND (flush, frame, -1, ENOMEM, 0);
+ return 0;
+}
+
+static int32_t
+qb_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ qb_conf_t *conf = this->private;
+ gf_dirent_t *entry;
+ char *format;
+
+ list_for_each_entry(entry, &entries->list, list) {
+ if (!entry->inode || !entry->dict)
+ continue;
+
+ format = NULL;
+ if (dict_get_str(entry->dict, conf->qb_xattr_key, &format))
+ continue;
+
+ if (!format) {
+ qb_inode_cleanup(this, entry->inode, 1);
+ continue;
+ }
+
+ if (qb_format_extract(this, format, entry->inode))
+ continue;
+
+ qb_iatt_fixup(this, entry->inode, &entry->d_stat);
+ }
+
+ STACK_UNWIND_STRICT(readdirp, frame, op_ret, op_errno, entries, xdata);
+ return 0;
+}
+
+static int32_t
+qb_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *xdata)
+{
+ qb_conf_t *conf = this->private;
+
+ xdata = xdata ? dict_ref(xdata) : dict_new();
+ if (!xdata)
+ goto enomem;
+
+ if (dict_set_int32 (xdata, conf->qb_xattr_key, 0))
+ goto enomem;
+
+ STACK_WIND(frame, qb_readdirp_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata);
+
+ dict_unref(xdata);
+ return 0;
+
+enomem:
+ QB_STACK_UNWIND(readdirp, frame, -1, ENOMEM, NULL, NULL);
+ if (xdata)
+ dict_unref(xdata);
+ return 0;
+}
+
+int
+qb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
+{
+ qb_local_t *qb_local = NULL;
+ qb_inode_t *qb_inode = NULL;
+
+ qb_inode = qb_inode_ctx_get (this, loc->inode);
+ if (!qb_inode) {
+ STACK_WIND (frame, default_truncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->truncate, loc, offset,
+ xdata);
+ return 0;
+ }
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ qb_local = frame->local;
+
+ qb_local->inode = inode_ref (loc->inode);
+ qb_local->fd = fd_anonymous (loc->inode);
+
+ qb_local->stub = fop_truncate_stub (frame, NULL, loc, offset, xdata);
+
+ if (!qb_local->stub)
+ goto enomem;
+
+ qb_coroutine (frame, qb_co_truncate);
+
+ return 0;
+enomem:
+ QB_STACK_UNWIND (truncate, frame, -1, ENOMEM, 0, 0, 0);
+ return 0;
+}
+
+
+int
+qb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
+{
+ qb_local_t *qb_local = NULL;
+ qb_inode_t *qb_inode = NULL;
+
+ qb_inode = qb_inode_ctx_get (this, fd->inode);
+ if (!qb_inode) {
+ STACK_WIND (frame, default_ftruncate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd, offset,
+ xdata);
+ return 0;
+ }
+
+ if (qb_local_init (frame) != 0)
+ goto enomem;
+
+ qb_local = frame->local;
+
+ qb_local->inode = inode_ref (fd->inode);
+ qb_local->fd = fd_ref (fd);
+
+ qb_local->stub = fop_ftruncate_stub (frame, NULL, fd, offset, xdata);
+
+ if (!qb_local->stub)
+ goto enomem;
+
+ qb_coroutine (frame, qb_co_truncate);
+
+ return 0;
+enomem:
+ QB_STACK_UNWIND (ftruncate, frame, -1, ENOMEM, 0, 0, 0);
+ return 0;
+}
+
+
+int
+qb_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *iatt, dict_t *xdata)
+{
+ inode_t *inode = NULL;
+
+ inode = frame->local;
+ frame->local = NULL;
+
+ if (inode) {
+ qb_iatt_fixup (this, inode, iatt);
+ inode_unref (inode);
+ }
+
+ QB_STACK_UNWIND (stat, frame, op_ret, op_errno, iatt, xdata);
+
+ return 0;
+}
+
+int
+qb_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ if (qb_inode_ctx_get (this, loc->inode))
+ frame->local = inode_ref (loc->inode);
+
+ STACK_WIND (frame, qb_stat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->stat, loc, xdata);
+ return 0;
+}
+
+
+int
+qb_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *iatt, dict_t *xdata)
+{
+ inode_t *inode = NULL;
+
+ inode = frame->local;
+ frame->local = NULL;
+
+ if (inode) {
+ qb_iatt_fixup (this, inode, iatt);
+ inode_unref (inode);
+ }
+
+ QB_STACK_UNWIND (fstat, frame, op_ret, op_errno, iatt, xdata);
+
+ return 0;
+}
+
+
+int
+qb_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ if (qb_inode_ctx_get (this, fd->inode))
+ frame->local = inode_ref (fd->inode);
+
+ STACK_WIND (frame, qb_fstat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat, fd, xdata);
+ return 0;
+}
+
+
+int
+qb_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre, struct iatt *post,
+ dict_t *xdata)
+{
+ inode_t *inode = NULL;
+
+ inode = frame->local;
+ frame->local = NULL;
+
+ if (inode) {
+ qb_iatt_fixup (this, inode, pre);
+ qb_iatt_fixup (this, inode, post);
+ inode_unref (inode);
+ }
+
+ QB_STACK_UNWIND (setattr, frame, op_ret, op_errno, pre, post, xdata);
+
+ return 0;
+}
+
+
+int
+qb_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
+ int valid, dict_t *xdata)
+{
+ if (qb_inode_ctx_get (this, loc->inode))
+ frame->local = inode_ref (loc->inode);
+
+ STACK_WIND (frame, qb_setattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setattr, loc, buf, valid, xdata);
+ return 0;
+}
+
+
+int
+qb_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre, struct iatt *post,
+ dict_t *xdata)
+{
+ inode_t *inode = NULL;
+
+ inode = frame->local;
+ frame->local = NULL;
+
+ if (inode) {
+ qb_iatt_fixup (this, inode, pre);
+ qb_iatt_fixup (this, inode, post);
+ inode_unref (inode);
+ }
+
+ QB_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, pre, post, xdata);
+
+ return 0;
+}
+
+
+int
+qb_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,
+ int valid, dict_t *xdata)
+{
+ if (qb_inode_ctx_get (this, fd->inode))
+ frame->local = inode_ref (fd->inode);
+
+ STACK_WIND (frame, qb_setattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetattr, fd, buf, valid, xdata);
+ return 0;
+}
+
+
+int
+qb_forget (xlator_t *this, inode_t *inode)
+{
+ return qb_inode_cleanup (this, inode, 0);
+}
+
+
+int
+qb_release (xlator_t *this, fd_t *fd)
+{
+ call_frame_t *frame = NULL;
+
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not allocate frame. "
+ "Leaking QEMU BlockDriverState");
+ return -1;
+ }
+
+ if (qb_local_init (frame) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not allocate local. "
+ "Leaking QEMU BlockDriverState");
+ STACK_DESTROY (frame->root);
+ return -1;
+ }
+
+ if (qb_coroutine (frame, qb_co_close) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not allocate coroutine. "
+ "Leaking QEMU BlockDriverState");
+ qb_local_free (this, frame->local);
+ frame->local = NULL;
+ STACK_DESTROY (frame->root);
+ }
+
+ return 0;
+}
+
+int
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ ret = xlator_mem_acct_init (this, gf_qb_mt_end + 1);
+
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Memory accounting init "
+ "failed");
+ return ret;
+}
+
+
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+ return 0;
+}
+
+
+int
+init (xlator_t *this)
+{
+ qb_conf_t *conf = NULL;
+ int32_t ret = -1;
+ static int bdrv_inited = 0;
+
+ if (!this->children || this->children->next) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "FATAL: qemu-block (%s) not configured with exactly "
+ "one child", this->name);
+ goto out;
+ }
+
+ conf = GF_CALLOC (1, sizeof (*conf), gf_qb_mt_qb_conf_t);
+ if (!conf)
+ goto out;
+
+ /* configure 'option window-size <size>' */
+ GF_OPTION_INIT ("default-password", conf->default_password, str, out);
+
+ /* qemu coroutines use "co_mutex" for synchronizing among themselves.
+ However "co_mutex" itself is not threadsafe if the coroutine framework
+ is multithreaded (which usually is not). However synctasks are
+ fundamentally multithreaded, so for now create a syncenv which has
+ scaling limits set to max 1 thread so that the qemu coroutines can
+ execute "safely".
+
+ Future work: provide an implementation of "co_mutex" which is
+ threadsafe and use the global multithreaded ctx->env syncenv.
+ */
+ conf->env = syncenv_new (0, 1, 1);
+
+ this->private = conf;
+
+ ret = 0;
+
+ snprintf (conf->qb_xattr_key, QB_XATTR_KEY_MAX, QB_XATTR_KEY_FMT,
+ this->name);
+
+ cur_mon = (void *) 1;
+
+ if (!bdrv_inited) {
+ bdrv_init ();
+ bdrv_inited = 1;
+ }
+
+out:
+ if (ret)
+ GF_FREE (conf);
+
+ return ret;
+}
+
+
+void
+fini (xlator_t *this)
+{
+ qb_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ this->private = NULL;
+
+ if (conf->root_inode)
+ inode_unref(conf->root_inode);
+ GF_FREE (conf);
+
+ return;
+}
+
+
+struct xlator_fops fops = {
+ .lookup = qb_lookup,
+ .fsetxattr = qb_fsetxattr,
+ .setxattr = qb_setxattr,
+ .open = qb_open,
+ .writev = qb_writev,
+ .readv = qb_readv,
+ .fsync = qb_fsync,
+ .truncate = qb_truncate,
+ .ftruncate = qb_ftruncate,
+ .stat = qb_stat,
+ .fstat = qb_fstat,
+ .setattr = qb_setattr,
+ .fsetattr = qb_fsetattr,
+ .flush = qb_flush,
+/*
+ .getxattr = qb_getxattr,
+ .fgetxattr = qb_fgetxattr
+*/
+ .readdirp = qb_readdirp,
+};
+
+
+struct xlator_cbks cbks = {
+ .forget = qb_forget,
+ .release = qb_release,
+};
+
+
+struct xlator_dumpops dumpops = {
+};
+
+
+struct volume_options options[] = {
+ { .key = {"default-password"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "",
+ .description = "Default password for the AES encrypted block images."
+ },
+ { .key = {NULL} },
+};
diff --git a/xlators/features/qemu-block/src/qemu-block.h b/xlators/features/qemu-block/src/qemu-block.h
new file mode 100644
index 000000000..c95f2799a
--- /dev/null
+++ b/xlators/features/qemu-block/src/qemu-block.h
@@ -0,0 +1,109 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __QEMU_BLOCK_H
+#define __QEMU_BLOCK_H
+
+#include "syncop.h"
+#include "call-stub.h"
+#include "block/block_int.h"
+#include "monitor/monitor.h"
+
+/* QB_XATTR_KEY_FMT is the on-disk xattr stored in the inode which
+ indicates that the file must be "interpreted" by the block format
+ logic. The value of the key is of the pattern:
+
+ "format:virtual_size"
+
+ e.g
+
+ "qcow2:20GB" or "qed:100GB"
+
+ The format and virtual size are colon separated. The format is
+ a case sensitive string which qemu recognizes. virtual_size is
+ specified as a size which glusterfs recognizes as size (i.e.,
+ value accepted by gf_string2bytesize())
+*/
+#define QB_XATTR_KEY_FMT "trusted.glusterfs.%s.format"
+
+#define QB_XATTR_KEY_MAX 64
+
+#define QB_XATTR_VAL_MAX 64
+
+
+typedef struct qb_inode {
+ char fmt[QB_XATTR_VAL_MAX]; /* this is only the format, not "format:size" */
+ size_t size; /* virtual size in bytes */
+ BlockDriverState *bs;
+ int refcnt;
+ uuid_t backing_gfid;
+ char *backing_fname;
+} qb_inode_t;
+
+
+typedef struct qb_conf {
+ Monitor *mon;
+ struct syncenv *env;
+ char qb_xattr_key[QB_XATTR_KEY_MAX];
+ char *default_password;
+ inode_t *root_inode;
+} qb_conf_t;
+
+
+typedef struct qb_local {
+ call_frame_t *frame; /* backpointer */
+ call_stub_t *stub;
+ inode_t *inode;
+ fd_t *fd;
+ char fmt[QB_XATTR_VAL_MAX+1];
+ char name[256];
+ synctask_fn_t synctask_fn;
+ struct list_head list;
+} qb_local_t;
+
+void qb_local_free (xlator_t *this, qb_local_t *local);
+int qb_coroutine (call_frame_t *frame, synctask_fn_t fn);
+inode_t *qb_inode_from_filename (const char *filename);
+int qb_inode_to_filename (inode_t *inode, char *filename, int size);
+int qb_format_extract (xlator_t *this, char *format, inode_t *inode);
+
+qb_inode_t *qb_inode_ctx_get (xlator_t *this, inode_t *inode);
+
+#define QB_STACK_UNWIND(typ, frame, args ...) do { \
+ qb_local_t *__local = frame->local; \
+ xlator_t *__this = frame->this; \
+ \
+ frame->local = NULL; \
+ STACK_UNWIND_STRICT (typ, frame, args); \
+ if (__local) \
+ qb_local_free (__this, __local); \
+ } while (0)
+
+#define QB_STUB_UNWIND(stub, op_ret, op_errno) do { \
+ qb_local_t *__local = stub->frame->local; \
+ xlator_t *__this = stub->frame->this; \
+ \
+ stub->frame->local = NULL; \
+ call_unwind_error (stub, op_ret, op_errno); \
+ if (__local) \
+ qb_local_free (__this, __local); \
+ } while (0)
+
+#define QB_STUB_RESUME(stub_errno) do { \
+ qb_local_t *__local = stub->frame->local; \
+ xlator_t *__this = stub->frame->this; \
+ \
+ stub->frame->local = NULL; \
+ call_resume (stub); \
+ if (__local) \
+ qb_local_free (__this, __local); \
+ } while (0)
+
+#endif /* !__QEMU_BLOCK_H */
diff --git a/xlators/features/quiesce/src/Makefile.am b/xlators/features/quiesce/src/Makefile.am
index ee6e29b65..15e46629e 100644
--- a/xlators/features/quiesce/src/Makefile.am
+++ b/xlators/features/quiesce/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = quiesce.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
-quiesce_la_LDFLAGS = -module -avoidversion
+quiesce_la_LDFLAGS = -module -avoid-version
quiesce_la_SOURCES = quiesce.c
quiesce_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/features/quiesce/src/quiesce.c b/xlators/features/quiesce/src/quiesce.c
index 527a48b3e..24c7dc6ed 100644
--- a/xlators/features/quiesce/src/quiesce.c
+++ b/xlators/features/quiesce/src/quiesce.c
@@ -111,7 +111,7 @@ void
gf_quiesce_enqueue (xlator_t *this, call_stub_t *stub)
{
quiesce_priv_t *priv = NULL;
- struct timeval timeout = {0,};
+ struct timespec timeout = {0,};
priv = this->private;
if (!priv) {
@@ -129,7 +129,7 @@ gf_quiesce_enqueue (xlator_t *this, call_stub_t *stub)
if (!priv->timer) {
timeout.tv_sec = 20;
- timeout.tv_usec = 0;
+ timeout.tv_nsec = 0;
priv->timer = gf_timer_call_after (this->ctx,
timeout,
@@ -2492,7 +2492,7 @@ notify (xlator_t *this, int event, void *data, ...)
{
int ret = 0;
quiesce_priv_t *priv = NULL;
- struct timeval timeout = {0,};
+ struct timespec timeout = {0,};
priv = this->private;
if (!priv)
@@ -2525,7 +2525,7 @@ notify (xlator_t *this, int event, void *data, ...)
if (priv->timer)
break;
timeout.tv_sec = 20;
- timeout.tv_usec = 0;
+ timeout.tv_nsec = 0;
priv->timer = gf_timer_call_after (this->ctx,
timeout,
@@ -2599,12 +2599,10 @@ struct xlator_fops fops = {
};
-struct xlator_dumpops dumpops = {
-};
+struct xlator_dumpops dumpops;
-struct xlator_cbks cbks = {
-};
+struct xlator_cbks cbks;
struct volume_options options[] = {
diff --git a/xlators/features/quota/src/Makefile.am b/xlators/features/quota/src/Makefile.am
index 6c97e7591..7165adc59 100644
--- a/xlators/features/quota/src/Makefile.am
+++ b/xlators/features/quota/src/Makefile.am
@@ -1,17 +1,22 @@
-xlator_LTLIBRARIES = quota.la
+xlator_LTLIBRARIES = quota.la quotad.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
-quota_la_LDFLAGS = -module -avoidversion
+quota_la_LDFLAGS = -module -avoid-version
+quotad_la_LDFLAGS = -module -avoid-version
-quota_la_SOURCES = quota.c
+quota_la_SOURCES = quota.c quota-enforcer-client.c
quota_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-noinst_HEADERS = quota-mem-types.h quota.h
+quotad_la_SOURCES = quotad.c quotad-helpers.c quotad-aggregator.c
+quotad_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = quota-mem-types.h quota.h quotad-aggregator.h quotad-helpers.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
- -I$(top_srcdir)/xlators/cluster/dht/src
+ -I$(top_srcdir)/xlators/cluster/dht/src -I$(top_srcdir)/rpc/xdr/src/ \
+ -I$(top_srcdir)/rpc/rpc-lib/src
AM_CFLAGS = -Wall $(GF_CFLAGS)
-CLEANFILES =
+CLEANFILES =
diff --git a/xlators/features/quota/src/quota-enforcer-client.c b/xlators/features/quota/src/quota-enforcer-client.c
new file mode 100644
index 000000000..7d8ab937d
--- /dev/null
+++ b/xlators/features/quota/src/quota-enforcer-client.c
@@ -0,0 +1,403 @@
+/*
+ Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include <stdio.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/resource.h>
+#include <sys/file.h>
+#include <netdb.h>
+#include <signal.h>
+#include <libgen.h>
+
+#include <sys/utsname.h>
+
+#include <stdint.h>
+#include <pthread.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <semaphore.h>
+#include <errno.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
+#ifdef HAVE_MALLOC_STATS
+#ifdef DEBUG
+#include <mcheck.h>
+#endif
+#endif
+
+#include "quota.h"
+
+extern struct rpc_clnt_program quota_enforcer_clnt;
+
+int32_t
+quota_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent);
+
+int
+quota_enforcer_submit_request (void *req, call_frame_t *frame,
+ rpc_clnt_prog_t *prog,
+ int procnum, struct iobref *iobref,
+ xlator_t *this, fop_cbk_fn_t cbkfn,
+ xdrproc_t xdrproc)
+{
+ int ret = -1;
+ int count = 0;
+ struct iovec iov = {0, };
+ struct iobuf *iobuf = NULL;
+ char new_iobref = 0;
+ ssize_t xdr_size = 0;
+ quota_priv_t *priv = NULL;
+
+ GF_ASSERT (this);
+
+ priv = this->private;
+
+ if (req) {
+ xdr_size = xdr_sizeof (xdrproc, req);
+ iobuf = iobuf_get2 (this->ctx->iobuf_pool, xdr_size);
+ if (!iobuf) {
+ goto out;
+ }
+
+ if (!iobref) {
+ iobref = iobref_new ();
+ if (!iobref) {
+ goto out;
+ }
+
+ new_iobref = 1;
+ }
+
+ iobref_add (iobref, iobuf);
+
+ iov.iov_base = iobuf->ptr;
+ iov.iov_len = iobuf_size (iobuf);
+
+ /* Create the xdr payload */
+ ret = xdr_serialize_generic (iov, req, xdrproc);
+ if (ret == -1) {
+ goto out;
+ }
+ iov.iov_len = ret;
+ count = 1;
+ }
+
+ /* Send the msg */
+ ret = rpc_clnt_submit (priv->rpc_clnt, prog, procnum, cbkfn,
+ &iov, count,
+ NULL, 0, iobref, frame, NULL, 0, NULL, 0, NULL);
+ ret = 0;
+
+out:
+ if (new_iobref)
+ iobref_unref (iobref);
+ if (iobuf)
+ iobuf_unref (iobuf);
+
+ return ret;
+}
+
+int
+quota_enforcer_lookup_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ quota_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ int ret = 0;
+ gfs3_lookup_rsp rsp = {0,};
+ struct iatt stbuf = {0,};
+ struct iatt postparent = {0,};
+ int op_errno = EINVAL;
+ dict_t *xdata = NULL;
+ inode_t *inode = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+
+ frame = myframe;
+ local = frame->local;
+ inode = local->validate_loc.inode;
+
+ if (-1 == req->rpc_status) {
+ rsp.op_ret = -1;
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gfs3_lookup_rsp);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed");
+ rsp.op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ op_errno = gf_error_to_errno (rsp.op_errno);
+ gf_stat_to_iatt (&rsp.postparent, &postparent);
+
+ if (rsp.op_ret == -1)
+ goto out;
+
+ rsp.op_ret = -1;
+ gf_stat_to_iatt (&rsp.stat, &stbuf);
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->this, xdata, (rsp.xdata.xdata_val),
+ (rsp.xdata.xdata_len), rsp.op_ret,
+ op_errno, out);
+
+ if ((!uuid_is_null (inode->gfid))
+ && (uuid_compare (stbuf.ia_gfid, inode->gfid) != 0)) {
+ gf_log (frame->this->name, GF_LOG_DEBUG,
+ "gfid changed for %s", local->validate_loc.path);
+ rsp.op_ret = -1;
+ op_errno = ESTALE;
+ goto out;
+ }
+
+ rsp.op_ret = 0;
+
+out:
+ rsp.op_errno = op_errno;
+ if (rsp.op_ret == -1) {
+ /* any error other than ENOENT */
+ if (rsp.op_errno != ENOENT)
+ gf_log (this->name, GF_LOG_WARNING,
+ "remote operation failed: %s. Path: %s (%s)",
+ strerror (rsp.op_errno),
+ local->validate_loc.path,
+ loc_gfid_utoa (&local->validate_loc));
+ else
+ gf_log (this->name, GF_LOG_TRACE,
+ "not found on remote node");
+
+ }
+
+ local->validate_cbk (frame, NULL, this, rsp.op_ret, rsp.op_errno, inode,
+ &stbuf, xdata, &postparent);
+
+ if (xdata)
+ dict_unref (xdata);
+
+ free (rsp.xdata.xdata_val);
+
+ return 0;
+}
+
+int
+quota_enforcer_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata, fop_lookup_cbk_t validate_cbk)
+{
+ quota_local_t *local = NULL;
+ gfs3_lookup_req req = {{0,},};
+ int ret = 0;
+ int op_errno = ESTALE;
+ quota_priv_t *priv = NULL;
+
+ if (!frame || !this || !loc)
+ goto unwind;
+
+ local = frame->local;
+ local->validate_cbk = validate_cbk;
+
+ priv = this->private;
+
+ if (!(loc && loc->inode))
+ goto unwind;
+
+ if (!uuid_is_null (loc->inode->gfid))
+ memcpy (req.gfid, loc->inode->gfid, 16);
+ else
+ memcpy (req.gfid, loc->gfid, 16);
+
+ if (xdata) {
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata,
+ (&req.xdata.xdata_val),
+ req.xdata.xdata_len,
+ op_errno, unwind);
+ }
+
+ if (loc->name)
+ req.bname = (char *)loc->name;
+ else
+ req.bname = "";
+
+ ret = quota_enforcer_submit_request (&req, frame,
+ priv->quota_enforcer,
+ GF_AGGREGATOR_LOOKUP,
+ NULL, this,
+ quota_enforcer_lookup_cbk,
+ (xdrproc_t)xdr_gfs3_lookup_req);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "failed to send the fop");
+ }
+
+ GF_FREE (req.xdata.xdata_val);
+
+ return 0;
+
+unwind:
+ validate_cbk (frame, NULL, this, -1, op_errno, NULL, NULL, NULL, NULL);
+
+ GF_FREE (req.xdata.xdata_val);
+
+ return 0;
+}
+
+int
+quota_enforcer_notify (struct rpc_clnt *rpc, void *mydata,
+ rpc_clnt_event_t event, void *data)
+{
+ xlator_t *this = NULL;
+ int ret = 0;
+
+ this = mydata;
+
+ switch (event) {
+ case RPC_CLNT_CONNECT:
+ {
+ gf_log (this->name, GF_LOG_TRACE, "got RPC_CLNT_CONNECT");
+ break;
+ }
+
+ case RPC_CLNT_DISCONNECT:
+ {
+ gf_log (this->name, GF_LOG_TRACE, "got RPC_CLNT_DISCONNECT");
+ break;
+ }
+
+ default:
+ gf_log (this->name, GF_LOG_TRACE,
+ "got some other RPC event %d", event);
+ ret = 0;
+ break;
+ }
+
+ return ret;
+}
+
+int
+quota_enforcer_blocking_connect (rpc_clnt_t *rpc)
+{
+ dict_t *options = NULL;
+ int ret = -1;
+
+ options = dict_new ();
+ if (options == NULL)
+ goto out;
+
+ ret = dict_set_str (options, "non-blocking-io", "no");
+ if (ret)
+ goto out;
+
+ rpc->conn.trans->reconfigure (rpc->conn.trans, options);
+
+ rpc_clnt_start (rpc);
+
+ ret = dict_set_str (options, "non-blocking-io", "yes");
+ if (ret)
+ goto out;
+
+ rpc->conn.trans->reconfigure (rpc->conn.trans, options);
+
+ ret = 0;
+out:
+ dict_unref (options);
+
+ return ret;
+}
+
+//Returns a started rpc_clnt. Creates a new rpc_clnt if quota_priv doesn't have
+//one already
+struct rpc_clnt *
+quota_enforcer_init (xlator_t *this, dict_t *options)
+{
+ struct rpc_clnt *rpc = NULL;
+ quota_priv_t *priv = NULL;
+ int ret = -1;
+
+ priv = this->private;
+ if (priv->rpc_clnt) {
+ gf_log (this->name, GF_LOG_TRACE, "quota enforcer clnt already "
+ "inited");
+ //Turns out to be a NOP if the clnt is already connected.
+ ret = quota_enforcer_blocking_connect (priv->rpc_clnt);
+ if (ret)
+ goto out;
+
+ return priv->rpc_clnt;
+ }
+
+ priv->quota_enforcer = &quota_enforcer_clnt;
+
+ ret = dict_set_str (options, "transport.address-family", "unix");
+ if (ret)
+ goto out;
+
+ ret = dict_set_str (options, "transport-type", "socket");
+ if (ret)
+ goto out;
+
+ ret = dict_set_str (options, "transport.socket.connect-path",
+ "/tmp/quotad.socket");
+ if (ret)
+ goto out;
+
+ rpc = rpc_clnt_new (options, this->ctx, this->name, 16);
+ if (!rpc) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = rpc_clnt_register_notify (rpc, quota_enforcer_notify, this);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR, "failed to register notify");
+ goto out;
+ }
+
+ ret = quota_enforcer_blocking_connect (rpc);
+ if (ret)
+ goto out;
+
+ ret = 0;
+out:
+ if (ret) {
+ if (rpc)
+ rpc_clnt_unref (rpc);
+ rpc = NULL;
+ }
+
+ return rpc;
+}
+
+struct rpc_clnt_procedure quota_enforcer_actors[GF_AGGREGATOR_MAXVALUE] = {
+ [GF_AGGREGATOR_NULL] = {"NULL", NULL},
+ [GF_AGGREGATOR_LOOKUP] = {"LOOKUP", NULL},
+};
+
+struct rpc_clnt_program quota_enforcer_clnt = {
+ .progname = "Quota enforcer",
+ .prognum = GLUSTER_AGGREGATOR_PROGRAM,
+ .progver = GLUSTER_AGGREGATOR_VERSION,
+ .numproc = GF_AGGREGATOR_MAXVALUE,
+ .proctable = quota_enforcer_actors,
+};
diff --git a/xlators/features/quota/src/quota-mem-types.h b/xlators/features/quota/src/quota-mem-types.h
index 3082865da..97d916568 100644
--- a/xlators/features/quota/src/quota-mem-types.h
+++ b/xlators/features/quota/src/quota-mem-types.h
@@ -21,6 +21,9 @@ enum gf_quota_mem_types_ {
gf_quota_mt_int32_t,
gf_quota_mt_limits_t,
gf_quota_mt_quota_dentry_t,
+ gf_quota_mt_quota_limits_level_t,
+ gf_quota_mt_qd_vols_conf_t,
+ gf_quota_mt_aggregator_state_t,
gf_quota_mt_end
};
#endif
diff --git a/xlators/features/quota/src/quota.c b/xlators/features/quota/src/quota.c
index b60e37d9c..2ca4da0c1 100644
--- a/xlators/features/quota/src/quota.c
+++ b/xlators/features/quota/src/quota.c
@@ -12,44 +12,100 @@
#include "quota.h"
#include "common-utils.h"
#include "defaults.h"
+#include "statedump.h"
int32_t
quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this,
char *name, uuid_t par);
+
+int
+quota_fill_inodectx (xlator_t *this, inode_t *inode, dict_t *dict,
+ loc_t *loc, struct iatt *buf, int32_t *op_errno);
+
struct volume_options options[];
+static int32_t
+__quota_init_inode_ctx (inode_t *inode, xlator_t *this,
+ quota_inode_ctx_t **context)
+{
+ int32_t ret = -1;
+ quota_inode_ctx_t *ctx = NULL;
+
+ if (inode == NULL) {
+ goto out;
+ }
+
+ QUOTA_ALLOC_OR_GOTO (ctx, quota_inode_ctx_t, out);
+
+ LOCK_INIT(&ctx->lock);
+
+ if (context != NULL) {
+ *context = ctx;
+ }
+
+ INIT_LIST_HEAD (&ctx->parents);
+
+ ret = __inode_ctx_put (inode, this, (uint64_t )(long)ctx);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "cannot set quota context in inode (gfid:%s)",
+ uuid_utoa (inode->gfid));
+ }
+out:
+ return ret;
+}
+
+
+static int32_t
+quota_inode_ctx_get (inode_t *inode, xlator_t *this,
+ quota_inode_ctx_t **ctx, char create_if_absent)
+{
+ int32_t ret = 0;
+ uint64_t ctx_int;
+
+ LOCK (&inode->lock);
+ {
+ ret = __inode_ctx_get (inode, this, &ctx_int);
+
+ if ((ret == 0) && (ctx != NULL)) {
+ *ctx = (quota_inode_ctx_t *) (unsigned long)ctx_int;
+ } else if (create_if_absent) {
+ ret = __quota_init_inode_ctx (inode, this, ctx);
+ }
+ }
+ UNLOCK (&inode->lock);
+
+ return ret;
+}
+
int
quota_loc_fill (loc_t *loc, inode_t *inode, inode_t *parent, char *path)
{
int ret = -1;
- if (!loc) {
+ if (!loc || (inode == NULL))
return ret;
- }
if (inode) {
loc->inode = inode_ref (inode);
+ uuid_copy (loc->gfid, inode->gfid);
}
if (parent) {
loc->parent = inode_ref (parent);
}
- loc->path = gf_strdup (path);
- if (!loc->path) {
- goto loc_wipe;
- }
+ if (path != NULL) {
+ loc->path = gf_strdup (path);
- loc->name = strrchr (loc->path, '/');
- if (loc->name) {
- loc->name++;
- } else {
- goto loc_wipe;
+ loc->name = strrchr (loc->path, '/');
+ if (loc->name) {
+ loc->name++;
+ }
}
ret = 0;
-loc_wipe:
if (ret < 0) {
loc_wipe (loc);
}
@@ -82,7 +138,6 @@ quota_inode_loc_fill (inode_t *inode, loc_t *loc)
gf_log (this->name, GF_LOG_DEBUG,
"cannot find parent for inode (gfid:%s)",
uuid_utoa (inode->gfid));
- goto err;
}
ignore_parent:
@@ -91,7 +146,6 @@ ignore_parent:
gf_log (this->name, GF_LOG_DEBUG,
"cannot construct path for inode (gfid:%s)",
uuid_utoa (inode->gfid));
- goto err;
}
ret = quota_loc_fill (loc, inode, parent, resolvedpath);
@@ -137,8 +191,13 @@ quota_local_new ()
{
quota_local_t *local = NULL;
local = mem_get0 (THIS->local_pool);
- if (local)
- LOCK_INIT (&local->lock);
+ if (local == NULL)
+ goto out;
+
+ LOCK_INIT (&local->lock);
+ local->space_available = -1;
+
+out:
return local;
}
@@ -156,12 +215,15 @@ __quota_dentry_new (quota_inode_ctx_t *ctx, char *name, uuid_t par)
dentry->name = gf_strdup (name);
if (dentry->name == NULL) {
GF_FREE (dentry);
+ dentry = NULL;
goto err;
}
uuid_copy (dentry->par, par);
- list_add_tail (&dentry->next, &ctx->parents);
+ if (ctx != NULL)
+ list_add_tail (&dentry->next, &ctx->parents);
+
err:
return dentry;
}
@@ -182,19 +244,64 @@ out:
return;
}
+static inline void
+quota_link_count_decrement (quota_local_t *local)
+{
+ call_stub_t *stub = NULL;
+ int link_count = -1;
+
+ if (local == NULL)
+ goto out;
+
+ LOCK (&local->lock);
+ {
+ link_count = --local->link_count;
+ if (link_count == 0) {
+ stub = local->stub;
+ local->stub = NULL;
+ }
+ }
+ UNLOCK (&local->lock);
+
+ if (stub != NULL) {
+ call_resume (stub);
+ }
+out:
+ return;
+}
+
+static inline void
+quota_handle_validate_error (quota_local_t *local, int32_t op_ret,
+ int32_t op_errno)
+{
+ if (local == NULL)
+ goto out;
+
+ LOCK (&local->lock);
+ {
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ }
+ }
+ UNLOCK (&local->lock);
+
+ /* we abort checking limits on this path to root */
+ quota_link_count_decrement (local);
+out:
+ return;
+}
int32_t
quota_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict,
- dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
{
- quota_local_t *local = NULL;
- uint32_t validate_count = 0, link_count = 0;
- int32_t ret = 0;
- quota_inode_ctx_t *ctx = NULL;
- int64_t *size = 0;
- uint64_t value = 0;
- call_stub_t *stub = NULL;
+ quota_local_t *local = NULL;
+ int32_t ret = 0;
+ quota_inode_ctx_t *ctx = NULL;
+ int64_t *size = 0;
+ uint64_t value = 0;
local = frame->local;
@@ -206,7 +313,7 @@ quota_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
GF_ASSERT (frame);
GF_VALIDATE_OR_GOTO_WITH_ERROR ("quota", this, unwind, op_errno,
EINVAL);
- GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, dict, unwind, op_errno,
+ GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, xdata, unwind, op_errno,
EINVAL);
ret = inode_ctx_get (local->validate_loc.inode, this, &value);
@@ -220,7 +327,7 @@ quota_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto unwind;
}
- ret = dict_get_bin (dict, QUOTA_SIZE_KEY, (void **) &size);
+ ret = dict_get_bin (xdata, QUOTA_SIZE_KEY, (void **) &size);
if (ret < 0) {
gf_log (this->name, GF_LOG_WARNING,
"size key not present in dict");
@@ -243,25 +350,7 @@ quota_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
unwind:
- LOCK (&local->lock);
- {
- local->op_ret = -1;
- local->op_errno = op_errno;
-
- validate_count = --local->validate_count;
- link_count = local->link_count;
-
- if ((validate_count == 0) && (link_count == 0)) {
- stub = local->stub;
- local->stub = NULL;
- }
- }
- UNLOCK (&local->lock);
-
- if (stub != NULL) {
- call_resume (stub);
- }
-
+ quota_handle_validate_error (local, op_ret, op_errno);
return 0;
}
@@ -288,34 +377,425 @@ quota_timeout (struct timeval *tv, int32_t timeout)
return timed_out;
}
+static inline void
+quota_add_parent (quota_dentry_t *dentry, struct list_head *list)
+{
+ quota_dentry_t *entry = NULL;
+ gf_boolean_t found = _gf_false;
+
+ if ((dentry == NULL) || (list == NULL)) {
+ goto out;
+ }
+
+ list_for_each_entry (entry, list, next) {
+ if (uuid_compare (dentry->par, entry->par) == 0) {
+ found = _gf_true;
+ goto out;
+ }
+ }
+
+ list_add_tail (&dentry->next, list);
+
+out:
+ return;
+}
int32_t
-quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this,
- char *name, uuid_t par)
+quota_build_ancestry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ gf_dirent_t *entries, dict_t *xdata)
{
- int32_t ret = -1;
- inode_t *_inode = NULL, *parent = NULL;
- quota_inode_ctx_t *ctx = NULL;
- quota_priv_t *priv = NULL;
- quota_local_t *local = NULL;
- char need_validate = 0, need_unwind = 0;
- int64_t delta = 0;
- call_stub_t *stub = NULL;
- int32_t validate_count = 0, link_count = 0;
- uint64_t value = 0;
- char just_validated = 0;
- uuid_t trav_uuid = {0,};
-
- GF_VALIDATE_OR_GOTO ("quota", this, out);
- GF_VALIDATE_OR_GOTO (this->name, frame, out);
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ inode_t *parent = NULL, *tmp_parent = NULL;
+ gf_dirent_t *entry = NULL;
+ loc_t loc = {0, };
+ quota_dentry_t *dentry = NULL, *tmp = NULL;
+ quota_inode_ctx_t *ctx = NULL;
+ struct list_head parents = {0, };
+ quota_local_t *local = NULL;
+
+ INIT_LIST_HEAD (&parents);
local = frame->local;
- GF_VALIDATE_OR_GOTO (this->name, local, out);
+ frame->local = NULL;
+
+ if (op_ret < 0)
+ goto err;
+
+ parent = inode_parent (local->loc.inode, 0, NULL);
+ if (parent == NULL) {
+ gf_log (this->name, GF_LOG_WARNING, "parent is NULL");
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if ((op_ret > 0) && (entries != NULL)) {
+ list_for_each_entry (entry, &entries->list, list) {
+ if (__is_root_gfid (entry->inode->gfid)) {
+ /* The list contains a sub-list for each
+ * possible path to the target inode. Each
+ * sub-list starts with the root entry of the
+ * tree and is followed by the child entries
+ * for a particular path to the target entry.
+ * The root entry is an implied sub-list
+ * delimiter, as it denotes we have started
+ * processing a new path. Reset the parent
+ * pointer and continue
+ */
+
+ tmp_parent = NULL;
+ }
+
+ uuid_copy (loc.gfid, entry->d_stat.ia_gfid);
+
+ loc.inode = inode_ref (entry->inode);
+ loc.parent = inode_ref (tmp_parent);
+ loc.name = entry->d_name;
+
+ quota_fill_inodectx (this, entry->inode, entry->dict,
+ &loc, &entry->d_stat, &op_errno);
+
+ tmp_parent = entry->inode;
+
+ loc_wipe (&loc);
+ }
+ }
+
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
+
+ if (ctx != NULL) {
+ LOCK (&ctx->lock);
+ {
+ list_for_each_entry (dentry, &ctx->parents, next) {
+ /* we built ancestry for a non-directory */
+ tmp = __quota_dentry_new (NULL, dentry->name,
+ dentry->par);
+ quota_add_parent (tmp, &parents);
+
+ if (list_empty (&tmp->next)) {
+ __quota_dentry_free (tmp);
+ tmp = NULL;
+ }
+ }
+ }
+ UNLOCK (&ctx->lock);
+ }
+
+ if (list_empty (&parents)) {
+ /* we built ancestry for a directory */
+ list_for_each_entry (entry, &entries->list, list) {
+ if (entry->inode == local->loc.inode)
+ break;
+ }
+
+ GF_ASSERT (&entry->list != &entries->list);
+
+ tmp = __quota_dentry_new (NULL, entry->d_name, parent->gfid);
+ quota_add_parent (tmp, &parents);
+ }
+
+ local->ancestry_cbk (&parents, local->loc.inode, 0, 0,
+ local->ancestry_data);
+ goto cleanup;
+
+err:
+ local->ancestry_cbk (NULL, NULL, -1, op_errno, local->ancestry_data);
+
+cleanup:
+ STACK_DESTROY (frame->root);
+ quota_local_cleanup (this, local);
+
+ if (parent != NULL) {
+ inode_unref (parent);
+ parent = NULL;
+ }
+
+ list_for_each_entry_safe (dentry, tmp, &parents, next) {
+ __quota_dentry_free (dentry);
+ }
+
+ return 0;
+}
+
+int32_t
+quota_build_ancestry_open_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ fd_t *fd, dict_t *xdata)
+{
+ dict_t *xdata_req = NULL;
+ quota_local_t *local = NULL;
+
+ if (op_ret < 0) {
+ goto err;
+ }
+
+ xdata_req = dict_new ();
+ if (xdata_req == NULL) {
+ op_ret = -ENOMEM;
+ goto err;
+ }
+
+ op_ret = dict_set_int8 (xdata_req, QUOTA_LIMIT_KEY, 1);
+ if (op_ret < 0) {
+ op_errno = -op_ret;
+ goto err;
+ }
+
+ op_ret = dict_set_int8 (xdata_req, GET_ANCESTRY_DENTRY_KEY, 1);
+ if (op_ret < 0) {
+ op_errno = -op_ret;
+ goto err;
+ }
+
+ /* This would ask posix layer to construct dentry chain till root */
+ STACK_WIND (frame, quota_build_ancestry_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd, 0, 0, xdata_req);
+
+ op_ret = 0;
+
+err:
+ fd_unref (fd);
+
+ dict_unref (xdata_req);
+
+ if (op_ret < 0) {
+ local = frame->local;
+ frame->local = NULL;
+
+ local->ancestry_cbk (NULL, NULL, -1, op_errno,
+ local->ancestry_data);
+ quota_local_cleanup (this, local);
+ STACK_DESTROY (frame->root);
+ }
+
+ return 0;
+}
+
+int
+quota_build_ancestry (inode_t *inode, quota_ancestry_built_t ancestry_cbk,
+ void *data)
+{
+ loc_t loc = {0, };
+ fd_t *fd = NULL;
+ quota_local_t *local = NULL;
+ call_frame_t *new_frame = NULL;
+ int op_errno = EINVAL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
+
+ fd = fd_create (inode, 0);
+
+ new_frame = create_frame (this, this->ctx->pool);
+ if (new_frame == NULL) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ new_frame->root->uid = new_frame->root->gid = 0;
+
+ local = quota_local_new ();
+ if (local == NULL) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ new_frame->local = local;
+ local->ancestry_cbk = ancestry_cbk;
+ local->ancestry_data = data;
+ local->loc.inode = inode_ref (inode);
+
+ if (IA_ISDIR (inode->ia_type)) {
+ STACK_WIND (new_frame, quota_build_ancestry_open_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->opendir, &loc, fd,
+ NULL);
+ } else {
+ STACK_WIND (new_frame, quota_build_ancestry_open_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open, &loc, 0, fd,
+ NULL);
+ }
+
+ loc_wipe (&loc);
+ return 0;
+
+err:
+ ancestry_cbk (NULL, NULL, -1, op_errno, data);
+
+ fd_unref (fd);
+
+ local = new_frame->local;
+ new_frame->local = NULL;
+
+ if (local != NULL) {
+ quota_local_cleanup (this, local);
+ }
+
+ if (new_frame != NULL) {
+ STACK_DESTROY (new_frame->root);
+ }
+
+ loc_wipe (&loc);
+ return 0;
+}
+
+int
+quota_validate (call_frame_t *frame, inode_t *inode, xlator_t *this,
+ fop_lookup_cbk_t cbk_fn)
+{
+ quota_local_t *local = NULL;
+ int ret = 0;
+ dict_t *xdata = NULL;
+ quota_priv_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ LOCK (&local->lock);
+ {
+ loc_wipe (&local->validate_loc);
+
+ ret = quota_inode_loc_fill (inode, &local->validate_loc);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "cannot fill loc for inode (gfid:%s), hence "
+ "aborting quota-checks and continuing with fop",
+ uuid_utoa (inode->gfid));
+ }
+ }
+ UNLOCK (&local->lock);
+
+ if (ret < 0) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ xdata = dict_new ();
+ if (xdata == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = dict_set_int8 (xdata, QUOTA_SIZE_KEY, 1);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING, "dict set failed");
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = dict_set_str (xdata, "volume-uuid", priv->volume_uuid);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING, "dict set failed");
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = quota_enforcer_lookup (frame, this, &local->validate_loc, xdata,
+ cbk_fn);
+ if (ret < 0) {
+ ret = -ENOTCONN;
+ goto err;
+ }
+
+ ret = 0;
+err:
+ return ret;
+}
+
+void
+quota_check_limit_continuation (struct list_head *parents, inode_t *inode,
+ int32_t op_ret, int32_t op_errno, void *data)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ quota_local_t *local = NULL;
+ quota_dentry_t *entry = NULL;
+ inode_t *parent = NULL;
+ int parent_count = 0;
+
+ frame = data;
+ local = frame->local;
+ this = THIS;
+
+ if ((op_ret < 0) || list_empty (parents)) {
+ if (op_ret >= 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Couldn't build ancestry for inode (gfid:%s). "
+ "Without knowing ancestors till root, quota "
+ "cannot be enforced. "
+ "Hence, failing fop with EIO",
+ uuid_utoa (inode->gfid));
+ op_errno = EIO;
+ }
+
+ quota_handle_validate_error (local, -1, op_errno);
+ goto out;
+ }
+
+ list_for_each_entry (entry, parents, next) {
+ parent_count++;
+ }
+
+ LOCK (&local->lock);
+ {
+ local->link_count += (parent_count - 1);
+ }
+ UNLOCK (&local->lock);
+
+ list_for_each_entry (entry, parents, next) {
+ parent = inode_find (inode->table, entry->par);
+
+ quota_check_limit (frame, parent, this, NULL, NULL);
+ }
+
+out:
+ return;
+}
+
+int32_t
+quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this,
+ char *name, uuid_t par)
+{
+ int32_t ret = -1, op_errno = EINVAL;
+ inode_t *_inode = NULL, *parent = NULL;
+ quota_inode_ctx_t *ctx = NULL;
+ quota_priv_t *priv = NULL;
+ quota_local_t *local = NULL;
+ char need_validate = 0;
+ gf_boolean_t hard_limit_exceeded = 0;
+ int64_t delta = 0, wouldbe_size = 0;
+ int64_t space_available = 0;
+ uint64_t value = 0;
+ char just_validated = 0;
+ uuid_t trav_uuid = {0,};
+ uint32_t timeout = 0;
+
+ GF_VALIDATE_OR_GOTO ("quota", this, err);
+ GF_VALIDATE_OR_GOTO (this->name, frame, err);
+ GF_VALIDATE_OR_GOTO (this->name, inode, err);
+
+ local = frame->local;
+ GF_VALIDATE_OR_GOTO (this->name, local, err);
delta = local->delta;
- GF_VALIDATE_OR_GOTO (this->name, local->stub, out);
+ GF_VALIDATE_OR_GOTO (this->name, local->stub, err);
+ /* Allow all the trusted clients
+ * Don't block the gluster internal processes like rebalance, gsyncd,
+ * self heal etc from the disk quotas.
+ *
+ * Method: Allow all the clients with PID negative. This is by the
+ * assumption that any kernel assigned pid doesn't have the negative
+ * number.
+ */
+ if (0 > frame->root->pid) {
+ ret = 0;
+ quota_link_count_decrement (local);
+ goto done;
+ }
priv = this->private;
@@ -328,10 +808,6 @@ quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this,
{
just_validated = local->just_validated;
local->just_validated = 0;
-
- if (just_validated) {
- local->validate_count--;
- }
}
UNLOCK (&local->lock);
@@ -340,34 +816,68 @@ quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this,
}
do {
- if (ctx != NULL) {
+ if (ctx != NULL && (ctx->hard_lim > 0 || ctx->soft_lim > 0)) {
+ wouldbe_size = ctx->size + delta;
+
LOCK (&ctx->lock);
{
- if (ctx->limit >= 0) {
- if (!just_validated
- && quota_timeout (&ctx->tv,
- priv->timeout)) {
- need_validate = 1;
- } else if ((ctx->size + delta)
- >= ctx->limit) {
- local->op_ret = -1;
- local->op_errno = EDQUOT;
- need_unwind = 1;
- }
+ timeout = priv->soft_timeout;
+
+ if ((ctx->soft_lim >= 0)
+ && (wouldbe_size > ctx->soft_lim)) {
+ timeout = priv->hard_timeout;
+ }
+
+ if (!just_validated
+ && quota_timeout (&ctx->tv, timeout)) {
+ need_validate = 1;
+ } else if (wouldbe_size >= ctx->hard_lim) {
+ hard_limit_exceeded = 1;
}
}
UNLOCK (&ctx->lock);
if (need_validate) {
- goto validate;
- }
+ ret = quota_validate (frame, _inode, this,
+ quota_validate_cbk);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto err;
+ }
- if (need_unwind) {
break;
}
+
+ if (hard_limit_exceeded) {
+ local->op_ret = -1;
+ local->op_errno = EDQUOT;
+
+ space_available = ctx->hard_lim - ctx->size;
+
+ if (space_available < 0)
+ space_available = 0;
+
+ if ((local->space_available < 0)
+ || (local->space_available
+ > space_available)){
+ local->space_available
+ = space_available;
+
+ }
+
+ if (space_available == 0) {
+ op_errno = EDQUOT;
+ goto err;
+ }
+ }
+
+ /* We log usage only if quota limit is configured on
+ that inode. */
+ quota_log_usage (this, ctx, _inode, delta);
}
if (__is_root_gfid (_inode->gfid)) {
+ quota_link_count_decrement (local);
break;
}
@@ -379,10 +889,15 @@ quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this,
}
if (parent == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "cannot find parent for inode (gfid:%s), hence "
- "aborting enforcing quota-limits and continuing"
- " with the fop", uuid_utoa (_inode->gfid));
+ ret = quota_build_ancestry (_inode,
+ quota_check_limit_continuation,
+ frame);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto err;
+ }
+
+ break;
}
inode_unref (_inode);
@@ -398,253 +913,105 @@ quota_check_limit (call_frame_t *frame, inode_t *inode, xlator_t *this,
ctx = (quota_inode_ctx_t *)(unsigned long)value;
} while (1);
- ret = 0;
-
if (_inode != NULL) {
inode_unref (_inode);
+ _inode = NULL;
}
- LOCK (&local->lock);
- {
- validate_count = local->validate_count;
- link_count = local->link_count;
- if ((validate_count == 0) && (link_count == 0)) {
- stub = local->stub;
- local->stub = NULL;
- }
- }
- UNLOCK (&local->lock);
-
- if (stub != NULL) {
- call_resume (stub);
- }
-
-out:
- return ret;
-
-validate:
- LOCK (&local->lock);
- {
- loc_wipe (&local->validate_loc);
-
- if (just_validated) {
- local->validate_count--;
- }
-
- local->validate_count++;
- ret = quota_inode_loc_fill (_inode, &local->validate_loc);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "cannot fill loc for inode (gfid:%s), hence "
- "aborting quota-checks and continuing with fop",
- uuid_utoa (_inode->gfid));
- local->validate_count--;
- }
- }
- UNLOCK (&local->lock);
-
- if (ret < 0) {
- goto loc_fill_failed;
- }
+done:
+ return 0;
- STACK_WIND (frame, quota_validate_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->getxattr, &local->validate_loc,
- QUOTA_SIZE_KEY, NULL);
+err:
+ quota_handle_validate_error (local, -1, op_errno);
-loc_fill_failed:
inode_unref (_inode);
return 0;
}
-
-int32_t
-quota_get_limit_value (inode_t *inode, xlator_t *this, int64_t *n)
+static inline int
+quota_get_limits (xlator_t *this, dict_t *dict, int64_t *hard_lim,
+ int64_t *soft_lim)
{
- int32_t ret = 0;
- char *path = NULL;
- limits_t *limit_node = NULL;
- quota_priv_t *priv = NULL;
+ quota_limit_t *limit = NULL;
+ quota_priv_t *priv = NULL;
+ int64_t soft_lim_percent = 0, *ptr = NULL;
+ int ret = 0;
- if (inode == NULL || n == NULL) {
- ret = -1;
+ if ((this == NULL) || (dict == NULL) || (hard_lim == NULL)
+ || (soft_lim == NULL))
goto out;
- }
-
- *n = 0;
-
- ret = inode_path (inode, NULL, &path);
- if (ret < 0) {
- ret = -1;
- goto out;
- }
priv = this->private;
- list_for_each_entry (limit_node, &priv->limit_head, limit_list) {
- if (strcmp (limit_node->path, path) == 0) {
- *n = limit_node->value;
- break;
- }
- }
-
-out:
- GF_FREE (path);
-
- return ret;
-}
-
-
-static int32_t
-__quota_init_inode_ctx (inode_t *inode, int64_t limit, xlator_t *this,
- dict_t *dict, struct iatt *buf,
- quota_inode_ctx_t **context)
-{
- int32_t ret = -1;
- int64_t *size = 0;
- quota_inode_ctx_t *ctx = NULL;
+ ret = dict_get_bin (dict, QUOTA_LIMIT_KEY, (void **) &ptr);
+ limit = (quota_limit_t *)ptr;
- if (inode == NULL) {
- goto out;
+ if (limit) {
+ *hard_lim = ntoh64 (limit->hard_lim);
+ soft_lim_percent = ntoh64 (limit->soft_lim_percent);
}
- QUOTA_ALLOC_OR_GOTO (ctx, quota_inode_ctx_t, out);
-
- ctx->limit = limit;
- if (buf)
- ctx->buf = *buf;
-
- LOCK_INIT(&ctx->lock);
-
- if (context != NULL) {
- *context = ctx;
+ if (soft_lim_percent < 0) {
+ soft_lim_percent = priv->default_soft_lim;
}
- INIT_LIST_HEAD (&ctx->parents);
-
- if (dict != NULL) {
- ret = dict_get_bin (dict, QUOTA_SIZE_KEY, (void **) &size);
- if (ret == 0) {
- ctx->size = ntoh64 (*size);
- gettimeofday (&ctx->tv, NULL);
- }
+ if ((*hard_lim > 0) && (soft_lim_percent > 0)) {
+ *soft_lim = (soft_lim_percent * (*hard_lim))/100;
}
- ret = __inode_ctx_put (inode, this, (uint64_t )(long)ctx);
- if (ret == -1) {
- gf_log (this->name, GF_LOG_WARNING,
- "cannot set quota context in inode (gfid:%s)",
- uuid_utoa (inode->gfid));
- }
out:
- return ret;
-}
-
-
-static int32_t
-quota_inode_ctx_get (inode_t *inode, int64_t limit, xlator_t *this,
- dict_t *dict, struct iatt *buf, quota_inode_ctx_t **ctx,
- char create_if_absent)
-{
- int32_t ret = 0;
- uint64_t ctx_int;
-
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get (inode, this, &ctx_int);
-
- if ((ret == 0) && (ctx != NULL)) {
- *ctx = (quota_inode_ctx_t *) (unsigned long)ctx_int;
- } else if (create_if_absent) {
- ret = __quota_init_inode_ctx (inode, limit, this, dict,
- buf, ctx);
- }
- }
- UNLOCK (&inode->lock);
-
- return ret;
+ return 0;
}
-
-int32_t
-quota_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *dict, struct iatt *postparent)
+int
+quota_fill_inodectx (xlator_t *this, inode_t *inode, dict_t *dict,
+ loc_t *loc, struct iatt *buf, int32_t *op_errno)
{
- int32_t ret = -1;
- char found = 0;
- quota_local_t *local = NULL;
- quota_inode_ctx_t *ctx = NULL;
- quota_dentry_t *dentry = NULL;
- int64_t *size = 0;
- uint64_t value = 0;
- limits_t *limit_node = NULL;
- quota_priv_t *priv = NULL;
+ int32_t ret = -1;
+ char found = 0;
+ quota_inode_ctx_t *ctx = NULL;
+ quota_dentry_t *dentry = NULL;
+ uint64_t value = 0;
+ int64_t hard_lim = -1, soft_lim = -1;
- local = frame->local;
-
- priv = this->private;
+ quota_get_limits (this, dict, &hard_lim, &soft_lim);
inode_ctx_get (inode, this, &value);
ctx = (quota_inode_ctx_t *)(unsigned long)value;
- if ((op_ret < 0) || (local == NULL)
- || (((ctx == NULL) || (ctx->limit == local->limit))
- && (local->limit < 0) && !((IA_ISREG (buf->ia_type))
- || (IA_ISLNK (buf->ia_type))))) {
- goto unwind;
- }
-
- LOCK (&priv->lock);
- {
- list_for_each_entry (limit_node, &priv->limit_head,
- limit_list) {
- if (strcmp (local->loc.path, limit_node->path) == 0) {
- uuid_copy (limit_node->gfid, buf->ia_gfid);
- break;
- }
- }
+ if ((((ctx == NULL) || (ctx->hard_lim == hard_lim))
+ && (hard_lim < 0) && !QUOTA_REG_OR_LNK_FILE (buf->ia_type))) {
+ ret = 0;
+ goto out;
}
- UNLOCK (&priv->lock);
- ret = quota_inode_ctx_get (local->loc.inode, local->limit, this, dict,
- buf, &ctx, 1);
+ ret = quota_inode_ctx_get (inode, this, &ctx, 1);
if ((ret == -1) || (ctx == NULL)) {
gf_log (this->name, GF_LOG_WARNING, "cannot create quota "
"context in inode(gfid:%s)",
- uuid_utoa (local->loc.inode->gfid));
- op_ret = -1;
- op_errno = ENOMEM;
- goto unwind;
+ uuid_utoa (inode->gfid));
+ ret = -1;
+ *op_errno = ENOMEM;
+ goto out;
}
LOCK (&ctx->lock);
{
-
- if (dict != NULL) {
- ret = dict_get_bin (dict, QUOTA_SIZE_KEY,
- (void **) &size);
- if (ret == 0) {
- ctx->size = ntoh64 (*size);
- gettimeofday (&ctx->tv, NULL);
- }
- }
-
- if (local->limit != ctx->limit) {
- ctx->limit = local->limit;
- }
+ ctx->hard_lim = hard_lim;
+ ctx->soft_lim = soft_lim;
ctx->buf = *buf;
- if (!(IA_ISREG (buf->ia_type) || IA_ISLNK (buf->ia_type))) {
+ if (!QUOTA_REG_OR_LNK_FILE (buf->ia_type)) {
goto unlock;
}
- if (local->loc.name == NULL)
+ if (loc->name == NULL)
goto unlock;
list_for_each_entry (dentry, &ctx->parents, next) {
- if ((strcmp (dentry->name, local->loc.name) == 0) &&
- (uuid_compare (local->loc.parent->gfid,
+ if ((strcmp (dentry->name, loc->name) == 0) &&
+ (uuid_compare (loc->parent->gfid,
dentry->par) == 0)) {
found = 1;
break;
@@ -653,18 +1020,18 @@ quota_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (!found) {
dentry = __quota_dentry_new (ctx,
- (char *)local->loc.name,
- local->loc.parent->gfid);
+ (char *)loc->name,
+ loc->parent->gfid);
if (dentry == NULL) {
/*
- gf_log (this->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_WARNING,
"cannot create a new dentry (par:%"
PRId64", name:%s) for inode(ino:%"
PRId64", gfid:%s)",
uuid_utoa (local->loc.inode->gfid));
*/
- op_ret = -1;
- op_errno = ENOMEM;
+ ret = -1;
+ *op_errno = ENOMEM;
goto unlock;
}
}
@@ -672,6 +1039,25 @@ quota_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
unlock:
UNLOCK (&ctx->lock);
+out:
+ return ret;
+}
+
+int32_t
+quota_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *dict, struct iatt *postparent)
+{
+ quota_local_t *local = NULL;
+
+ if (op_ret < 0)
+ goto unwind;
+
+ local = frame->local;
+
+ op_ret = quota_fill_inodectx (this, inode, dict, &local->loc, buf,
+ &op_errno);
+
unwind:
QUOTA_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf,
dict, postparent);
@@ -683,133 +1069,55 @@ int32_t
quota_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
dict_t *xattr_req)
{
- int32_t ret = -1;
- int64_t limit = -1;
- limits_t *limit_node = NULL;
- gf_boolean_t dict_newed = _gf_false;
- quota_priv_t *priv = NULL;
- quota_local_t *local = NULL;
+ quota_priv_t *priv = NULL;
+ int32_t ret = -1;
+ quota_local_t *local = NULL;
priv = this->private;
- list_for_each_entry (limit_node, &priv->limit_head, limit_list) {
- if (strcmp (limit_node->path, loc->path) == 0) {
- limit = limit_node->value;
- }
- }
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
- local = quota_local_new ();
- if (local == NULL) {
+ xattr_req = xattr_req ? dict_ref(xattr_req) : dict_new();
+ if (!xattr_req)
goto err;
- }
- ret = loc_copy (&local->loc, loc);
- if (ret == -1) {
+ local = quota_local_new ();
+ if (local == NULL) {
goto err;
}
frame->local = local;
+ loc_copy (&local->loc, loc);
- local->limit = limit;
-
- if (limit < 0) {
- goto wind;
- }
-
- if (xattr_req == NULL) {
- xattr_req = dict_new ();
- dict_newed = _gf_true;
- }
-
- ret = dict_set_uint64 (xattr_req, QUOTA_SIZE_KEY, 0);
+ ret = dict_set_int8 (xattr_req, QUOTA_LIMIT_KEY, 1);
if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dict set of key for hard-limit failed");
goto err;
}
-wind:
STACK_WIND (frame, quota_lookup_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->lookup, loc, xattr_req);
ret = 0;
err:
+ if (xattr_req)
+ dict_unref (xattr_req);
+
if (ret < 0) {
QUOTA_STACK_UNWIND (lookup, frame, -1, ENOMEM,
NULL, NULL, NULL, NULL);
}
- if (dict_newed == _gf_true) {
- dict_unref (xattr_req);
- }
-
return 0;
-}
-
-
-void
-quota_update_size (xlator_t *this, inode_t *inode, char *name, uuid_t par,
- int64_t delta)
-{
- inode_t *_inode = NULL;
- inode_t *parent = NULL;
- uint64_t value = 0;
- quota_inode_ctx_t *ctx = NULL;
- uuid_t trav_uuid = {0,};
-
- GF_VALIDATE_OR_GOTO ("quota", this, out);
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
-
- inode_ctx_get (inode, this, &value);
- ctx = (quota_inode_ctx_t *)(unsigned long)value;
-
- _inode = inode_ref (inode);
-
- if ( par != NULL ) {
- uuid_copy (trav_uuid, par);
- }
-
- do {
- if ((ctx != NULL) && (ctx->limit >= 0)) {
- LOCK (&ctx->lock);
- {
- ctx->size += delta;
- }
- UNLOCK (&ctx->lock);
- }
-
- if (__is_root_gfid (_inode->gfid)) {
- break;
- }
-
- parent = inode_parent (_inode, trav_uuid, name);
- if (parent == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "cannot find parent for inode (gfid:%s), hence "
- "aborting size updation of parents",
- uuid_utoa (_inode->gfid));
- }
-
- if (name != NULL) {
- name = NULL;
- uuid_clear (trav_uuid);
- }
-
- inode_unref (_inode);
- _inode = parent;
- if (_inode == NULL) {
- break;
- }
-
- inode_ctx_get (_inode, this, &value);
- ctx = (quota_inode_ctx_t *)(unsigned long)value;
- } while (1);
-
-out:
- return;
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xattr_req);
+ return 0;
}
-
int32_t
quota_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
@@ -819,8 +1127,6 @@ quota_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
uint64_t ctx_int = 0;
quota_inode_ctx_t *ctx = NULL;
quota_local_t *local = NULL;
- quota_dentry_t *dentry = NULL;
- int64_t delta = 0;
local = frame->local;
@@ -850,12 +1156,6 @@ quota_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
UNLOCK (&ctx->lock);
- list_for_each_entry (dentry, &ctx->parents, next) {
- delta = (postbuf->ia_blocks - prebuf->ia_blocks) * 512;
- quota_update_size (this, local->loc.inode,
- dentry->name, dentry->par, delta);
- }
-
out:
QUOTA_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf,
xdata);
@@ -869,8 +1169,13 @@ quota_writev_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t off,
uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
- quota_local_t *local = NULL;
- int32_t op_errno = EINVAL;
+ quota_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
+ struct iovec *new_vector = NULL;
+ int32_t new_count = 0;
+
+ priv = this->private;
local = frame->local;
if (local == NULL) {
@@ -880,12 +1185,38 @@ quota_writev_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
if (local->op_ret == -1) {
op_errno = local->op_errno;
- goto unwind;
+
+ if ((op_errno == EDQUOT) && (local->space_available > 0)) {
+ new_count = iov_subset (vector, count, 0,
+ local->space_available, NULL);
+
+ new_vector = GF_CALLOC (new_count,
+ sizeof (struct iovec),
+ gf_common_mt_iovec);
+ if (new_vector == NULL) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ new_count = iov_subset (vector, count, 0,
+ local->space_available,
+ new_vector);
+
+ vector = new_vector;
+ count = new_count;
+ } else {
+ goto unwind;
+ }
}
- STACK_WIND (frame, quota_writev_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->writev, fd, vector, count, off,
- flags, iobref, xdata);
+ STACK_WIND (frame, quota_writev_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, fd,
+ vector, count, off, flags, iobref, xdata);
+
+ if (new_vector != NULL)
+ GF_FREE (new_vector);
+
return 0;
unwind:
@@ -899,14 +1230,21 @@ quota_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t off,
uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
int32_t ret = -1, op_errno = EINVAL;
int32_t parents = 0;
uint64_t size = 0;
quota_local_t *local = NULL;
quota_inode_ctx_t *ctx = NULL;
- quota_priv_t *priv = NULL;
+ quota_dentry_t *dentry = NULL, *tmp = NULL;
call_stub_t *stub = NULL;
- quota_dentry_t *dentry = NULL;
+ struct list_head head = {0, };
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
+ INIT_LIST_HEAD (&head);
GF_ASSERT (frame);
GF_VALIDATE_OR_GOTO ("quota", this, unwind);
@@ -920,12 +1258,13 @@ quota_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
frame->local = local;
local->loc.inode = inode_ref (fd->inode);
- ret = quota_inode_ctx_get (fd->inode, -1, this, NULL, NULL, &ctx, 0);
+ ret = quota_inode_ctx_get (fd->inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (fd->inode->gfid));
- goto unwind;
}
stub = fop_writev_stub (frame, quota_writev_helper, fd, vector, count,
@@ -939,40 +1278,38 @@ quota_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
GF_VALIDATE_OR_GOTO (this->name, priv, unwind);
size = iov_length (vector, count);
- LOCK (&ctx->lock);
- {
- list_for_each_entry (dentry, &ctx->parents, next) {
- parents++;
- }
- }
- UNLOCK (&ctx->lock);
-
- local->delta = size;
- local->stub = stub;
- local->link_count = parents;
-
- list_for_each_entry (dentry, &ctx->parents, next) {
- ret = quota_check_limit (frame, fd->inode, this, dentry->name,
- dentry->par);
- if (ret == -1) {
- break;
+ if (ctx != NULL) {
+ LOCK (&ctx->lock);
+ {
+ list_for_each_entry (dentry, &ctx->parents, next) {
+ tmp = __quota_dentry_new (NULL, dentry->name,
+ dentry->par);
+ list_add_tail (&tmp->next, &head);
+ parents++;
+ }
}
+ UNLOCK (&ctx->lock);
}
- stub = NULL;
-
LOCK (&local->lock);
{
- local->link_count = 0;
- if (local->validate_count == 0) {
- stub = local->stub;
- local->stub = NULL;
- }
+ local->delta = size;
+ local->link_count = (parents != 0) ? parents : 1;
+ local->stub = stub;
}
UNLOCK (&local->lock);
- if (stub != NULL) {
- call_resume (stub);
+ if (parents == 0) {
+ /* nameless lookup on this inode, allow quota to reconstruct
+ * ancestry as part of check_limit.
+ */
+ quota_check_limit (frame, fd->inode, this, NULL, NULL);
+ } else {
+ list_for_each_entry_safe (dentry, tmp, &head, next) {
+ quota_check_limit (frame, fd->inode, this, dentry->name,
+ dentry->par);
+ __quota_dentry_free (dentry);
+ }
}
return 0;
@@ -980,6 +1317,12 @@ quota_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
unwind:
QUOTA_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd,
+ vector, count, off, flags, iobref, xdata);
+ return 0;
}
@@ -1014,8 +1357,10 @@ quota_mkdir_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
goto unwind;
}
- STACK_WIND (frame, quota_mkdir_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+ STACK_WIND (frame, quota_mkdir_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, loc,
+ mode, umask, xdata);
+
return 0;
unwind:
@@ -1029,9 +1374,14 @@ int32_t
quota_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
mode_t umask, dict_t *xdata)
{
- int32_t ret = 0, op_errno = 0;
- quota_local_t *local = NULL;
- call_stub_t *stub = NULL;
+ quota_priv_t *priv = NULL;
+ int32_t ret = 0, op_errno = 0;
+ quota_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
local = quota_local_new ();
if (local == NULL) {
@@ -1041,8 +1391,6 @@ quota_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
frame->local = local;
- local->link_count = 1;
-
ret = loc_copy (&local->loc, loc);
if (ret) {
op_errno = ENOMEM;
@@ -1057,34 +1405,29 @@ quota_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
goto err;
}
- local->stub = stub;
- local->delta = 0;
-
- quota_check_limit (frame, loc->parent, this, NULL, NULL);
-
- stub = NULL;
-
LOCK (&local->lock);
{
- if (local->validate_count == 0) {
- stub = local->stub;
- local->stub = NULL;
- }
-
- local->link_count = 0;
+ local->stub = stub;
+ local->delta = 0;
+ local->link_count = 1;
}
UNLOCK (&local->lock);
- if (stub != NULL) {
- call_resume (stub);
- }
-
+ quota_check_limit (frame, loc->parent, this, NULL, NULL);
return 0;
+
err:
QUOTA_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL,
NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mkdir,
+ loc, mode, umask, xdata);
+
+ return 0;
}
@@ -1104,7 +1447,7 @@ quota_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto unwind;
}
- ret = quota_inode_ctx_get (inode, -1, this, NULL, buf, &ctx, 1);
+ ret = quota_inode_ctx_get (inode, this, &ctx, 1);
if ((ret == -1) || (ctx == NULL)) {
gf_log (this->name, GF_LOG_WARNING, "cannot create quota "
"context in inode(gfid:%s)",
@@ -1147,8 +1490,12 @@ quota_create_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
{
quota_local_t *local = NULL;
int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
local = frame->local;
+
+ priv = this->private;
+
if (local == NULL) {
gf_log (this->name, GF_LOG_WARNING, "local is NULL");
goto unwind;
@@ -1159,9 +1506,10 @@ quota_create_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
goto unwind;
}
- STACK_WIND (frame, quota_create_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->create, loc, flags, mode, umask,
- fd, xdata);
+
+ STACK_WIND (frame, quota_create_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->create, loc,
+ flags, mode, umask, fd, xdata);
return 0;
unwind:
@@ -1175,12 +1523,19 @@ int32_t
quota_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
- int32_t ret = -1;
- quota_local_t *local = NULL;
- call_stub_t *stub = NULL;
+ quota_priv_t *priv = NULL;
+ int32_t ret = -1;
+ quota_local_t *local = NULL;
+ int32_t op_errno = 0;
+ call_stub_t *stub = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
local = quota_local_new ();
if (local == NULL) {
+ op_errno = ENOMEM;
goto err;
}
@@ -1189,6 +1544,7 @@ quota_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
ret = loc_copy (&local->loc, loc);
if (ret) {
gf_log (this->name, GF_LOG_WARNING, "loc_copy failed");
+ op_errno = ENOMEM;
goto err;
}
@@ -1198,34 +1554,27 @@ quota_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
goto err;
}
- local->link_count = 1;
- local->stub = stub;
- local->delta = 0;
-
- quota_check_limit (frame, loc->parent, this, NULL, NULL);
-
- stub = NULL;
-
LOCK (&local->lock);
{
- local->link_count = 0;
- if (local->validate_count == 0) {
- stub = local->stub;
- local->stub = NULL;
- }
+ local->link_count = 1;
+ local->stub = stub;
+ local->delta = 0;
}
UNLOCK (&local->lock);
- if (stub != NULL) {
- call_resume (stub);
- }
-
+ quota_check_limit (frame, loc->parent, this, NULL, NULL);
return 0;
err:
- QUOTA_STACK_UNWIND (create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
- NULL, NULL);
+ QUOTA_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL,
+ NULL, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->create, loc,
+ flags, mode, umask, fd, xdata);
+ return 0;
}
@@ -1237,6 +1586,8 @@ quota_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
quota_local_t *local = NULL;
quota_inode_ctx_t *ctx = NULL;
uint64_t value = 0;
+ quota_dentry_t *dentry = NULL;
+ quota_dentry_t *old_dentry = NULL;
if (op_ret < 0) {
goto out;
@@ -1254,9 +1605,20 @@ quota_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_update_size (this, local->loc.inode, (char *)local->loc.name,
- local->loc.parent->gfid,
- (-(ctx->buf.ia_blocks * 512)));
+ LOCK (&ctx->lock);
+ {
+ list_for_each_entry (dentry, &ctx->parents, next) {
+ if ((strcmp (dentry->name, local->loc.name) == 0) &&
+ (uuid_compare (local->loc.parent->gfid,
+ dentry->par) == 0)) {
+ old_dentry = dentry;
+ break;
+ }
+ }
+ if (old_dentry)
+ __quota_dentry_free (old_dentry);
+ }
+ UNLOCK (&ctx->lock);
out:
QUOTA_STACK_UNWIND (unlink, frame, op_ret, op_errno, preparent,
@@ -1269,9 +1631,14 @@ int32_t
quota_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
dict_t *xdata)
{
- int32_t ret = 0;
+ quota_priv_t *priv = NULL;
+ int32_t ret = -1;
quota_local_t *local = NULL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto err;
@@ -1279,6 +1646,10 @@ quota_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
frame->local = local;
+ if (xdata && dict_get (xdata, GLUSTERFS_INTERNAL_FOP_KEY)) {
+ local->skip_check = _gf_true;
+ }
+
ret = loc_copy (&local->loc, loc);
if (ret) {
gf_log (this->name, GF_LOG_WARNING, "loc_copy failed");
@@ -1296,6 +1667,11 @@ err:
}
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
+ return 0;
}
@@ -1317,16 +1693,16 @@ quota_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
local = (quota_local_t *) frame->local;
- quota_update_size (this, local->loc.parent, NULL, NULL,
- (buf->ia_blocks * 512));
+ if (local->skip_check)
+ goto out;
- ret = quota_inode_ctx_get (inode, -1, this, NULL, NULL, &ctx, 0);
+ ret = quota_inode_ctx_get (inode, this, &ctx, 0);
if ((ret == -1) || (ctx == NULL)) {
- gf_log (this->name, GF_LOG_WARNING, "cannot find quota "
- "context in %s (gfid:%s)", local->loc.path,
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (inode->gfid));
- op_ret = -1;
- op_errno = EINVAL;
goto out;
}
@@ -1380,6 +1756,9 @@ quota_link_helper (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
{
quota_local_t *local = NULL;
int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
+
+ priv = this->private;
local = frame->local;
if (local == NULL) {
@@ -1393,8 +1772,9 @@ quota_link_helper (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
goto unwind;
}
- STACK_WIND (frame, quota_link_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+ STACK_WIND (frame, quota_link_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->link, oldloc,
+ newloc, xdata);
return 0;
unwind:
@@ -1408,10 +1788,28 @@ int32_t
quota_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
int32_t ret = -1, op_errno = ENOMEM;
quota_local_t *local = NULL;
+ quota_inode_ctx_t *ctx = NULL;
call_stub_t *stub = NULL;
- quota_inode_ctx_t *ctx = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
+ if (xdata && dict_get (xdata, GLUSTERFS_INTERNAL_FOP_KEY)) {
+ goto off;
+ }
+
+ quota_inode_ctx_get (oldloc->inode, this, &ctx, 0);
+ if (ctx == NULL) {
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
+ uuid_utoa (oldloc->inode->gfid));
+ }
local = quota_local_new ();
if (local == NULL) {
@@ -1420,6 +1818,7 @@ quota_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
frame->local = (void *) local;
+
ret = loc_copy (&local->loc, newloc);
if (ret == -1) {
gf_log (this->name, GF_LOG_WARNING, "loc_copy failed");
@@ -1431,47 +1830,26 @@ quota_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
goto err;
}
- local->link_count = 1;
- local->stub = stub;
-
- ret = quota_inode_ctx_get (oldloc->inode, -1, this, NULL, NULL, &ctx,
- 0);
- if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
- oldloc->inode ? uuid_utoa (oldloc->inode->gfid) : "0");
- op_errno = EINVAL;
- goto err;
- }
-
- local->delta = ctx->buf.ia_blocks * 512;
-
- quota_check_limit (frame, newloc->parent, this, NULL, NULL);
-
- stub = NULL;
-
LOCK (&local->lock);
{
- if (local->validate_count == 0) {
- stub = local->stub;
- local->stub = NULL;
- }
-
- local->link_count = 0;
+ local->link_count = 1;
+ local->stub = stub;
+ local->delta = (ctx != NULL) ? ctx->buf.ia_blocks * 512 : 0;
}
UNLOCK (&local->lock);
- if (stub != NULL) {
- call_resume (stub);
- }
+ quota_check_limit (frame, newloc->parent, this, NULL, NULL);
+ return 0;
- ret = 0;
err:
- if (ret < 0) {
- QUOTA_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL,
- NULL, NULL, NULL);
- }
+ QUOTA_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL,
+ NULL, NULL, NULL);
+ return 0;
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->link, oldloc,
+ newloc, xdata);
return 0;
}
@@ -1484,11 +1862,11 @@ quota_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
dict_t *xdata)
{
int32_t ret = -1;
+ int64_t size = 0;
quota_local_t *local = NULL;
quota_inode_ctx_t *ctx = NULL;
quota_dentry_t *old_dentry = NULL, *dentry = NULL;
char new_dentry_found = 0;
- int64_t size = 0;
if (op_ret < 0) {
goto out;
@@ -1502,29 +1880,22 @@ quota_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- if (IA_ISREG (local->oldloc.inode->ia_type)
- || IA_ISLNK (local->oldloc.inode->ia_type)) {
+ if (QUOTA_REG_OR_LNK_FILE (local->oldloc.inode->ia_type)) {
size = buf->ia_blocks * 512;
}
- if (local->oldloc.parent != local->newloc.parent) {
- quota_update_size (this, local->oldloc.parent, NULL, NULL, (-size));
- quota_update_size (this, local->newloc.parent, NULL, NULL, size);
- }
-
- if (!(IA_ISREG (local->oldloc.inode->ia_type)
- || IA_ISLNK (local->oldloc.inode->ia_type))) {
+ if (!QUOTA_REG_OR_LNK_FILE (local->oldloc.inode->ia_type)) {
goto out;
}
- ret = quota_inode_ctx_get (local->oldloc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ ret = quota_inode_ctx_get (local->oldloc.inode, this, &ctx, 0);
if ((ret == -1) || (ctx == NULL)) {
- gf_log (this->name, GF_LOG_WARNING, "quota context not"
- "set in inode(gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (local->oldloc.inode->gfid));
- op_ret = -1;
- op_errno = EINVAL;
+
goto out;
}
@@ -1570,7 +1941,8 @@ quota_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (dentry == NULL) {
gf_log (this->name, GF_LOG_WARNING,
"cannot create a new dentry (name:%s) "
- "for inode(gfid:%s)", local->newloc.name,
+ "for inode(gfid:%s)",
+ local->newloc.name,
uuid_utoa (local->newloc.inode->gfid));
op_ret = -1;
op_errno = ENOMEM;
@@ -1597,6 +1969,9 @@ quota_rename_helper (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
{
quota_local_t *local = NULL;
int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
+
+ priv = this->private;
local = frame->local;
if (local == NULL) {
@@ -1610,8 +1985,10 @@ quota_rename_helper (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
goto unwind;
}
- STACK_WIND (frame, quota_rename_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+ STACK_WIND (frame, quota_rename_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->rename, oldloc,
+ newloc, xdata);
+
return 0;
unwind:
@@ -1621,14 +1998,59 @@ unwind:
}
+static int32_t
+quota_rename_get_size_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+{
+ quota_local_t *local = NULL;
+ int32_t ret = 0;
+ int64_t *size = 0;
+
+ GF_ASSERT (frame);
+ GF_VALIDATE_OR_GOTO_WITH_ERROR ("quota", this, out, op_errno,
+ EINVAL);
+ GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, xdata, out, op_errno,
+ EINVAL);
+ local = frame->local;
+ GF_ASSERT (local);
+ local->link_count = 1;
+
+ if (op_ret < 0)
+ goto out;
+
+
+ ret = dict_get_bin (xdata, QUOTA_SIZE_KEY, (void **) &size);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "size key not present in dict");
+ op_errno = EINVAL;
+ goto out;
+ }
+ local->delta = ntoh64 (*size);
+ quota_check_limit (frame, local->newloc.parent, this,
+ NULL, NULL);
+ return 0;
+
+out:
+ quota_handle_validate_error (local, -1, op_errno);
+ return 0;
+}
+
int32_t
quota_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
loc_t *newloc, dict_t *xdata)
{
- int32_t ret = -1, op_errno = ENOMEM;
- quota_local_t *local = NULL;
- call_stub_t *stub = NULL;
- quota_inode_ctx_t *ctx = NULL;
+ quota_priv_t *priv = NULL;
+ int32_t ret = -1, op_errno = ENOMEM;
+ quota_local_t *local = NULL;
+ quota_inode_ctx_t *ctx = NULL;
+ call_stub_t *stub = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
local = quota_local_new ();
if (local == NULL) {
@@ -1655,51 +2077,63 @@ quota_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
goto err;
}
- local->link_count = 1;
- local->stub = stub;
+ LOCK (&local->lock);
+ {
+ local->link_count = 1;
+ local->stub = stub;
+ }
+ UNLOCK (&local->lock);
- if (IA_ISREG (oldloc->inode->ia_type)
- || IA_ISLNK (oldloc->inode->ia_type)) {
- ret = quota_inode_ctx_get (oldloc->inode, -1, this, NULL, NULL,
- &ctx, 0);
+ if (QUOTA_REG_OR_LNK_FILE (oldloc->inode->ia_type)) {
+ ret = quota_inode_ctx_get (oldloc->inode, this, &ctx, 0);
if (ctx == NULL) {
gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ "quota context not set in inode (gfid:%s), "
+ "considering file size as zero while enforcing "
+ "quota on new ancestry",
oldloc->inode ? uuid_utoa (oldloc->inode->gfid)
: "0");
- op_errno = EINVAL;
- goto err;
- }
- local->delta = ctx->buf.ia_blocks * 512;
- } else {
- local->delta = 0;
- }
+ local->delta = 0;
- quota_check_limit (frame, newloc->parent, this, NULL, NULL);
+ } else {
- stub = NULL;
+ /* FIXME: We need to account for the size occupied by this
+ * inode on the target directory. To avoid double
+ * accounting, we need to modify enforcer to perform
+ * quota_check_limit only uptil the least common ancestor
+ * directory inode*/
- LOCK (&local->lock);
- {
- if (local->validate_count == 0) {
- stub = local->stub;
- local->stub = NULL;
+ /* FIXME: The following code assumes that regular files and
+ *linkfiles are present, in their entirety, in a single
+ brick. This *assumption is invalid in the case of
+ stripe.*/
+
+ local->delta = ctx->buf.ia_blocks * 512;
}
- local->link_count = 0;
- }
- UNLOCK (&local->lock);
+ } else if (IA_ISDIR (oldloc->inode->ia_type)) {
+ ret = quota_validate (frame, oldloc->inode, this,
+ quota_rename_get_size_cbk);
+ if (ret){
+ op_errno = -ret;
+ goto err;
+ }
- if (stub != NULL) {
- call_resume (stub);
+ return 0;
}
- ret = 0;
+ quota_check_limit (frame, newloc->parent, this, NULL, NULL);
+ return 0;
+
err:
- if (ret == -1) {
- QUOTA_STACK_UNWIND (rename, frame, -1, op_errno, NULL,
- NULL, NULL, NULL, NULL, NULL);
- }
+ QUOTA_STACK_UNWIND (rename, frame, -1, op_errno, NULL,
+ NULL, NULL, NULL, NULL, NULL);
+ return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->rename, oldloc,
+ newloc, xdata);
return 0;
}
@@ -1711,7 +2145,6 @@ quota_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *buf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- int64_t size = 0;
quota_local_t *local = NULL;
quota_inode_ctx_t *ctx = NULL;
quota_dentry_t *dentry = NULL;
@@ -1721,16 +2154,15 @@ quota_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
local = frame->local;
- size = buf->ia_blocks * 512;
- quota_update_size (this, local->loc.parent, NULL, NULL, size);
-
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 1);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 1);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (local->loc.inode->gfid));
+
goto out;
}
@@ -1765,6 +2197,7 @@ quota_symlink_helper (call_frame_t *frame, xlator_t *this, const char *linkpath,
{
quota_local_t *local = NULL;
int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
local = frame->local;
if (local == NULL) {
@@ -1772,14 +2205,16 @@ quota_symlink_helper (call_frame_t *frame, xlator_t *this, const char *linkpath,
goto unwind;
}
+ priv = this->private;
+
if (local->op_ret == -1) {
op_errno = local->op_errno;
goto unwind;
}
- STACK_WIND (frame, quota_symlink_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->symlink, linkpath, loc, umask,
- xdata);
+ STACK_WIND (frame, quota_symlink_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->symlink,
+ linkpath, loc, umask, xdata);
return 0;
unwind:
@@ -1793,10 +2228,15 @@ int
quota_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
loc_t *loc, mode_t umask, dict_t *xdata)
{
- int32_t ret = -1;
- int32_t op_errno = ENOMEM;
- quota_local_t *local = NULL;
- call_stub_t *stub = NULL;
+ quota_priv_t *priv = NULL;
+ int32_t ret = -1;
+ int32_t op_errno = ENOMEM;
+ quota_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
local = quota_local_new ();
if (local == NULL) {
@@ -1811,36 +2251,21 @@ quota_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
goto err;
}
- local->link_count = 1;
-
stub = fop_symlink_stub (frame, quota_symlink_helper, linkpath, loc,
umask, xdata);
if (stub == NULL) {
goto err;
}
- local->stub = stub;
- local->delta = strlen (linkpath);
-
- quota_check_limit (frame, loc->parent, this, NULL, NULL);
-
- stub = NULL;
-
LOCK (&local->lock);
{
- if (local->validate_count == 0) {
- stub = local->stub;
- local->stub = NULL;
- }
-
- local->link_count = 0;
+ local->stub = stub;
+ local->delta = strlen (linkpath);
+ local->link_count = 1;
}
UNLOCK (&local->lock);
- if (stub != NULL) {
- call_resume (stub);
- }
-
+ quota_check_limit (frame, loc->parent, this, NULL, NULL);
return 0;
err:
@@ -1848,6 +2273,12 @@ err:
NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->symlink,
+ linkpath, loc, umask, xdata);
+ return 0;
}
@@ -1857,7 +2288,6 @@ quota_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postbuf, dict_t *xdata)
{
quota_local_t *local = NULL;
- int64_t delta = 0;
quota_inode_ctx_t *ctx = NULL;
if (op_ret < 0) {
@@ -1870,15 +2300,12 @@ quota_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- delta = (postbuf->ia_blocks - prebuf->ia_blocks) * 512;
-
- quota_update_size (this, local->loc.inode, NULL, NULL, delta);
-
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (local->loc.inode->gfid));
goto out;
}
@@ -1900,9 +2327,14 @@ int32_t
quota_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
int32_t ret = -1;
quota_local_t *local = NULL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto err;
@@ -1920,10 +2352,15 @@ quota_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
return 0;
+
err:
QUOTA_STACK_UNWIND (truncate, frame, -1, ENOMEM, NULL, NULL, NULL);
return 0;
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
+ return 0;
}
@@ -1933,7 +2370,6 @@ quota_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postbuf, dict_t *xdata)
{
quota_local_t *local = NULL;
- int64_t delta = 0;
quota_inode_ctx_t *ctx = NULL;
if (op_ret < 0) {
@@ -1946,15 +2382,12 @@ quota_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- delta = (postbuf->ia_blocks - prebuf->ia_blocks) * 512;
-
- quota_update_size (this, local->loc.inode, NULL, NULL, delta);
-
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (local->loc.inode->gfid));
goto out;
}
@@ -1976,8 +2409,13 @@ int32_t
quota_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL)
goto err;
@@ -1986,14 +2424,21 @@ quota_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
local->loc.inode = inode_ref (fd->inode);
- STACK_WIND (frame, quota_ftruncate_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
+ STACK_WIND (frame, quota_ftruncate_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate, fd,
+ offset, xdata);
return 0;
err:
QUOTA_STACK_UNWIND (ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd,
+ offset, xdata);
+ return 0;
}
@@ -2006,14 +2451,23 @@ quota_send_dir_limit_to_cli (call_frame_t *frame, xlator_t *this,
dict_t *dict = NULL;
quota_inode_ctx_t *ctx = NULL;
uint64_t value = 0;
+ quota_priv_t *priv = NULL;
+
+ priv = this->private;
+ if (!priv->is_quota_on) {
+ snprintf (dir_limit, 1024, "Quota is disabled please turn on");
+ goto dict_set;
+ }
ret = inode_ctx_get (inode, this, &value);
if (ret < 0)
goto out;
ctx = (quota_inode_ctx_t *)(unsigned long)value;
- snprintf (dir_limit, 1024, "%"PRId64",%"PRId64, ctx->size, ctx->limit);
+ snprintf (dir_limit, 1024, "%"PRId64",%"PRId64, ctx->size,
+ ctx->hard_lim);
+dict_set:
dict = dict_new ();
if (dict == NULL) {
ret = -1;
@@ -2024,7 +2478,7 @@ quota_send_dir_limit_to_cli (call_frame_t *frame, xlator_t *this,
if (ret < 0)
goto out;
- gf_log (this->name, GF_LOG_INFO, "str = %s", dir_limit);
+ gf_log (this->name, GF_LOG_DEBUG, "str = %s", dir_limit);
QUOTA_STACK_UNWIND (getxattr, frame, 0, 0, dict, NULL);
@@ -2076,7 +2530,8 @@ quota_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
int32_t
quota_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
{
quota_local_t *local = NULL;
quota_inode_ctx_t *ctx = NULL;
@@ -2091,12 +2546,17 @@ quota_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "quota context not set in inode (gfid:%s)",
- uuid_utoa (local->loc.inode->gfid));
+ if (!IA_ISDIR (buf->ia_type)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler "
+ "has finished crawling, its an error",
+ uuid_utoa (local->loc.inode->gfid));
+ }
+
goto out;
}
@@ -2116,9 +2576,14 @@ out:
int32_t
quota_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
int32_t ret = -1;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto unwind;
@@ -2132,12 +2597,19 @@ quota_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
}
STACK_WIND (frame, quota_stat_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->stat, loc, xdata);
+ FIRST_CHILD(this)->fops->stat, loc,
+ xdata);
return 0;
unwind:
QUOTA_STACK_UNWIND (stat, frame, -1, ENOMEM, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->stat, loc,
+ xdata);
+ return 0;
}
@@ -2159,12 +2631,17 @@ quota_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
- uuid_utoa (local->loc.inode->gfid));
+ if (!IA_ISDIR (buf->ia_type)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler "
+ "has finished crawling, its an error",
+ uuid_utoa (local->loc.inode->gfid));
+ }
+
goto out;
}
@@ -2184,8 +2661,13 @@ out:
int32_t
quota_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto unwind;
@@ -2195,13 +2677,20 @@ quota_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
local->loc.inode = inode_ref (fd->inode);
- STACK_WIND (frame, quota_fstat_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fstat, fd, xdata);
+ STACK_WIND (frame, quota_fstat_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fstat, fd,
+ xdata);
return 0;
unwind:
QUOTA_STACK_UNWIND (fstat, frame, -1, ENOMEM, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat, fd,
+ xdata);
+ return 0;
}
@@ -2223,11 +2712,12 @@ quota_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (local->loc.inode->gfid));
goto out;
}
@@ -2239,7 +2729,8 @@ quota_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
UNLOCK (&ctx->lock);
out:
- QUOTA_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, buf, xdata);
+ QUOTA_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, buf,
+ xdata);
return 0;
}
@@ -2248,9 +2739,14 @@ int32_t
quota_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
int32_t ret = -1;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto unwind;
@@ -2264,13 +2760,20 @@ quota_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
goto unwind;
}
- STACK_WIND (frame, quota_readlink_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readlink, loc, size, xdata);
+ STACK_WIND (frame, quota_readlink_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->readlink, loc,
+ size, xdata);
return 0;
unwind:
QUOTA_STACK_UNWIND (readlink, frame, -1, ENOMEM, NULL, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readlink, loc,
+ size, xdata);
+ return 0;
}
@@ -2293,11 +2796,12 @@ quota_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (local->loc.inode->gfid));
goto out;
}
@@ -2319,8 +2823,13 @@ int32_t
quota_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, uint32_t flags, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto unwind;
@@ -2330,13 +2839,20 @@ quota_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
local->loc.inode = inode_ref (fd->inode);
- STACK_WIND (frame, quota_readv_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readv, fd, size, offset, flags,
- xdata);
+ STACK_WIND (frame, quota_readv_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, fd,
+ size, offset, flags, xdata);
return 0;
unwind:
- QUOTA_STACK_UNWIND (readv, frame, -1, ENOMEM, NULL, -1, NULL, NULL, NULL);
+ QUOTA_STACK_UNWIND (readv, frame, -1, ENOMEM, NULL, -1, NULL, NULL,
+ NULL);
+ return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readv, fd,
+ size, offset, flags, xdata);
return 0;
}
@@ -2359,11 +2875,12 @@ quota_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is NULL on "
+ "inode (%s). "
+ "If quota is not enabled recently and crawler has "
+ "finished crawling, its an error",
uuid_utoa (local->loc.inode->gfid));
goto out;
}
@@ -2385,8 +2902,13 @@ int32_t
quota_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto unwind;
@@ -2397,13 +2919,19 @@ quota_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
frame->local = local;
STACK_WIND (frame, quota_fsync_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsync, fd, flags, xdata);
+ FIRST_CHILD(this)->fops->fsync, fd,
+ flags, xdata);
return 0;
unwind:
QUOTA_STACK_UNWIND (fsync, frame, -1, ENOMEM, NULL, NULL, NULL);
return 0;
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsync, fd,
+ flags, xdata);
+ return 0;
}
@@ -2425,12 +2953,16 @@ quota_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_DEBUG,
- "quota context not set in inode (gfid:%s)",
- uuid_utoa (local->loc.inode->gfid));
+ if (!IA_ISDIR (statpost->ia_type)) {
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is "
+ "NULL on inode (%s). "
+ "If quota is not enabled recently and crawler "
+ "has finished crawling, its an error",
+ uuid_utoa (local->loc.inode->gfid));
+ }
+
goto out;
}
@@ -2452,9 +2984,14 @@ int32_t
quota_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
int32_t ret = -1;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto unwind;
@@ -2469,12 +3006,19 @@ quota_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
}
STACK_WIND (frame, quota_setattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->setattr, loc, stbuf, valid, xdata);
+ FIRST_CHILD (this)->fops->setattr, loc,
+ stbuf, valid, xdata);
return 0;
unwind:
QUOTA_STACK_UNWIND (setattr, frame, -1, ENOMEM, NULL, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->setattr, loc,
+ stbuf, valid, xdata);
+ return 0;
}
@@ -2496,12 +3040,16 @@ quota_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- quota_inode_ctx_get (local->loc.inode, -1, this, NULL, NULL,
- &ctx, 0);
+ quota_inode_ctx_get (local->loc.inode, this, &ctx, 0);
if (ctx == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
- "quota context not set in inode (gfid:%s)",
- uuid_utoa (local->loc.inode->gfid));
+ if (!IA_ISDIR (statpost->ia_type)) {
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is "
+ "NULL on inode (%s). "
+ "If quota is not enabled recently and crawler "
+ "has finished crawling, its an error",
+ uuid_utoa (local->loc.inode->gfid));
+ }
+
goto out;
}
@@ -2522,8 +3070,13 @@ int32_t
quota_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
local = quota_local_new ();
if (local == NULL) {
goto unwind;
@@ -2533,13 +3086,20 @@ quota_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
local->loc.inode = inode_ref (fd->inode);
- STACK_WIND (frame, quota_fsetattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsetattr, fd, stbuf, valid, xdata);
+ STACK_WIND (frame, quota_fsetattr_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->fsetattr, fd,
+ stbuf, valid, xdata);
return 0;
unwind:
QUOTA_STACK_UNWIND (fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsetattr, fd,
+ stbuf, valid, xdata);
+ return 0;
}
@@ -2559,7 +3119,7 @@ quota_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto unwind;
}
- ret = quota_inode_ctx_get (inode, -1, this, NULL, buf, &ctx, 1);
+ ret = quota_inode_ctx_get (inode, this, &ctx, 1);
if ((ret == -1) || (ctx == NULL)) {
gf_log (this->name, GF_LOG_WARNING, "cannot create quota "
"context in inode (gfid:%s)", uuid_utoa (inode->gfid));
@@ -2600,6 +3160,7 @@ quota_mknod_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
{
quota_local_t *local = NULL;
int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
local = frame->local;
if (local == NULL) {
@@ -2607,14 +3168,16 @@ quota_mknod_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
goto unwind;
}
+ priv = this->private;
+
if (local->op_ret == -1) {
op_errno = local->op_errno;
goto unwind;
}
- STACK_WIND (frame, quota_mknod_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask,
- xdata);
+ STACK_WIND (frame, quota_mknod_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, loc,
+ mode, rdev, umask, xdata);
return 0;
@@ -2629,9 +3192,14 @@ int
quota_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
dev_t rdev, mode_t umask, dict_t *xdata)
{
- int32_t ret = -1;
- quota_local_t *local = NULL;
- call_stub_t *stub = NULL;
+ quota_priv_t *priv = NULL;
+ int32_t ret = -1;
+ quota_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
local = quota_local_new ();
if (local == NULL) {
@@ -2652,33 +3220,26 @@ quota_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
goto err;
}
- local->link_count = 1;
- local->stub = stub;
- local->delta = 0;
-
- quota_check_limit (frame, loc->parent, this, NULL, NULL);
-
- stub = NULL;
-
LOCK (&local->lock);
{
- local->link_count = 0;
- if (local->validate_count == 0) {
- stub = local->stub;
- local->stub = NULL;
- }
+ local->link_count = 1;
+ local->stub = stub;
+ local->delta = 0;
}
UNLOCK (&local->lock);
- if (stub != NULL) {
- call_resume (stub);
- }
-
+ quota_check_limit (frame, loc->parent, this, NULL, NULL);
return 0;
+
err:
QUOTA_STACK_UNWIND (mknod, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
NULL);
+ return 0;
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mknod, loc,
+ mode, rdev, umask, xdata);
return 0;
}
@@ -2686,6 +3247,28 @@ int
quota_setxattr_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int op_ret, int op_errno, dict_t *xdata)
{
+ quota_local_t *local = NULL;
+ quota_inode_ctx_t *ctx = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ if (!local)
+ goto out;
+
+ ret = quota_inode_ctx_get (local->loc.inode, this, &ctx, 1);
+ if ((ret < 0) || (ctx == NULL)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ LOCK (&ctx->lock);
+ {
+ ctx->hard_lim = local->limit.hard_lim;
+ ctx->soft_lim = local->limit.soft_lim_percent;
+ }
+ UNLOCK (&ctx->lock);
+
+out:
QUOTA_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2694,30 +3277,83 @@ int
quota_setxattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, dict_t *dict, int flags, dict_t *xdata)
{
- int op_errno = EINVAL;
- int op_ret = -1;
+ quota_priv_t *priv = NULL;
+ int op_errno = EINVAL;
+ int op_ret = -1;
+ int64_t hard_lim = -1, soft_lim = -1;
+ quota_local_t *local = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (loc, err);
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.quota*", dict,
- op_errno, err);
+ if (frame->root->pid >= 0) {
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.quota*", dict,
+ op_errno, err);
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.pgfid*", dict, op_errno,
+ err);
+ }
+
+ quota_get_limits (this, dict, &hard_lim, &soft_lim);
+
+ if (hard_lim > 0) {
+ local = quota_local_new ();
+ if (local == NULL) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ frame->local = local;
+ loc_copy (&local->loc, loc);
+
+ local->limit.hard_lim = hard_lim;
+ local->limit.soft_lim_percent = soft_lim;
+ }
STACK_WIND (frame, quota_setxattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->setxattr,
- loc, dict, flags, xdata);
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->setxattr, loc,
+ dict, flags, xdata);
return 0;
err:
QUOTA_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setxattr, loc,
+ dict, flags, xdata);
+ return 0;
}
int
quota_fsetxattr_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int op_ret, int op_errno, dict_t *xdata)
{
+ quota_inode_ctx_t *ctx = NULL;
+ quota_local_t *local = NULL;
+
+ local = frame->local;
+ if (!local)
+ goto out;
+
+ op_ret = quota_inode_ctx_get (local->loc.inode, this, &ctx, 1);
+ if ((op_ret < 0) || (ctx == NULL)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ LOCK (&ctx->lock);
+ {
+ ctx->hard_lim = local->limit.hard_lim;
+ ctx->soft_lim = local->limit.soft_lim_percent;
+ }
+ UNLOCK (&ctx->lock);
+
+out:
QUOTA_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata);
return 0;
}
@@ -2726,24 +3362,55 @@ int
quota_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
dict_t *dict, int flags, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = EINVAL;
+ quota_local_t *local = NULL;
+ int64_t hard_lim = -1, soft_lim = -1;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (fd, err);
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.quota*", dict,
- op_errno, err);
+ if (0 <= frame->root->pid) {
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.quota*",
+ dict, op_errno, err);
+ GF_IF_INTERNAL_XATTR_GOTO ("trusted.pgfid*", dict,
+ op_errno, err);
+ }
+
+ quota_get_limits (this, dict, &hard_lim, &soft_lim);
+
+ if (hard_lim > 0) {
+ local = quota_local_new ();
+ if (local == NULL) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ frame->local = local;
+ local->loc.inode = inode_ref (fd->inode);
+
+ local->limit.hard_lim = hard_lim;
+ local->limit.soft_lim_percent = soft_lim;
+ }
STACK_WIND (frame, quota_fsetxattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsetxattr,
- fd, dict, flags, xdata);
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsetxattr, fd,
+ dict, flags, xdata);
return 0;
- err:
+err:
QUOTA_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr, fd,
+ dict, flags, xdata);
+ return 0;
}
@@ -2759,24 +3426,42 @@ int
quota_removexattr (call_frame_t *frame, xlator_t *this,
loc_t *loc, const char *name, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
int32_t op_errno = EINVAL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
VALIDATE_OR_GOTO (this, err);
- GF_IF_NATIVE_XATTR_GOTO ("trusted.quota*",
- name, op_errno, err);
+ /* all quota xattrs can be cleaned up by doing setxattr on special key.
+ * Hence its ok that we don't allow removexattr on quota keys here.
+ */
+ if (frame->root->pid >= 0) {
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.quota*",
+ name, op_errno, err);
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.pgfid*", name,
+ op_errno, err);
+ }
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (loc, err);
STACK_WIND (frame, quota_removexattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->removexattr,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr,
loc, name, xdata);
return 0;
+
err:
QUOTA_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr,
+ loc, name, xdata);
+ return 0;
}
@@ -2792,24 +3477,37 @@ int
quota_fremovexattr (call_frame_t *frame, xlator_t *this,
fd_t *fd, const char *name, dict_t *xdata)
{
+ quota_priv_t *priv = NULL;
int32_t op_ret = -1;
int32_t op_errno = EINVAL;
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
VALIDATE_OR_GOTO (frame, err);
VALIDATE_OR_GOTO (this, err);
VALIDATE_OR_GOTO (fd, err);
- GF_IF_NATIVE_XATTR_GOTO ("trusted.quota*",
- name, op_errno, err);
-
+ if (frame->root->pid >= 0) {
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.quota*",
+ name, op_errno, err);
+ GF_IF_NATIVE_XATTR_GOTO ("trusted.pgfid*", name,
+ op_errno, err);
+ }
STACK_WIND (frame, quota_fremovexattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fremovexattr,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fremovexattr,
fd, name, xdata);
return 0;
- err:
+err:
QUOTA_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL);
return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fremovexattr,
+ fd, name, xdata);
+ return 0;
}
@@ -2818,16 +3516,16 @@ quota_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct statvfs *buf,
dict_t *xdata)
{
- inode_t *root_inode = NULL;
- quota_priv_t *priv = NULL;
- uint64_t value = 0;
- quota_inode_ctx_t *ctx = NULL;
- limits_t *limit_node = NULL;
- int64_t usage = -1;
- int64_t avail = -1;
- int64_t blocks = 0;
+ inode_t *inode = NULL;
+ uint64_t value = 0;
+ int64_t usage = -1;
+ int64_t avail = -1;
+ int64_t blocks = 0;
+ quota_inode_ctx_t *ctx = NULL;
+ int ret = 0;
+ gf_boolean_t dict_created = _gf_false;
- root_inode = cookie;
+ inode = cookie;
/* This fop will fail mostly in case of client disconnect's,
* which is already logged. Hence, not logging here */
@@ -2838,130 +3536,540 @@ quota_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
* cookie, and it would only do so if the value was non-NULL. This
* check is therefore just routine defensive coding.
*/
- if (!root_inode) {
+ if (!inode) {
gf_log(this->name,GF_LOG_WARNING,
"null inode, cannot adjust for quota");
goto unwind;
}
- if (!root_inode->table || (root_inode != root_inode->table->root)) {
- gf_log(this->name,GF_LOG_WARNING,
- "non-root inode, cannot adjust for quota");
- goto unwind;
- }
- inode_ctx_get (root_inode, this, &value);
+ inode_ctx_get (inode, this, &value);
if (!value) {
goto unwind;
}
- ctx = (quota_inode_ctx_t *)(unsigned long)value;
- usage = (ctx->size) / buf->f_bsize;
- priv = this->private;
- list_for_each_entry (limit_node, &priv->limit_head, limit_list) {
- /* Notice that this only works for volume-level quota. */
- if (strcmp (limit_node->path, "/") == 0) {
- blocks = limit_node->value / buf->f_bsize;
- if (usage > blocks) {
- break;
- }
+ /* if limit is set on this inode, report statfs based on this inode
+ * else report based on root.
+ */
+ ctx = (quota_inode_ctx_t *)(unsigned long)value;
+ if (ctx->hard_lim <= 0) {
+ inode_ctx_get (inode->table->root, this, &value);
+ ctx = (quota_inode_ctx_t *)(unsigned long) value;
+ if (!ctx || ctx->hard_lim < 0)
+ goto unwind;
+ }
+
+ { /* statfs is adjusted in this code block */
+ usage = (ctx->size) / buf->f_bsize;
+
+ blocks = ctx->hard_lim / buf->f_bsize;
+ buf->f_blocks = blocks;
+
+ avail = buf->f_blocks - usage;
+ avail = max (avail, 0);
+
+ buf->f_bfree = avail;
+ /*
+ * We have to assume that the total assigned quota
+ * won't cause us to dip into the reserved space,
+ * because dealing with the overcommitted cases is
+ * just too hairy (especially when different bricks
+ * might be using different reserved percentages and
+ * such).
+ */
+ buf->f_bavail = buf->f_bfree;
+ }
- buf->f_blocks = blocks;
- avail = buf->f_blocks - usage;
- if (buf->f_bfree > avail) {
- buf->f_bfree = avail;
- }
- /*
- * We have to assume that the total assigned quota
- * won't cause us to dip into the reserved space,
- * because dealing with the overcommitted cases is
- * just too hairy (especially when different bricks
- * might be using different reserved percentages and
- * such).
- */
- buf->f_bavail = buf->f_bfree;
- break;
- }
+ if (!xdata) {
+ xdata = dict_new ();
+ if (!xdata)
+ goto unwind;
+ dict_created = _gf_true;
}
+ ret = dict_set_int8 (xdata, "quota-deem-statfs", 1);
+ if (-1 == ret)
+ gf_log (this->name, GF_LOG_ERROR, "Dict set failed, "
+ "deem-statfs option may have no effect");
+
unwind:
- if (root_inode) {
- inode_unref(root_inode);
- }
- STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata);
+ QUOTA_STACK_UNWIND (statfs, frame, op_ret, op_errno, buf, xdata);
+
+ if (dict_created)
+ dict_unref (xdata);
+ return 0;
+}
+
+
+int32_t
+quota_statfs_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata)
+{
+ quota_local_t *local = NULL;
+ int op_errno = EINVAL;
+
+ GF_VALIDATE_OR_GOTO ("quota", (local = frame->local), err);
+
+ if (-1 == local->op_ret) {
+ op_errno = local->op_errno;
+ goto err;
+ }
+
+ STACK_WIND_COOKIE (frame, quota_statfs_cbk, loc->inode,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->statfs, loc, xdata);
+ return 0;
+err:
+ QUOTA_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL);
return 0;
}
+int32_t
+quota_statfs_validate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+{
+ quota_local_t *local = NULL;
+ int32_t ret = 0;
+ quota_inode_ctx_t *ctx = NULL;
+ int64_t *size = 0;
+ uint64_t value = 0;
+
+ local = frame->local;
+
+ if (op_ret < 0)
+ goto resume;
+
+ GF_ASSERT (local);
+ GF_ASSERT (frame);
+ GF_VALIDATE_OR_GOTO_WITH_ERROR ("quota", this, resume, op_errno,
+ EINVAL);
+ GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, xdata, resume, op_errno,
+ EINVAL);
+
+ ret = inode_ctx_get (local->validate_loc.inode, this, &value);
+
+ ctx = (quota_inode_ctx_t *)(unsigned long)value;
+ if ((ret == -1) || (ctx == NULL)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "quota context is not present in inode (gfid:%s)",
+ uuid_utoa (local->validate_loc.inode->gfid));
+ op_errno = EINVAL;
+ goto resume;
+ }
+
+ ret = dict_get_bin (xdata, QUOTA_SIZE_KEY, (void **) &size);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "size key not present in dict");
+ op_errno = EINVAL;
+ goto resume;
+ }
+
+ LOCK (&ctx->lock);
+ {
+ ctx->size = ntoh64 (*size);
+ gettimeofday (&ctx->tv, NULL);
+ }
+ UNLOCK (&ctx->lock);
+
+resume:
+ quota_link_count_decrement (local);
+ return 0;
+}
int32_t
quota_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- inode_t *root_inode = NULL;
+ quota_local_t *local = NULL;
+ int op_errno = 0;
+ call_stub_t *stub = NULL;
+ quota_priv_t *priv = NULL;
+ int ret = 0;
- if (loc->inode) {
- root_inode = loc->inode->table->root;
- inode_ref(root_inode);
- STACK_WIND_COOKIE (frame, quota_statfs_cbk, root_inode,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->statfs, loc, xdata);
- }
- else {
- /*
- * We have to make sure that we never get to quota_statfs_cbk
- * with a cookie that points to something other than an inode,
- * which is exactly what would happen with STACK_UNWIND using
- * that as a callback. Therefore, use default_statfs_cbk in
- * this case instead.
- */
- gf_log(this->name,GF_LOG_WARNING,
- "missing inode, cannot adjust for quota");
- STACK_WIND (frame, default_statfs_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->statfs, loc, xdata);
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
+ if (priv->consider_statfs && loc->inode) {
+ local = quota_local_new ();
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ frame->local = local;
+
+ stub = fop_statfs_stub (frame, quota_statfs_helper, loc, xdata);
+ if (!stub) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ LOCK (&local->lock);
+ {
+ local->inode = inode_ref (loc->inode);
+ local->link_count = 1;
+ local->stub = stub;
+ }
+ UNLOCK (&local->lock);
+
+ ret = quota_validate (frame, local->inode, this,
+ quota_statfs_validate_cbk);
+ if (0 > ret) {
+ quota_handle_validate_error (local, -1, -ret);
+ }
+
+ return 0;
}
+
+ /*
+ * We have to make sure that we never get to quota_statfs_cbk
+ * with a cookie that points to something other than an inode,
+ * which is exactly what would happen with STACK_UNWIND using
+ * that as a callback. Therefore, use default_statfs_cbk in
+ * this case instead.
+ *
+ * Also if the option deem-statfs is not set to "on" don't
+ * bother calculating quota limit on / in statfs_cbk.
+ */
+ if (priv->consider_statfs)
+ gf_log (this->name,GF_LOG_WARNING,
+ "missing inode, cannot adjust for quota");
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->statfs, loc, xdata);
return 0;
-}
+err:
+ STACK_UNWIND_STRICT (statfs, frame, -1, op_errno, NULL, NULL);
+
+ if (local)
+ quota_local_cleanup (this, local);
+ return 0;
+}
int
quota_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, gf_dirent_t *entries,
dict_t *xdata)
{
- gf_dirent_t *entry = NULL;
+ gf_dirent_t *entry = NULL;
+ quota_local_t *local = NULL;
+ loc_t loc = {0, };
if (op_ret <= 0)
goto unwind;
+ local = frame->local;
+
list_for_each_entry (entry, &entries->list, list) {
- /* TODO: fill things */
+ if ((strcmp (entry->d_name, ".") == 0)
+ || (strcmp (entry->d_name, "..") == 0))
+ continue;
+
+ uuid_copy (loc.gfid, entry->d_stat.ia_gfid);
+ loc.inode = inode_ref (entry->inode);
+ loc.parent = inode_ref (local->loc.inode);
+ uuid_copy (loc.pargfid, loc.parent->gfid);
+ loc.name = entry->d_name;
+
+ quota_fill_inodectx (this, entry->inode, entry->dict,
+ &loc, &entry->d_stat, &op_errno);
+
+ loc_wipe (&loc);
}
unwind:
- STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata);
+ QUOTA_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, xdata);
return 0;
}
+
int
quota_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, dict_t *dict)
{
- int ret = 0;
+ quota_priv_t *priv = NULL;
+ int ret = 0;
+ gf_boolean_t new_dict = _gf_false;
+ quota_local_t *local = NULL;
+
+ priv = this->private;
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
+ local = quota_local_new ();
+
+ if (local == NULL) {
+ goto err;
+ }
+
+ frame->local = local;
+
+ local->loc.inode = inode_ref (fd->inode);
+
+ if (dict == NULL) {
+ dict = dict_new ();
+ new_dict = _gf_true;
+ }
if (dict) {
- ret = dict_set_uint64 (dict, QUOTA_SIZE_KEY, 0);
+ ret = dict_set_int8 (dict, QUOTA_LIMIT_KEY, 1);
if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dict set of key for hard-limit failed");
goto err;
}
}
STACK_WIND (frame, quota_readdirp_cbk,
- FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp,
- fd, size, offset, dict);
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdirp, fd,
+ size, offset, dict);
+
+ if (new_dict) {
+ dict_unref (dict);
+ }
+
return 0;
err:
STACK_UNWIND_STRICT (readdirp, frame, -1, EINVAL, NULL, NULL);
+
+ if (new_dict) {
+ dict_unref (dict);
+ }
+
+ return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd,
+ size, offset, dict);
+ return 0;
+}
+
+int32_t
+quota_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ int32_t ret = 0;
+ uint64_t ctx_int = 0;
+ quota_inode_ctx_t *ctx = NULL;
+ quota_local_t *local = NULL;
+
+ local = frame->local;
+
+ if ((op_ret < 0) || (local == NULL)) {
+ goto out;
+ }
+
+ ret = inode_ctx_get (local->loc.inode, this, &ctx_int);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s: failed to get the context", local->loc.path);
+ goto out;
+ }
+
+ ctx = (quota_inode_ctx_t *)(unsigned long) ctx_int;
+
+ if (ctx == NULL) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "quota context not set in %s (gfid:%s)",
+ local->loc.path, uuid_utoa (local->loc.inode->gfid));
+ goto out;
+ }
+
+ LOCK (&ctx->lock);
+ {
+ ctx->buf = *postbuf;
+ }
+ UNLOCK (&ctx->lock);
+
+out:
+ QUOTA_STACK_UNWIND (fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+
+ return 0;
+}
+
+
+int32_t
+quota_fallocate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int32_t mode, off_t offset, size_t len, dict_t *xdata)
+{
+ quota_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+ quota_priv_t *priv = NULL;
+
+ local = frame->local;
+ if (local == NULL) {
+ gf_log (this->name, GF_LOG_WARNING, "local is NULL");
+ goto unwind;
+ }
+
+ priv = this->private;
+
+ if (local->op_ret == -1) {
+ op_errno = local->op_errno;
+ goto unwind;
+ }
+
+ STACK_WIND (frame, quota_fallocate_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+ xdata);
+ return 0;
+
+unwind:
+ QUOTA_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+
+int32_t
+quota_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ int32_t ret = -1, op_errno = EINVAL;
+ int32_t parents = 0;
+ quota_local_t *local = NULL;
+ quota_inode_ctx_t *ctx = NULL;
+ quota_priv_t *priv = NULL;
+ quota_dentry_t *dentry = NULL;
+ call_stub_t *stub = NULL;
+
+ priv = this->private;
+ GF_VALIDATE_OR_GOTO (this->name, priv, unwind);
+
+ WIND_IF_QUOTAOFF (priv->is_quota_on, off);
+
+ GF_ASSERT (frame);
+ GF_VALIDATE_OR_GOTO ("quota", this, unwind);
+ GF_VALIDATE_OR_GOTO (this->name, fd, unwind);
+
+ local = quota_local_new ();
+ if (local == NULL) {
+ goto unwind;
+ }
+
+ frame->local = local;
+ local->loc.inode = inode_ref (fd->inode);
+
+ ret = quota_inode_ctx_get (fd->inode, this, &ctx, 0);
+ if (ctx == NULL) {
+ gf_log (this->name, GF_LOG_DEBUG, "quota context is "
+ "NULL on inode (%s). "
+ "If quota is not enabled recently and crawler "
+ "has finished crawling, its an error",
+ uuid_utoa (local->loc.inode->gfid));
+ }
+
+ stub = fop_fallocate_stub(frame, quota_fallocate_helper, fd, mode,
+ offset, len, xdata);
+ if (stub == NULL) {
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ priv = this->private;
+ GF_VALIDATE_OR_GOTO (this->name, priv, unwind);
+
+ if (ctx != NULL) {
+ LOCK (&ctx->lock);
+ {
+ list_for_each_entry (dentry, &ctx->parents, next) {
+ parents++;
+ }
+ }
+ UNLOCK (&ctx->lock);
+ }
+
+ /*
+ * Note that by using len as the delta we're assuming the range from
+ * offset to offset+len has not already been allocated. This can result
+ * in ENOSPC errors attempting to allocate an already allocated range.
+ */
+ local->delta = len;
+ local->stub = stub;
+ local->link_count = parents;
+
+ if (parents == 0) {
+ local->link_count = 1;
+ quota_check_limit (frame, fd->inode, this, NULL, NULL);
+ } else {
+ list_for_each_entry (dentry, &ctx->parents, next) {
+ quota_check_limit (frame, fd->inode, this, dentry->name,
+ dentry->par);
+ }
+ }
+
+ return 0;
+
+unwind:
+ QUOTA_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+
+off:
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fallocate, fd, mode, offset,
+ len, xdata);
return 0;
}
+/* Logs if
+* i. Usage crossed soft limit
+* ii. Usage above soft limit and alert-time elapsed
+*/
+void
+quota_log_usage (xlator_t *this, quota_inode_ctx_t *ctx, inode_t *inode,
+ int64_t delta)
+{
+ struct timeval cur_time = {0,};
+ char *usage_str = NULL;
+ char size_str[32] = {0};
+ char *path = NULL;
+ int64_t cur_size = 0;
+ quota_priv_t *priv = NULL;
+ gf_boolean_t dyn_mem = _gf_true;
+
+ priv = this->private;
+ if ((ctx->soft_lim <= 0) || (timerisset (&ctx->prev_log) &&
+ !quota_timeout (&ctx->prev_log,
+ priv->log_timeout))) {
+ return;
+ }
+
+
+ cur_size = ctx->size + delta;
+ usage_str = gf_uint64_2human_readable (cur_size);
+ if (!usage_str) {
+ snprintf (size_str, sizeof (size_str), "%"PRId64, cur_size);
+ usage_str = (char*) size_str;
+ dyn_mem = _gf_false;
+ }
+ inode_path (inode, NULL, &path);
+ if (!path)
+ path = uuid_utoa (inode->gfid);
+
+ gettimeofday (&cur_time, NULL);
+ /* Usage crossed/reached soft limit */
+ if (DID_REACH_LIMIT (ctx->soft_lim, ctx->size, cur_size)) {
+
+ gf_log (this->name, GF_LOG_ALERT, "Usage crossed "
+ "soft limit: %s used by %s", usage_str, path);
+ ctx->prev_log = cur_time;
+ }
+ /* Usage is above soft limit */
+ else if (cur_size > ctx->soft_lim){
+ gf_log (this->name, GF_LOG_ALERT, "Usage is above "
+ "soft limit: %s used by %s", usage_str, path);
+ ctx->prev_log = cur_time;
+ }
+
+ if (dyn_mem)
+ GF_FREE (usage_str);
+}
int32_t
mem_acct_init (xlator_t *this)
@@ -3014,80 +4122,6 @@ quota_forget (xlator_t *this, inode_t *inode)
return 0;
}
-
-int
-quota_parse_limits (quota_priv_t *priv, xlator_t *this, dict_t *xl_options,
- struct list_head *old_list)
-{
- int32_t ret = -1;
- char *str = NULL;
- char *str_val = NULL;
- char *path = NULL, *saveptr = NULL;
- uint64_t value = 0;
- limits_t *quota_lim = NULL, *old = NULL;
-
- ret = dict_get_str (xl_options, "limit-set", &str);
-
- if (str) {
- path = strtok_r (str, ":", &saveptr);
-
- while (path) {
- str_val = strtok_r (NULL, ",", &saveptr);
-
- ret = gf_string2bytesize (str_val, &value);
- if (ret != 0)
- goto err;
-
- QUOTA_ALLOC_OR_GOTO (quota_lim, limits_t, err);
-
- quota_lim->path = path;
-
- quota_lim->value = value;
-
- gf_log (this->name, GF_LOG_INFO, "%s:%"PRId64,
- quota_lim->path, quota_lim->value);
-
- if (old_list != NULL) {
- list_for_each_entry (old, old_list,
- limit_list) {
- if (strcmp (old->path, quota_lim->path)
- == 0) {
- uuid_copy (quota_lim->gfid,
- old->gfid);
- break;
- }
- }
- }
-
- LOCK (&priv->lock);
- {
- list_add_tail (&quota_lim->limit_list,
- &priv->limit_head);
- }
- UNLOCK (&priv->lock);
-
- path = strtok_r (NULL, ":", &saveptr);
- }
- } else {
- gf_log (this->name, GF_LOG_INFO,
- "no \"limit-set\" option provided");
- }
-
- LOCK (&priv->lock);
- {
- list_for_each_entry (quota_lim, &priv->limit_head, limit_list) {
- gf_log (this->name, GF_LOG_INFO, "%s:%"PRId64,
- quota_lim->path, quota_lim->value);
- }
- }
- UNLOCK (&priv->lock);
-
- ret = 0;
-err:
- return ret;
-}
-
-
int32_t
init (xlator_t *this)
{
@@ -3109,19 +4143,18 @@ init (xlator_t *this)
QUOTA_ALLOC_OR_GOTO (priv, quota_priv_t, err);
- INIT_LIST_HEAD (&priv->limit_head);
-
LOCK_INIT (&priv->lock);
this->private = priv;
- ret = quota_parse_limits (priv, this, this->options, NULL);
-
- if (ret) {
- goto err;
- }
-
- GF_OPTION_INIT ("timeout", priv->timeout, int64, err);
+ GF_OPTION_INIT ("deem-statfs", priv->consider_statfs, bool, err);
+ GF_OPTION_INIT ("server-quota", priv->is_quota_on, bool, err);
+ GF_OPTION_INIT ("default-soft-limit", priv->default_soft_lim, percent,
+ err);
+ GF_OPTION_INIT ("soft-timeout", priv->soft_timeout, time, err);
+ GF_OPTION_INIT ("hard-timeout", priv->hard_timeout, time, err);
+ GF_OPTION_INIT ("alert-time", priv->log_timeout, time, err);
+ GF_OPTION_INIT ("volume-uuid", priv->volume_uuid, str, err);
this->local_pool = mem_pool_new (quota_local_t, 64);
if (!this->local_pool) {
@@ -3131,132 +4164,103 @@ init (xlator_t *this)
goto err;
}
+ if (priv->is_quota_on) {
+ priv->rpc_clnt = quota_enforcer_init (this, this->options);
+ if (priv->rpc_clnt == NULL) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_WARNING,
+ "quota enforcer rpc init failed");
+ goto err;
+ }
+ }
+
ret = 0;
err:
return ret;
}
-
-void
-__quota_reconfigure_inode_ctx (xlator_t *this, inode_t *inode, limits_t *limit)
+int
+reconfigure (xlator_t *this, dict_t *options)
{
- int ret = -1;
- quota_inode_ctx_t *ctx = NULL;
-
- GF_VALIDATE_OR_GOTO ("quota", this, out);
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
- GF_VALIDATE_OR_GOTO (this->name, limit, out);
-
- ret = quota_inode_ctx_get (inode, limit->value, this, NULL, NULL, &ctx,
- 1);
- if ((ret == -1) || (ctx == NULL)) {
- gf_log (this->name, GF_LOG_WARNING, "cannot create quota "
- "context in inode(gfid:%s)",
- uuid_utoa (inode->gfid));
- goto out;
- }
-
- LOCK (&ctx->lock);
- {
- ctx->limit = limit->value;
- }
- UNLOCK (&ctx->lock);
+ int32_t ret = -1;
+ quota_priv_t *priv = NULL;
+ gf_boolean_t quota_on = _gf_false;
-out:
- return;
-}
-
-
-void
-__quota_reconfigure (xlator_t *this, inode_table_t *itable, limits_t *limit)
-{
- inode_t *inode = NULL;
+ priv = this->private;
- if ((this == NULL) || (itable == NULL) || (limit == NULL)) {
- goto out;
- }
+ GF_OPTION_RECONF ("deem-statfs", priv->consider_statfs, options, bool,
+ out);
+ GF_OPTION_RECONF ("server-quota", quota_on, options, bool,
+ out);
+ GF_OPTION_RECONF ("default-soft-limit", priv->default_soft_lim,
+ options, percent, out);
+ GF_OPTION_RECONF ("alert-time", priv->log_timeout, options,
+ time, out);
+ GF_OPTION_RECONF ("soft-timeout", priv->soft_timeout, options,
+ time, out);
+ GF_OPTION_RECONF ("hard-timeout", priv->hard_timeout, options,
+ time, out);
+
+ if (quota_on) {
+ priv->rpc_clnt = quota_enforcer_init (this,
+ this->options);
+ if (priv->rpc_clnt == NULL) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_WARNING,
+ "quota enforcer rpc init failed");
+ goto out;
+ }
- if (!uuid_is_null (limit->gfid)) {
- inode = inode_find (itable, limit->gfid);
} else {
- inode = inode_resolve (itable, limit->path);
+ if (priv->rpc_clnt) {
+ // Quotad is shutdown when there is no started volume
+ // which has quota enabled. So, we should disable the
+ // enforcer client when quota is disabled on a volume,
+ // to avoid spurious reconnect attempts to a service
+ // (quotad), that is known to be down.
+ rpc_clnt_disable (priv->rpc_clnt);
+ }
}
- if (inode != NULL) {
- __quota_reconfigure_inode_ctx (this, inode, limit);
- }
+ priv->is_quota_on = quota_on;
+ ret = 0;
out:
- return;
+ return ret;
}
-
-int
-reconfigure (xlator_t *this, dict_t *options)
+int32_t
+quota_priv_dump (xlator_t *this)
{
- int32_t ret = -1;
- quota_priv_t *priv = NULL;
- limits_t *limit = NULL, *next = NULL, *new = NULL;
- struct list_head head = {0, };
- xlator_t *top = NULL;
- char found = 0;
+ quota_priv_t *priv = NULL;
+ int32_t ret = -1;
- priv = this->private;
- INIT_LIST_HEAD (&head);
+ GF_ASSERT (this);
- LOCK (&priv->lock);
- {
- list_splice_init (&priv->limit_head, &head);
- }
- UNLOCK (&priv->lock);
-
- ret = quota_parse_limits (priv, this, options, &head);
- if (ret == -1) {
- gf_log ("quota", GF_LOG_WARNING,
- "quota reconfigure failed, "
- "new changes will not take effect");
- goto out;
- }
-
- LOCK (&priv->lock);
- {
- top = ((glusterfs_ctx_t *)this->ctx)->active->top;
- GF_ASSERT (top);
-
- list_for_each_entry (limit, &priv->limit_head, limit_list) {
- __quota_reconfigure (this, top->itable, limit);
- }
-
- list_for_each_entry_safe (limit, next, &head, limit_list) {
- found = 0;
- list_for_each_entry (new, &priv->limit_head,
- limit_list) {
- if (strcmp (new->path, limit->path) == 0) {
- found = 1;
- break;
- }
- }
-
- if (!found) {
- limit->value = -1;
- __quota_reconfigure (this, top->itable, limit);
- }
+ priv = this->private;
- list_del_init (&limit->limit_list);
- GF_FREE (limit);
- }
+ gf_proc_dump_add_section ("xlators.features.quota.priv", this->name);
+
+ ret = TRY_LOCK (&priv->lock);
+ if (ret)
+ goto out;
+ else {
+ gf_proc_dump_write("soft-timeout", "%d", priv->soft_timeout);
+ gf_proc_dump_write("hard-timeout", "%d", priv->hard_timeout);
+ gf_proc_dump_write("alert-time", "%d", priv->log_timeout);
+ gf_proc_dump_write("quota-on", "%d", priv->is_quota_on);
+ gf_proc_dump_write("statfs", "%d", priv->consider_statfs);
+ gf_proc_dump_write("volume-uuid", "%s", priv->volume_uuid);
+ gf_proc_dump_write("validation-count", "%ld",
+ priv->validation_count);
}
UNLOCK (&priv->lock);
- GF_OPTION_RECONF ("timeout", priv->timeout, options, int64, out);
-
- ret = 0;
out:
- return ret;
+ return 0;
}
-
void
fini (xlator_t *this)
{
@@ -3291,21 +4295,79 @@ struct xlator_fops fops = {
.removexattr = quota_removexattr,
.fremovexattr = quota_fremovexattr,
.readdirp = quota_readdirp,
+ .fallocate = quota_fallocate,
};
struct xlator_cbks cbks = {
.forget = quota_forget
};
+struct xlator_dumpops dumpops = {
+ .priv = quota_priv_dump,
+};
struct volume_options options[] = {
{.key = {"limit-set"}},
- {.key = {"timeout"},
- .type = GF_OPTION_TYPE_SIZET,
+ {.key = {"deem-statfs"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "If set to on, it takes quota limits into"
+ "consideration while estimating fs size. (df command)"
+ " (Default is off)."
+ },
+ {.key = {"server-quota"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Skip the quota enforcement if the feature is"
+ " not turned on. This is not a user exposed option."
+ },
+ {.key = {"default-soft-limit"},
+ .type = GF_OPTION_TYPE_PERCENT,
+ .default_value = "80%",
+ },
+ {.key = {"soft-timeout"},
+ .type = GF_OPTION_TYPE_TIME,
+ .min = 0,
+ .max = 1800,
+ .default_value = "60",
+ .description = "quota caches the directory sizes on client. "
+ "soft-timeout indicates the timeout for the validity of"
+ " cache before soft-limit has been crossed."
+ },
+ {.key = {"hard-timeout"},
+ .type = GF_OPTION_TYPE_TIME,
.min = 0,
.max = 60,
- .default_value = "0",
- .description = "quota caches the directory sizes on client. Timeout "
- "indicates the timeout for the cache to be revalidated."
+ .default_value = "5",
+ .description = "quota caches the directory sizes on client. "
+ "hard-timeout indicates the timeout for the validity of"
+ " cache after soft-limit has been crossed."
+ },
+ { .key = {"username"},
+ .type = GF_OPTION_TYPE_ANY,
+ },
+ { .key = {"password"},
+ .type = GF_OPTION_TYPE_ANY,
+ },
+ { .key = {"transport-type"},
+ .value = {"tcp", "socket", "ib-verbs", "unix", "ib-sdp",
+ "tcp/client", "ib-verbs/client", "rdma"},
+ .type = GF_OPTION_TYPE_STR,
+ },
+ { .key = {"remote-host"},
+ .type = GF_OPTION_TYPE_INTERNET_ADDRESS,
+ },
+ { .key = {"remote-port"},
+ .type = GF_OPTION_TYPE_INT,
+ },
+ { .key = {"volume-uuid"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "uuid of the volume this brick is part of."
+ },
+ { .key = {"alert-time"},
+ .type = GF_OPTION_TYPE_TIME,
+ .min = 0,
+ .max = 7*86400,
+ .default_value = "86400",
},
{.key = {NULL}}
};
diff --git a/xlators/features/quota/src/quota.h b/xlators/features/quota/src/quota.h
index 2b4e9a015..84c3257fe 100644
--- a/xlators/features/quota/src/quota.h
+++ b/xlators/features/quota/src/quota.h
@@ -12,20 +12,48 @@
#include "config.h"
#endif
+#ifndef _QUOTA_H
+#define _QUOTA_H
+
#include "xlator.h"
#include "call-stub.h"
#include "defaults.h"
-#include "byte-order.h"
#include "common-utils.h"
#include "quota-mem-types.h"
+#include "glusterfs.h"
+#include "compat.h"
+#include "logging.h"
+#include "dict.h"
+#include "stack.h"
+#include "common-utils.h"
+#include "event.h"
+#include "globals.h"
+#include "rpcsvc.h"
+#include "rpc-clnt.h"
+#include "byte-order.h"
+#include "glusterfs3-xdr.h"
+#include "glusterfs3.h"
+#include "xdr-generic.h"
+#include "compat-errno.h"
+#include "protocol-common.h"
-#define QUOTA_XATTR_PREFIX "trusted."
#define DIRTY "dirty"
#define SIZE "size"
#define CONTRIBUTION "contri"
#define VAL_LENGTH 8
#define READDIR_BUF 4096
+#ifndef UUID_CANONICAL_FORM_LEN
+#define UUID_CANONICAL_FORM_LEN 36
+#endif
+
+#define WIND_IF_QUOTAOFF(is_quota_on, label) \
+ if (!is_quota_on) \
+ goto label;
+
+#define DID_REACH_LIMIT(lim, prev_size, cur_size) \
+ ((cur_size) >= (lim) && (prev_size) < (lim))
+
#define QUOTA_SAFE_INCREMENT(lock, var) \
do { \
LOCK (lock); \
@@ -46,7 +74,7 @@
gf_quota_mt_##type); \
if (!var) { \
gf_log ("", GF_LOG_ERROR, \
- "out of memory :("); \
+ "out of memory"); \
ret = -1; \
goto label; \
} \
@@ -74,11 +102,17 @@
#define GET_CONTRI_KEY(var, _vol_name, _gfid, _ret) \
do { \
char _gfid_unparsed[40]; \
- uuid_unparse (_gfid, _gfid_unparsed); \
- _ret = gf_asprintf (var, QUOTA_XATTR_PREFIX \
- "%s.%s." CONTRIBUTION, \
- _vol_name, _gfid_unparsed); \
- } while (0)
+ if (_gfid != NULL) { \
+ uuid_unparse (_gfid, _gfid_unparsed); \
+ _ret = gf_asprintf (var, QUOTA_XATTR_PREFIX \
+ "%s.%s." CONTRIBUTION, \
+ _vol_name, _gfid_unparsed); \
+ } else { \
+ _ret = gf_asprintf (var, QUOTA_XATTR_PREFIX \
+ "%s.." CONTRIBUTION, \
+ _vol_name); \
+ } \
+ } while (0)
#define GET_CONTRI_KEY_OR_GOTO(var, _vol_name, _gfid, label) \
@@ -96,6 +130,11 @@
goto label; \
} while (0)
+#define QUOTA_REG_OR_LNK_FILE(ia_type) \
+ (IA_ISREG (ia_type) || IA_ISLNK (ia_type))
+
+
+
struct quota_dentry {
char *name;
uuid_t par;
@@ -105,46 +144,77 @@ typedef struct quota_dentry quota_dentry_t;
struct quota_inode_ctx {
int64_t size;
- int64_t limit;
+ int64_t hard_lim;
+ int64_t soft_lim;
struct iatt buf;
struct list_head parents;
struct timeval tv;
+ struct timeval prev_log;
gf_lock_t lock;
};
typedef struct quota_inode_ctx quota_inode_ctx_t;
+struct quota_limit {
+ int64_t hard_lim;
+ int64_t soft_lim_percent;
+} __attribute__ ((packed));
+typedef struct quota_limit quota_limit_t;
+
+typedef void
+(*quota_ancestry_built_t) (struct list_head *parents, inode_t *inode,
+ int32_t op_ret, int32_t op_errno, void *data);
+
struct quota_local {
- gf_lock_t lock;
- uint32_t validate_count;
- uint32_t link_count;
- loc_t loc;
- loc_t oldloc;
- loc_t newloc;
- loc_t validate_loc;
- int64_t delta;
- int32_t op_ret;
- int32_t op_errno;
- int64_t size;
- int64_t limit;
- char just_validated;
- inode_t *inode;
- call_stub_t *stub;
+ gf_lock_t lock;
+ uint32_t validate_count;
+ uint32_t link_count;
+ loc_t loc;
+ loc_t oldloc;
+ loc_t newloc;
+ loc_t validate_loc;
+ int64_t delta;
+ int32_t op_ret;
+ int32_t op_errno;
+ int64_t size;
+ gf_boolean_t skip_check;
+ char just_validated;
+ fop_lookup_cbk_t validate_cbk;
+ inode_t *inode;
+ call_stub_t *stub;
+ struct iobref *iobref;
+ quota_limit_t limit;
+ int64_t space_available;
+ quota_ancestry_built_t ancestry_cbk;
+ void *ancestry_data;
};
-typedef struct quota_local quota_local_t;
+typedef struct quota_local quota_local_t;
struct quota_priv {
- int64_t timeout;
- struct list_head limit_head;
- gf_lock_t lock;
+ uint32_t soft_timeout;
+ uint32_t hard_timeout;
+ uint32_t log_timeout;
+ double default_soft_lim;
+ gf_boolean_t is_quota_on;
+ gf_boolean_t consider_statfs;
+ gf_lock_t lock;
+ rpc_clnt_prog_t *quota_enforcer;
+ struct rpcsvc_program *quotad_aggregator;
+ struct rpc_clnt *rpc_clnt;
+ rpcsvc_t *rpcsvc;
+ inode_table_t *itable;
+ char *volume_uuid;
+ uint64_t validation_count;
};
-typedef struct quota_priv quota_priv_t;
+typedef struct quota_priv quota_priv_t;
-struct limits {
- struct list_head limit_list;
- char *path;
- int64_t value;
- uuid_t gfid;
-};
-typedef struct limits limits_t;
+int
+quota_enforcer_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata, fop_lookup_cbk_t cbk);
+struct rpc_clnt *
+quota_enforcer_init (xlator_t *this, dict_t *options);
-uint64_t cn = 1;
+void
+quota_log_usage (xlator_t *this, quota_inode_ctx_t *ctx, inode_t *inode,
+ int64_t delta);
+
+#endif
diff --git a/xlators/features/quota/src/quotad-aggregator.c b/xlators/features/quota/src/quotad-aggregator.c
new file mode 100644
index 000000000..5f13fd251
--- /dev/null
+++ b/xlators/features/quota/src/quotad-aggregator.c
@@ -0,0 +1,423 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "cli1-xdr.h"
+#include "quota.h"
+#include "quotad-helpers.h"
+#include "quotad-aggregator.h"
+
+struct rpcsvc_program quotad_aggregator_prog;
+
+struct iobuf *
+quotad_serialize_reply (rpcsvc_request_t *req, void *arg, struct iovec *outmsg,
+ xdrproc_t xdrproc)
+{
+ struct iobuf *iob = NULL;
+ ssize_t retlen = 0;
+ ssize_t xdr_size = 0;
+
+ GF_VALIDATE_OR_GOTO ("server", req, ret);
+
+ /* First, get the io buffer into which the reply in arg will
+ * be serialized.
+ */
+ if (arg && xdrproc) {
+ xdr_size = xdr_sizeof (xdrproc, arg);
+ iob = iobuf_get2 (req->svc->ctx->iobuf_pool, xdr_size);
+ if (!iob) {
+ gf_log_callingfn (THIS->name, GF_LOG_ERROR,
+ "Failed to get iobuf");
+ goto ret;
+ };
+
+ iobuf_to_iovec (iob, outmsg);
+ /* Use the given serializer to translate the give C structure in arg
+ * to XDR format which will be written into the buffer in outmsg.
+ */
+ /* retlen is used to received the error since size_t is unsigned and we
+ * need -1 for error notification during encoding.
+ */
+
+ retlen = xdr_serialize_generic (*outmsg, arg, xdrproc);
+ if (retlen == -1) {
+ /* Failed to Encode 'GlusterFS' msg in RPC is not exactly
+ failure of RPC return values.. client should get
+ notified about this, so there are no missing frames */
+ gf_log_callingfn ("", GF_LOG_ERROR, "Failed to encode message");
+ req->rpc_err = GARBAGE_ARGS;
+ retlen = 0;
+ }
+ }
+ outmsg->iov_len = retlen;
+ret:
+ if (retlen == -1) {
+ iobuf_unref (iob);
+ iob = NULL;
+ }
+
+ return iob;
+}
+
+int
+quotad_aggregator_submit_reply (call_frame_t *frame, rpcsvc_request_t *req,
+ void *arg, struct iovec *payload,
+ int payloadcount, struct iobref *iobref,
+ xdrproc_t xdrproc)
+{
+ struct iobuf *iob = NULL;
+ int ret = -1;
+ struct iovec rsp = {0,};
+ quotad_aggregator_state_t *state = NULL;
+ char new_iobref = 0;
+
+ GF_VALIDATE_OR_GOTO ("server", req, ret);
+
+ if (frame) {
+ state = frame->root->state;
+ frame->local = NULL;
+ }
+
+ if (!iobref) {
+ iobref = iobref_new ();
+ if (!iobref) {
+ goto ret;
+ }
+
+ new_iobref = 1;
+ }
+
+ iob = quotad_serialize_reply (req, arg, &rsp, xdrproc);
+ if (!iob) {
+ gf_log ("", GF_LOG_ERROR, "Failed to serialize reply");
+ goto ret;
+ }
+
+ iobref_add (iobref, iob);
+
+ ret = rpcsvc_submit_generic (req, &rsp, 1, payload, payloadcount,
+ iobref);
+
+ iobuf_unref (iob);
+
+ ret = 0;
+ret:
+ if (state) {
+ quotad_aggregator_free_state (state);
+ }
+
+ if (frame) {
+ if (frame->root->client)
+ gf_client_unref (frame->root->client);
+
+ STACK_DESTROY (frame->root);
+ }
+
+ if (new_iobref) {
+ iobref_unref (iobref);
+ }
+
+ return ret;
+}
+
+int
+quotad_aggregator_getlimit_cbk (xlator_t *this, call_frame_t *frame,
+ void *lookup_rsp)
+{
+ gfs3_lookup_rsp *rsp = lookup_rsp;
+ gf_cli_rsp cli_rsp = {0,};
+ dict_t *xdata = NULL;
+ int ret = -1;
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->this, xdata,
+ (rsp->xdata.xdata_val),
+ (rsp->xdata.xdata_len), rsp->op_ret,
+ rsp->op_errno, out);
+
+ ret = 0;
+out:
+ rsp->op_ret = ret;
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to unserialize "
+ "nameless lookup rsp");
+ goto reply;
+ }
+ cli_rsp.op_ret = rsp->op_ret;
+ cli_rsp.op_errno = rsp->op_errno;
+ cli_rsp.op_errstr = "";
+ if (xdata) {
+ GF_PROTOCOL_DICT_SERIALIZE (frame->this, xdata,
+ (&cli_rsp.dict.dict_val),
+ (cli_rsp.dict.dict_len),
+ cli_rsp.op_errno, reply);
+ }
+
+reply:
+ quotad_aggregator_submit_reply (frame, frame->local, (void*)&cli_rsp, NULL, 0,
+ NULL, (xdrproc_t)xdr_gf_cli_rsp);
+
+ dict_unref (xdata);
+ GF_FREE (cli_rsp.dict.dict_val);
+ return 0;
+}
+
+int
+quotad_aggregator_getlimit (rpcsvc_request_t *req)
+{
+ call_frame_t *frame = NULL;
+ gf_cli_req cli_req = {{0}, };
+ gf_cli_rsp cli_rsp = {0};
+ gfs3_lookup_req args = {{0,},};
+ gfs3_lookup_rsp rsp = {0,};
+ quotad_aggregator_state_t *state = NULL;
+ xlator_t *this = NULL;
+ dict_t *dict = NULL;
+ int ret = -1, op_errno = 0;
+ char *gfid_str = NULL;
+ uuid_t gfid = {0};
+
+ GF_VALIDATE_OR_GOTO ("quotad-aggregator", req, err);
+
+ this = THIS;
+
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
+ //failed to decode msg;
+ gf_log ("", GF_LOG_ERROR, "xdr decoding error");
+ req->rpc_err = GARBAGE_ARGS;
+ goto err;
+ }
+
+ if (cli_req.dict.dict_len) {
+ dict = dict_new ();
+ ret = dict_unserialize (cli_req.dict.dict_val,
+ cli_req.dict.dict_len, &dict);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "unserialize req-buffer to dictionary");
+ goto err;
+ }
+ }
+
+ ret = dict_get_str (dict, "gfid", &gfid_str);
+ if (ret) {
+ goto err;
+ }
+
+ uuid_parse ((const char*)gfid_str, gfid);
+
+ frame = quotad_aggregator_get_frame_from_req (req);
+ if (frame == NULL) {
+ rsp.op_errno = ENOMEM;
+ goto err;
+ }
+ state = frame->root->state;
+ state->xdata = dict;
+ ret = dict_set_int32 (state->xdata, QUOTA_LIMIT_KEY, 42);
+ if (ret)
+ goto err;
+
+ ret = dict_set_int32 (state->xdata, QUOTA_SIZE_KEY, 42);
+ if (ret)
+ goto err;
+
+ ret = dict_set_int32 (state->xdata, GET_ANCESTRY_PATH_KEY, 42);
+ if (ret)
+ goto err;
+
+ memcpy (&args.gfid, &gfid, 16);
+
+ args.bname = alloca (req->msg[0].iov_len);
+ args.xdata.xdata_val = alloca (req->msg[0].iov_len);
+
+ ret = qd_nameless_lookup (this, frame, &args, state->xdata,
+ quotad_aggregator_getlimit_cbk);
+ if (ret) {
+ rsp.op_errno = ret;
+ goto err;
+ }
+
+ return ret;
+
+err:
+ cli_rsp.op_ret = -1;
+ cli_rsp.op_errno = op_errno;
+ cli_rsp.op_errstr = "";
+
+ quotad_aggregator_getlimit_cbk (this, frame, &cli_rsp);
+ dict_unref (dict);
+
+ return ret;
+}
+
+int
+quotad_aggregator_lookup_cbk (xlator_t *this, call_frame_t *frame,
+ void *rsp)
+{
+ quotad_aggregator_submit_reply (frame, frame->local, rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gfs3_lookup_rsp);
+
+ return 0;
+}
+
+
+int
+quotad_aggregator_lookup (rpcsvc_request_t *req)
+{
+ call_frame_t *frame = NULL;
+ gfs3_lookup_req args = {{0,},};
+ int ret = -1, op_errno = 0;
+ gfs3_lookup_rsp rsp = {0,};
+ quotad_aggregator_state_t *state = NULL;
+ xlator_t *this = NULL;
+
+ GF_VALIDATE_OR_GOTO ("quotad-aggregator", req, err);
+
+ this = THIS;
+
+ args.bname = alloca (req->msg[0].iov_len);
+ args.xdata.xdata_val = alloca (req->msg[0].iov_len);
+
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_lookup_req);
+ if (ret < 0) {
+ rsp.op_errno = EINVAL;
+ goto err;
+ }
+
+ frame = quotad_aggregator_get_frame_from_req (req);
+ if (frame == NULL) {
+ rsp.op_errno = ENOMEM;
+ goto err;
+ }
+
+ state = frame->root->state;
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, state->xdata,
+ (args.xdata.xdata_val),
+ (args.xdata.xdata_len), ret,
+ op_errno, err);
+
+
+ ret = qd_nameless_lookup (this, frame, &args, state->xdata,
+ quotad_aggregator_lookup_cbk);
+ if (ret) {
+ rsp.op_errno = ret;
+ goto err;
+ }
+
+ return ret;
+
+err:
+ rsp.op_ret = -1;
+ rsp.op_errno = op_errno;
+
+ quotad_aggregator_lookup_cbk (this, frame, &rsp);
+ return ret;
+}
+
+int
+quotad_aggregator_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
+ void *data)
+{
+ if (!xl || !data) {
+ gf_log_callingfn ("server", GF_LOG_WARNING,
+ "Calling rpc_notify without initializing");
+ goto out;
+ }
+
+ switch (event) {
+ case RPCSVC_EVENT_ACCEPT:
+ break;
+
+ case RPCSVC_EVENT_DISCONNECT:
+ break;
+
+ default:
+ break;
+ }
+
+out:
+ return 0;
+}
+
+int
+quotad_aggregator_init (xlator_t *this)
+{
+ quota_priv_t *priv = NULL;
+ int ret = -1;
+
+ priv = this->private;
+
+ ret = dict_set_str (this->options, "transport.address-family", "unix");
+ if (ret)
+ goto out;
+
+ ret = dict_set_str (this->options, "transport-type", "socket");
+ if (ret)
+ goto out;
+
+ ret = dict_set_str (this->options, "transport.socket.listen-path",
+ "/tmp/quotad.socket");
+ if (ret)
+ goto out;
+
+ /* RPC related */
+ priv->rpcsvc = rpcsvc_init (this, this->ctx, this->options, 0);
+ if (priv->rpcsvc == NULL) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "creation of rpcsvc failed");
+ ret = -1;
+ goto out;
+ }
+
+ ret = rpcsvc_create_listeners (priv->rpcsvc, this->options,
+ this->name);
+ if (ret < 1) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "creation of listener failed");
+ ret = -1;
+ goto out;
+ }
+
+ priv->quotad_aggregator = &quotad_aggregator_prog;
+ quotad_aggregator_prog.options = this->options;
+
+ ret = rpcsvc_program_register (priv->rpcsvc, &quotad_aggregator_prog);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "registration of program (name:%s, prognum:%d, "
+ "progver:%d) failed", quotad_aggregator_prog.progname,
+ quotad_aggregator_prog.prognum,
+ quotad_aggregator_prog.progver);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+rpcsvc_actor_t quotad_aggregator_actors[] = {
+ [GF_AGGREGATOR_NULL] = {"NULL", GF_AGGREGATOR_NULL, NULL, NULL, 0,
+ DRC_NA},
+ [GF_AGGREGATOR_LOOKUP] = {"LOOKUP", GF_AGGREGATOR_NULL,
+ quotad_aggregator_lookup, NULL, 0, DRC_NA},
+ [GF_AGGREGATOR_GETLIMIT] = {"GETLIMIT", GF_AGGREGATOR_GETLIMIT,
+ quotad_aggregator_getlimit, NULL, 0},
+};
+
+
+struct rpcsvc_program quotad_aggregator_prog = {
+ .progname = "GlusterFS 3.3",
+ .prognum = GLUSTER_AGGREGATOR_PROGRAM,
+ .progver = GLUSTER_AGGREGATOR_VERSION,
+ .numactors = GF_AGGREGATOR_MAXVALUE,
+ .actors = quotad_aggregator_actors
+};
diff --git a/xlators/features/quota/src/quotad-aggregator.h b/xlators/features/quota/src/quotad-aggregator.h
new file mode 100644
index 000000000..5ddea5b3c
--- /dev/null
+++ b/xlators/features/quota/src/quotad-aggregator.h
@@ -0,0 +1,37 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _QUOTAD_AGGREGATOR_H
+#define _QUOTAD_AGGREGATOR_H
+
+#include "quota.h"
+#include "stack.h"
+#include "glusterfs3-xdr.h"
+#include "inode.h"
+
+typedef struct {
+ void *pool;
+ xlator_t *this;
+ xlator_t *active_subvol;
+ inode_table_t *itable;
+ loc_t loc;
+ dict_t *xdata;
+} quotad_aggregator_state_t;
+
+typedef int (*quotad_aggregator_lookup_cbk_t) (xlator_t *this,
+ call_frame_t *frame,
+ void *rsp);
+int
+qd_nameless_lookup (xlator_t *this, call_frame_t *frame, gfs3_lookup_req *req,
+ dict_t *xdata, quotad_aggregator_lookup_cbk_t lookup_cbk);
+int
+quotad_aggregator_init (xlator_t *this);
+
+#endif
diff --git a/xlators/features/quota/src/quotad-helpers.c b/xlators/features/quota/src/quotad-helpers.c
new file mode 100644
index 000000000..fd3099114
--- /dev/null
+++ b/xlators/features/quota/src/quotad-helpers.c
@@ -0,0 +1,113 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "quotad-helpers.h"
+
+quotad_aggregator_state_t *
+get_quotad_aggregator_state (xlator_t *this, rpcsvc_request_t *req)
+{
+ quotad_aggregator_state_t *state = NULL;
+ xlator_t *active_subvol = NULL;
+ quota_priv_t *priv = NULL;
+
+ state = (void *)GF_CALLOC (1, sizeof (*state),
+ gf_quota_mt_aggregator_state_t);
+ if (!state)
+ return NULL;
+
+ state->this = THIS;
+ priv = this->private;
+
+ LOCK (&priv->lock);
+ {
+ active_subvol = state->active_subvol = FIRST_CHILD (this);
+ }
+ UNLOCK (&priv->lock);
+
+ if (active_subvol->itable == NULL)
+ active_subvol->itable = inode_table_new (4096, active_subvol);
+
+ state->itable = active_subvol->itable;
+
+ state->pool = this->ctx->pool;
+
+ return state;
+}
+
+void
+quotad_aggregator_free_state (quotad_aggregator_state_t *state)
+{
+ if (state->xdata)
+ dict_unref (state->xdata);
+
+ GF_FREE (state);
+}
+
+call_frame_t *
+quotad_aggregator_alloc_frame (rpcsvc_request_t *req)
+{
+ call_frame_t *frame = NULL;
+ quotad_aggregator_state_t *state = NULL;
+ xlator_t *this = NULL;
+
+ GF_VALIDATE_OR_GOTO ("server", req, out);
+ GF_VALIDATE_OR_GOTO ("server", req->trans, out);
+ GF_VALIDATE_OR_GOTO ("server", req->svc, out);
+ GF_VALIDATE_OR_GOTO ("server", req->svc->ctx, out);
+
+ this = req->svc->mydata;
+
+ frame = create_frame (this, req->svc->ctx->pool);
+ if (!frame)
+ goto out;
+
+ state = get_quotad_aggregator_state (this, req);
+ if (!state)
+ goto out;
+
+ frame->root->state = state;
+ frame->root->unique = 0;
+
+ frame->this = this;
+out:
+ return frame;
+}
+
+call_frame_t *
+quotad_aggregator_get_frame_from_req (rpcsvc_request_t *req)
+{
+ call_frame_t *frame = NULL;
+ client_t *client = NULL;
+
+ GF_VALIDATE_OR_GOTO ("server", req, out);
+
+ frame = quotad_aggregator_alloc_frame (req);
+ if (!frame)
+ goto out;
+
+ client = req->trans->xl_private;
+
+ frame->root->op = req->procnum;
+
+ frame->root->unique = req->xid;
+
+ frame->root->uid = req->uid;
+ frame->root->gid = req->gid;
+ frame->root->pid = req->pid;
+
+ gf_client_ref (client);
+ frame->root->client = client;
+
+ frame->root->lk_owner = req->lk_owner;
+
+ frame->local = req;
+out:
+ return frame;
+}
diff --git a/xlators/features/quota/src/quotad-helpers.h b/xlators/features/quota/src/quotad-helpers.h
new file mode 100644
index 000000000..a10fb7fa8
--- /dev/null
+++ b/xlators/features/quota/src/quotad-helpers.h
@@ -0,0 +1,24 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef QUOTAD_HELPERS_H
+#define QUOTAD_HELPERS_H
+
+#include "rpcsvc.h"
+#include "quota.h"
+#include "quotad-aggregator.h"
+
+void
+quotad_aggregator_free_state (quotad_aggregator_state_t *state);
+
+call_frame_t *
+quotad_aggregator_get_frame_from_req (rpcsvc_request_t *req);
+
+#endif
diff --git a/xlators/features/quota/src/quotad.c b/xlators/features/quota/src/quotad.c
new file mode 100644
index 000000000..243b943e9
--- /dev/null
+++ b/xlators/features/quota/src/quotad.c
@@ -0,0 +1,210 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#include "quota.h"
+#include "quotad-aggregator.h"
+#include "common-utils.h"
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init (this, gf_quota_mt_end + 1);
+
+ if (0 != ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Memory accounting "
+ "init failed");
+ return ret;
+ }
+
+ return ret;
+}
+
+int32_t
+qd_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ quotad_aggregator_lookup_cbk_t lookup_cbk = NULL;
+ gfs3_lookup_rsp rsp = {0, };
+
+ lookup_cbk = cookie;
+
+ rsp.op_ret = op_ret;
+ rsp.op_errno = op_errno;
+
+ gf_stat_from_iatt (&rsp.postparent, postparent);
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ rsp.xdata.xdata_len, rsp.op_errno, out);
+
+ gf_stat_from_iatt (&rsp.stat, buf);
+
+out:
+ lookup_cbk (this, frame, &rsp);
+
+ GF_FREE (rsp.xdata.xdata_val);
+
+ inode_unref (inode);
+
+ return 0;
+}
+
+xlator_t *
+qd_find_subvol (xlator_t *this, char *volume_uuid)
+{
+ xlator_list_t *child = NULL;
+ xlator_t *subvol = NULL;
+ char key[1024];
+ char *optstr = NULL;
+
+ if (!this || !volume_uuid)
+ goto out;
+
+ for (child = this->children; child; child = child->next) {
+ snprintf(key, 1024, "%s.volume-id", child->xlator->name);
+ if (dict_get_str(this->options, key, &optstr) < 0)
+ continue;
+
+ if (strcmp (optstr, volume_uuid) == 0) {
+ subvol = child->xlator;
+ break;
+ }
+ }
+
+out:
+ return subvol;
+}
+
+int
+qd_nameless_lookup (xlator_t *this, call_frame_t *frame, gfs3_lookup_req *req,
+ dict_t *xdata, quotad_aggregator_lookup_cbk_t lookup_cbk)
+{
+ gfs3_lookup_rsp rsp = {0, };
+ int op_errno = 0, ret = -1;
+ loc_t loc = {0, };
+ quotad_aggregator_state_t *state = NULL;
+ quota_priv_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ char *volume_uuid = NULL;
+
+ priv = this->private;
+ state = frame->root->state;
+
+ frame->root->op = GF_FOP_LOOKUP;
+
+ loc.inode = inode_new (state->itable);
+ if (loc.inode == NULL) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ memcpy (loc.gfid, req->gfid, 16);
+
+ ret = dict_get_str (xdata, "volume-uuid", &volume_uuid);
+ if (ret < 0) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ subvol = qd_find_subvol (this, volume_uuid);
+ if (subvol == NULL) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ STACK_WIND_COOKIE (frame, qd_lookup_cbk, lookup_cbk, subvol,
+ subvol->fops->lookup, &loc, xdata);
+ return 0;
+
+out:
+ rsp.op_ret = -1;
+ rsp.op_errno = op_errno;
+
+ lookup_cbk (this, frame, &rsp);
+
+ inode_unref (loc.inode);
+ return 0;
+}
+
+int
+qd_reconfigure (xlator_t *this, dict_t *options)
+{
+ /* As of now quotad is restarted upon alteration of volfile */
+ return 0;
+}
+
+void
+qd_fini (xlator_t *this)
+{
+ return;
+}
+
+int32_t
+qd_init (xlator_t *this)
+{
+ int32_t ret = -1;
+ quota_priv_t *priv = NULL;
+
+ if (NULL == this->children) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "FATAL: quota (%s) not configured for min of 1 child",
+ this->name);
+ ret = -1;
+ goto err;
+ }
+
+ QUOTA_ALLOC_OR_GOTO (priv, quota_priv_t, err);
+ LOCK_INIT (&priv->lock);
+
+ this->private = priv;
+
+ ret = quotad_aggregator_init (this);
+ if (ret < 0)
+ goto err;
+
+ ret = 0;
+err:
+ if (ret) {
+ GF_FREE (priv);
+ }
+ return ret;
+}
+
+class_methods_t class_methods = {
+ .init = qd_init,
+ .fini = qd_fini,
+ .reconfigure = qd_reconfigure,
+};
+
+struct xlator_fops fops = {
+};
+
+struct xlator_cbks cbks = {
+};
+
+struct volume_options options[] = {
+ { .key = {"transport-type"},
+ .value = {"rpc", "rpc-over-rdma", "tcp", "socket", "ib-verbs",
+ "unix", "ib-sdp", "tcp/server", "ib-verbs/server", "rdma",
+ "rdma*([ \t]),*([ \t])socket",
+ "rdma*([ \t]),*([ \t])tcp",
+ "tcp*([ \t]),*([ \t])rdma",
+ "socket*([ \t]),*([ \t])rdma"},
+ .type = GF_OPTION_TYPE_STR
+ },
+ { .key = {"transport.*"},
+ .type = GF_OPTION_TYPE_ANY,
+ },
+ {.key = {NULL}}
+};
diff --git a/xlators/features/read-only/src/Makefile.am b/xlators/features/read-only/src/Makefile.am
index aa4112282..4c1462137 100644
--- a/xlators/features/read-only/src/Makefile.am
+++ b/xlators/features/read-only/src/Makefile.am
@@ -4,12 +4,12 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
noinst_HEADERS = read-only-common.h
-read_only_la_LDFLAGS = -module -avoidversion
+read_only_la_LDFLAGS = -module -avoid-version
read_only_la_SOURCES = read-only.c read-only-common.c
read_only_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-worm_la_LDFLAGS = -module -avoidversion
+worm_la_LDFLAGS = -module -avoid-version
worm_la_SOURCES = read-only-common.c worm.c
worm_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/features/read-only/src/worm.c b/xlators/features/read-only/src/worm.c
index 34e7d6643..16c3eb3da 100644
--- a/xlators/features/read-only/src/worm.c
+++ b/xlators/features/read-only/src/worm.c
@@ -81,8 +81,7 @@ struct xlator_fops fops = {
.lk = ro_lk,
};
-struct xlator_cbks cbks = {
-};
+struct xlator_cbks cbks;
struct volume_options options[] = {
{ .key = {NULL} },
diff --git a/xlators/features/trash/src/Makefile.am b/xlators/features/trash/src/Makefile.am
index 4541e696d..5251eb082 100644
--- a/xlators/features/trash/src/Makefile.am
+++ b/xlators/features/trash/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = trash.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/features
-trash_la_LDFLAGS = -module -avoidversion
+trash_la_LDFLAGS = -module -avoid-version
trash_la_SOURCES = trash.c
trash_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/lib/src/libxlator.c b/xlators/lib/src/libxlator.c
index 624a929d0..4e680c510 100644
--- a/xlators/lib/src/libxlator.c
+++ b/xlators/lib/src/libxlator.c
@@ -11,6 +11,34 @@
#include "libxlator.h"
+int marker_xtime_default_gauge[] = {
+ [MCNT_FOUND] = 1,
+ [MCNT_NOTFOUND] = -1,
+ [MCNT_ENODATA] = -1,
+ [MCNT_ENOTCONN] = -1,
+ [MCNT_ENOENT] = -1,
+ [MCNT_EOTHER] = -1,
+};
+
+int marker_uuid_default_gauge[] = {
+ [MCNT_FOUND] = 1,
+ [MCNT_NOTFOUND] = 0,
+ [MCNT_ENODATA] = 0,
+ [MCNT_ENOTCONN] = 0,
+ [MCNT_ENOENT] = 0,
+ [MCNT_EOTHER] = 0,
+};
+
+static int marker_idx_errno_map[] = {
+ [MCNT_FOUND] = EINVAL,
+ [MCNT_NOTFOUND] = EINVAL,
+ [MCNT_ENOENT] = ENOENT,
+ [MCNT_ENOTCONN] = ENOTCONN,
+ [MCNT_ENODATA] = ENODATA,
+ [MCNT_EOTHER] = EINVAL,
+ [MCNT_MAX] = 0,
+};
+
/*Copy the contents of oldtimebuf to newtimbuf*/
static void
update_timebuf (uint32_t *oldtimbuf, uint32_t *newtimebuf)
@@ -47,22 +75,61 @@ match_uuid_local (const char *name, char *uuid)
static void
marker_local_incr_errcount (xl_marker_local_t *local, int op_errno)
{
+ marker_result_idx_t i = -1;
+
if (!local)
return;
switch (op_errno) {
case ENODATA:
- local->enodata_count++;
+ i = MCNT_ENODATA;
break;
case ENOENT:
- local->enoent_count++;
+ i = MCNT_ENOENT;
break;
case ENOTCONN:
- local->enotconn_count++;
+ i = MCNT_ENOTCONN;
break;
default:
+ i = MCNT_EOTHER;
+ break;
+ }
+
+ local->count[i]++;
+}
+
+static int
+evaluate_marker_results (int *gauge, int *count)
+{
+ int i = 0;
+ int op_errno = 0;
+ gf_boolean_t sane = _gf_true;
+
+ /* check if the policy of the gauge is violated;
+ * if yes, try to get the best errno, ie. look
+ * for the first position where there is a more
+ * specific kind of vioilation than the generic EINVAL
+ */
+ for (i = 0; i < MCNT_MAX; i++) {
+ if (sane) {
+ if ((gauge[i] > 0 && count[i] < gauge[i]) ||
+ (gauge[i] < 0 && count[i] >= -gauge[i])) {
+ sane = _gf_false;
+ /* generic action: adopt corresponding errno */
+ op_errno = marker_idx_errno_map[i];
+ }
+ } else {
+ /* already insane; trying to get a more informative
+ * errno by checking subsequent counters
+ */
+ if (count[i] > 0)
+ op_errno = marker_idx_errno_map[i];
+ }
+ if (op_errno && op_errno != EINVAL)
break;
}
+
+ return op_errno;
}
/* Aggregate all the <volid>.xtime attrs of the cluster and send the max*/
@@ -98,14 +165,10 @@ cluster_markerxtime_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
callcnt = --local->call_count;
- if (local->esomerr)
- goto unlock;
-
vol_uuid = local->vol_uuid;
if (op_ret) {
marker_local_incr_errcount (local, op_errno);
- local->esomerr = op_errno;
goto unlock;
}
@@ -119,11 +182,11 @@ cluster_markerxtime_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (dict_get_ptr (dict, marker_xattr, (void **)&net_timebuf)) {
gf_log (this->name, GF_LOG_WARNING,
"Unable to get <uuid>.xtime attr");
- local->noxtime_count++;
+ local->count[MCNT_NOTFOUND]++;
goto unlock;
}
- if (local->has_xtime) {
+ if (local->count[MCNT_FOUND]) {
get_hosttime (net_timebuf, host_timebuf);
if ( (host_timebuf[0]>local->host_timebuf[0]) ||
(host_timebuf[0] == local->host_timebuf[0] &&
@@ -135,7 +198,7 @@ cluster_markerxtime_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
} else {
get_hosttime (net_timebuf, local->host_timebuf);
update_timebuf (net_timebuf, local->net_timebuf);
- local->has_xtime = _gf_true;
+ local->count[MCNT_FOUND]++;
}
}
@@ -147,7 +210,7 @@ unlock:
op_errno = 0;
need_unwind = 1;
- if (local->has_xtime) {
+ if (local->count[MCNT_FOUND]) {
if (!dict)
dict = dict_new();
@@ -160,17 +223,9 @@ unlock:
}
}
- if (local->noxtime_count)
- goto out;
-
- if (local->enodata_count || local->enotconn_count ||
- local->enoent_count) {
+ op_errno = evaluate_marker_results (local->gauge, local->count);
+ if (op_errno)
op_ret = -1;
- op_errno = local->enodata_count? ENODATA:
- local->enotconn_count? ENOTCONN:
- local->enoent_count? ENOENT:
- local->esomerr;
- }
}
out:
@@ -178,6 +233,7 @@ out:
frame->local = local->xl_local;
local->xl_specf_unwind (frame, op_ret,
op_errno, dict, xdata);
+ GF_FREE (local);
} else if (need_unwind) {
STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno,
dict, xdata);
@@ -228,7 +284,7 @@ cluster_markeruuid_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (ret)
goto unlock;
- if (marker_has_volinfo (local)) {
+ if (local->count[MCNT_FOUND]) {
if ((local->volmark->major != volmark->major) ||
(local->volmark->minor != volmark->minor)) {
op_ret = -1;
@@ -257,6 +313,7 @@ cluster_markeruuid_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
uuid_unparse (volmark->uuid, vol_uuid);
if (volmark->retval)
local->retval = volmark->retval;
+ local->count[MCNT_FOUND]++;
}
}
unlock:
@@ -267,7 +324,7 @@ unlock:
op_errno = 0;
need_unwind = 1;
- if (marker_has_volinfo (local)) {
+ if (local->count[MCNT_FOUND]) {
if (!dict)
dict = dict_new();
@@ -277,11 +334,10 @@ unlock:
op_ret = -1;
op_errno = ENOMEM;
}
- } else {
- op_ret = -1;
- op_errno = local->enotconn_count? ENOTCONN:
- local->enoent_count? ENOENT:EINVAL;
}
+ op_errno = evaluate_marker_results (local->gauge, local->count);
+ if (op_errno)
+ op_ret = -1;
}
out:
@@ -289,6 +345,7 @@ unlock:
frame->local = local->xl_local;
local->xl_specf_unwind (frame, op_ret,
op_errno, dict, xdata);
+ GF_FREE (local);
return 0;
} else if (need_unwind){
STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno,
@@ -303,7 +360,7 @@ cluster_getmarkerattr (call_frame_t *frame,xlator_t *this, loc_t *loc,
const char *name, void *xl_local,
xlator_specf_unwind_t xl_specf_getxattr_unwind,
xlator_t **sub_volumes, int count, int type,
- char *vol_uuid)
+ int *gauge, char *vol_uuid)
{
int i = 0;
xl_marker_local_t *local = NULL;
@@ -326,6 +383,7 @@ cluster_getmarkerattr (call_frame_t *frame,xlator_t *this, loc_t *loc,
local->call_count = count;
local->xl_specf_unwind = xl_specf_getxattr_unwind;
local->vol_uuid = vol_uuid;
+ memcpy (local->gauge, gauge, sizeof (local->gauge));
frame->local = local;
@@ -357,3 +415,113 @@ err:
return -1;
}
+
+int
+gf_get_min_stime (xlator_t *this, dict_t *dst, char *key, data_t *value)
+{
+ int ret = -1;
+ uint32_t *net_timebuf = NULL;
+ uint32_t *value_timebuf = NULL;
+ uint32_t host_timebuf[2] = {0,};
+ uint32_t host_value_timebuf[2] = {0,};
+
+ /* stime should be minimum of all the other nodes */
+ ret = dict_get_bin (dst, key, (void **)&net_timebuf);
+ if (ret < 0) {
+ net_timebuf = GF_CALLOC (1, sizeof (int64_t),
+ gf_common_mt_char);
+ if (!net_timebuf)
+ goto out;
+
+ ret = dict_set_bin (dst, key, net_timebuf, sizeof (int64_t));
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "key=%s: dict set failed", key);
+ goto error;
+ }
+ }
+
+ value_timebuf = data_to_bin (value);
+ if (!value_timebuf) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "key=%s: getting value of stime failed", key);
+ ret = -1;
+ goto out;
+ }
+
+ get_hosttime (value_timebuf, host_value_timebuf);
+ get_hosttime (net_timebuf, host_timebuf);
+
+ /* can't use 'min()' macro here as we need to compare two fields
+ in the array, selectively */
+ if ((host_value_timebuf[0] < host_timebuf[0]) ||
+ ((host_value_timebuf[0] == host_timebuf[0]) &&
+ (host_value_timebuf[1] < host_timebuf[1]))) {
+ update_timebuf (value_timebuf, net_timebuf);
+ }
+
+ ret = 0;
+out:
+ return ret;
+error:
+ /* To be used only when net_timebuf is not set in the dict */
+ if (net_timebuf)
+ GF_FREE (net_timebuf);
+
+ return ret;
+}
+
+int
+gf_get_max_stime (xlator_t *this, dict_t *dst, char *key, data_t *value)
+{
+ int ret = -1;
+ uint32_t *net_timebuf = NULL;
+ uint32_t *value_timebuf = NULL;
+ uint32_t host_timebuf[2] = {0,};
+ uint32_t host_value_timebuf[2] = {0,};
+
+ /* stime should be maximum of all the other nodes */
+ ret = dict_get_bin (dst, key, (void **)&net_timebuf);
+ if (ret < 0) {
+ net_timebuf = GF_CALLOC (1, sizeof (int64_t),
+ gf_common_mt_char);
+ if (!net_timebuf)
+ goto out;
+
+ ret = dict_set_bin (dst, key, net_timebuf, sizeof (int64_t));
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "key=%s: dict set failed", key);
+ goto error;
+ }
+ }
+
+ value_timebuf = data_to_bin (value);
+ if (!value_timebuf) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "key=%s: getting value of stime failed", key);
+ ret = -1;
+ goto out;
+ }
+
+ get_hosttime (value_timebuf, host_value_timebuf);
+ get_hosttime (net_timebuf, host_timebuf);
+
+ /* can't use 'max()' macro here as we need to compare two fields
+ in the array, selectively */
+ if ((host_value_timebuf[0] > host_timebuf[0]) ||
+ ((host_value_timebuf[0] == host_timebuf[0]) &&
+ (host_value_timebuf[1] > host_timebuf[1]))) {
+ update_timebuf (value_timebuf, net_timebuf);
+ }
+
+ ret = 0;
+out:
+ return ret;
+error:
+ /* To be used only when net_timebuf is not set in the dict */
+ if (net_timebuf)
+ GF_FREE (net_timebuf);
+
+ return ret;
+}
diff --git a/xlators/lib/src/libxlator.h b/xlators/lib/src/libxlator.h
index 7e75c9482..175d3141d 100644
--- a/xlators/lib/src/libxlator.h
+++ b/xlators/lib/src/libxlator.h
@@ -32,6 +32,7 @@
#define MARKER_UUID_TYPE 1
#define MARKER_XTIME_TYPE 2
#define GF_XATTR_QUOTA_SIZE_KEY "trusted.glusterfs.quota.size"
+#define GF_XATTR_QUOTA_LIMIT_LIST "trusted.limit.list"
typedef int32_t (*xlator_specf_unwind_t) (call_frame_t *frame,
@@ -48,6 +49,69 @@ struct volume_mark {
uint32_t usec;
}__attribute__ ((__packed__));
+
+/*
+ * The enumerated type here
+ * is used to index two kind
+ * of integer arrays:
+ * - gauges
+ * - counters
+
+ * A counter is used internally,
+ * in getxattr callbacks, to count
+ * the results, categorized as
+ * the enum names suggest. So values
+ * in the counter are always non-negative.
+
+ * Gauges are part of the API.
+ * The caller passes one to the
+ * top-level aggregator function,
+ * cluster_getmarkerattr(). The gauge
+ * defines an evaluation policy for the
+ * counter. That is, at the
+ * end of the aggregation process
+ * the gauge is matched against the
+ * counter, and the policy
+ * represented by the gauge decides
+ * whether to return with success or failure,
+ * and in latter case, what particular failure
+ * case (errno).
+
+ * The rules are the following: for some index i,
+ * - if gauge[i] == 0, no requirement is set
+ * against counter[i];
+ * - if gauge[i] > 0, counter[i] >= gauge[i]
+ * is required;
+ * - if gauge[i] < 0, counter[i] < |gauge[i]|
+ * is required.
+
+ * If the requirement is not met, then i is mapped
+ * to the respective errno (MCNT_ENOENT -> ENOENT),
+ * or in lack of that, EINVAL.
+
+ * Cf. evaluate_marker_results() and marker_idx_errno_map[]
+ * in libxlator.c
+
+ * We provide two default gauges, one inteded for xtime
+ * aggregation, other for volume mark aggregation. The
+ * policies they represent agree with the hard-coded
+ * one prior to gauges. Cf. marker_xtime_default_gauge
+ * and marker_uuid_default_gauge in libxlator.c
+ */
+
+typedef enum {
+ MCNT_FOUND,
+ MCNT_NOTFOUND,
+ MCNT_ENODATA,
+ MCNT_ENOTCONN,
+ MCNT_ENOENT,
+ MCNT_EOTHER,
+ MCNT_MAX
+} marker_result_idx_t;
+
+extern int marker_xtime_default_gauge[];
+extern int marker_uuid_default_gauge[];
+
struct marker_str {
struct volume_mark *volmark;
data_t *data;
@@ -55,13 +119,8 @@ struct marker_str {
uint32_t host_timebuf[2];
uint32_t net_timebuf[2];
int32_t call_count;
- unsigned has_xtime:1;
- int32_t enoent_count;
- int32_t enotconn_count;
- int32_t enodata_count;
- int32_t noxtime_count;
-
- int esomerr;
+ int gauge[MCNT_MAX];
+ int count[MCNT_MAX];
xlator_specf_unwind_t xl_specf_unwind;
void *xl_local;
@@ -71,15 +130,6 @@ struct marker_str {
typedef struct marker_str xl_marker_local_t;
-static inline gf_boolean_t
-marker_has_volinfo (xl_marker_local_t *marker)
-{
- if (marker->volmark)
- return _gf_true;
- else
- return _gf_false;
-}
-
int32_t
cluster_markerxtime_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, dict_t *dict, dict_t *xdata);
@@ -93,12 +143,15 @@ cluster_getmarkerattr (call_frame_t *frame,xlator_t *this, loc_t *loc,
const char *name, void *xl_local,
xlator_specf_unwind_t xl_specf_getxattr_unwind,
xlator_t **sub_volumes, int count, int type,
- char *vol_uuid);
+ int *gauge, char *vol_uuid);
int
match_uuid_local (const char *name, char *uuid);
+int
+gf_get_min_stime (xlator_t *this, dict_t *dst, char *key, data_t *value);
-
+int
+gf_get_max_stime (xlator_t *this, dict_t *dst, char *key, data_t *value);
#endif /* !_LIBXLATOR_H */
diff --git a/xlators/mgmt/glusterd/src/Makefile.am b/xlators/mgmt/glusterd/src/Makefile.am
index 95d2aba9d..9b33edf4d 100644
--- a/xlators/mgmt/glusterd/src/Makefile.am
+++ b/xlators/mgmt/glusterd/src/Makefile.am
@@ -1,32 +1,38 @@
xlator_LTLIBRARIES = glusterd.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/mgmt
glusterd_la_CPPFLAGS = $(AM_CPPFLAGS) "-DFILTERDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/filter\""
-glusterd_la_LDFLAGS = -module -avoidversion
+glusterd_la_LDFLAGS = -module -avoid-version
+if ENABLE_BD_XLATOR
+glusterd_la_LDFLAGS += -llvm2app
+endif
glusterd_la_SOURCES = glusterd.c glusterd-handler.c glusterd-sm.c \
glusterd-op-sm.c glusterd-utils.c glusterd-rpc-ops.c \
glusterd-store.c glusterd-handshake.c glusterd-pmap.c \
glusterd-volgen.c glusterd-rebalance.c glusterd-quota.c \
glusterd-geo-rep.c glusterd-replace-brick.c glusterd-log-ops.c \
glusterd-volume-ops.c glusterd-brick-ops.c glusterd-mountbroker.c \
- glusterd-syncop.c glusterd-hooks.c
+ glusterd-syncop.c glusterd-hooks.c glusterd-volume-set.c \
+ glusterd-locks.c glusterd-snapshot.c glusterd-mgmt-handler.c \
+ glusterd-mgmt.c glusterd-etcd.c
glusterd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
$(top_builddir)/rpc/xdr/src/libgfxdr.la \
$(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \
- $(LIBXML2_LIBS) -lcrypto
+ $(XML_LIBS) -lcrypto
noinst_HEADERS = glusterd.h glusterd-utils.h glusterd-op-sm.h \
glusterd-sm.h glusterd-store.h glusterd-mem-types.h \
glusterd-pmap.h glusterd-volgen.h glusterd-mountbroker.h \
- glusterd-syncop.h glusterd-hooks.h
+ glusterd-syncop.h glusterd-hooks.h glusterd-locks.h \
+ glusterd-mgmt.h glusterd-etcd.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
-I$(rpclibdir) -I$(CONTRIBDIR)/rbtree \
-I$(top_srcdir)/rpc/xdr/src -I$(top_srcdir)/rpc/rpc-lib/src \
- -I$(CONTRIBDIR)/uuid \
+ -I$(CONTRIBDIR)/uuid -I$(CONTRIBDIR)/mount \
-DSBIN_DIR=\"$(sbindir)\" -DDATADIR=\"$(localstatedir)\" \
-DGSYNCD_PREFIX=\"$(libexecdir)/glusterfs\"\
- -DSYNCDAEMON_COMPILE=$(SYNCDAEMON_COMPILE) $(LIBXML2_CFLAGS)
+ -DSYNCDAEMON_COMPILE=$(SYNCDAEMON_COMPILE) $(XML_CPPFLAGS)
AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
index 5e93c061d..1804dd02e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-brick-ops.c
@@ -335,7 +335,7 @@ out:
/* Handler functions */
int
-glusterd_handle_add_brick (rpcsvc_request_t *req)
+__glusterd_handle_add_brick (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
@@ -358,15 +358,16 @@ glusterd_handle_add_brick (rpcsvc_request_t *req)
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req,
+ (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
snprintf (err_str, sizeof (err_str), "Garbage args received");
goto out;
}
- gf_log ("glusterd", GF_LOG_INFO, "Received add brick req");
+ gf_log (this->name, GF_LOG_INFO, "Received add brick req");
if (cli_req.dict.dict_len) {
/* Unserialize the dictionary */
@@ -376,11 +377,11 @@ glusterd_handle_add_brick (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to "
"unserialize req-buffer to dictionary");
snprintf (err_str, sizeof (err_str), "Unable to decode "
- "the buffer");
+ "the command");
goto out;
}
}
@@ -388,45 +389,50 @@ glusterd_handle_add_brick (rpcsvc_request_t *req)
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
snprintf (err_str, sizeof (err_str), "Unable to get volume "
"name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
if (!(ret = glusterd_check_volume_exists (volname))) {
ret = -1;
- snprintf(err_str, 2048, "Volume %s does not exist", volname);
- gf_log ("glusterd", GF_LOG_ERROR, "%s", err_str);
+ snprintf (err_str, sizeof (err_str), "Volume %s does not exist",
+ volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
ret = dict_get_int32 (dict, "count", &brick_count);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get count");
snprintf (err_str, sizeof (err_str), "Unable to get volume "
"brick count");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
ret = dict_get_int32 (dict, "replica-count", &replica_count);
if (!ret) {
- gf_log (THIS->name, GF_LOG_INFO, "replica-count is %d",
+ gf_log (this->name, GF_LOG_INFO, "replica-count is %d",
replica_count);
}
ret = dict_get_int32 (dict, "stripe-count", &stripe_count);
if (!ret) {
- gf_log (THIS->name, GF_LOG_INFO, "stripe-count is %d",
+ gf_log (this->name, GF_LOG_INFO, "stripe-count is %d",
stripe_count);
}
+ if (!dict_get (dict, "force")) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get flag");
+ goto out;
+ }
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
snprintf (err_str, sizeof (err_str), "Unable to get volinfo "
"for volume name %s", volname);
- gf_log ("glusterd", GF_LOG_ERROR, "%s", err_str);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
@@ -442,10 +448,10 @@ glusterd_handle_add_brick (rpcsvc_request_t *req)
goto brick_val;
if ((brick_count % volinfo->dist_leaf_count) != 0) {
- snprintf(err_str, 2048, "Incorrect number of bricks"
- " supplied %d with count %d",
+ snprintf (err_str, sizeof (err_str), "Incorrect number "
+ "of bricks supplied %d with count %d",
brick_count, volinfo->dist_leaf_count);
- gf_log("glusterd", GF_LOG_ERROR, "%s", err_str);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
ret = -1;
goto out;
}
@@ -461,7 +467,7 @@ glusterd_handle_add_brick (rpcsvc_request_t *req)
err_str,
sizeof (err_str));
if (ret == -1) {
- gf_log ("glusterd", GF_LOG_ERROR, "%s", err_str);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
@@ -471,7 +477,7 @@ glusterd_handle_add_brick (rpcsvc_request_t *req)
ret = dict_set_int32 (dict, "stripe-count", stripe_count);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to set the stripe-count in dict");
goto out;
}
@@ -483,7 +489,7 @@ glusterd_handle_add_brick (rpcsvc_request_t *req)
&type, err_str,
sizeof (err_str));
if (ret == -1) {
- gf_log ("glusterd", GF_LOG_ERROR, "%s", err_str);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
@@ -493,7 +499,7 @@ glusterd_handle_add_brick (rpcsvc_request_t *req)
ret = dict_set_int32 (dict, "replica-count", replica_count);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to set the replica-count in dict");
goto out;
}
@@ -503,14 +509,14 @@ brick_val:
if (ret) {
snprintf (err_str, sizeof (err_str), "Unable to get volume "
"bricks");
- gf_log ("glusterd", GF_LOG_ERROR, "%s", err_str);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
if (type != volinfo->type) {
ret = dict_set_int32 (dict, "type", type);
if (ret)
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to set the new type in dict");
}
@@ -526,8 +532,6 @@ out:
cli_rsp = &rsp;
glusterd_to_cli (req, cli_rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_cli_rsp, dict);
- if (dict)
- dict_unref (dict);
ret = 0; //sent error to cli, prevent second reply
}
@@ -536,9 +540,92 @@ out:
return ret;
}
+int
+glusterd_handle_add_brick (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_add_brick);
+}
+
+static int
+subvol_matcher_init (int **subvols, int count)
+{
+ int ret = -1;
+
+ *subvols = GF_CALLOC (count, sizeof(int), gf_gld_mt_int);
+ if (*subvols)
+ ret = 0;
+
+ return ret;
+}
+
+static void
+subvol_matcher_update (int *subvols, glusterd_volinfo_t *volinfo,
+ glusterd_brickinfo_t *brickinfo)
+{
+ glusterd_brickinfo_t *tmp = NULL;
+ int32_t sub_volume = 0;
+ int pos = 0;
+
+ list_for_each_entry (tmp, &volinfo->bricks, brick_list) {
+
+ if (strcmp (tmp->hostname, brickinfo->hostname) ||
+ strcmp (tmp->path, brickinfo->path)) {
+ pos++;
+ continue;
+ }
+ gf_log (THIS->name, GF_LOG_DEBUG, LOGSTR_FOUND_BRICK,
+ brickinfo->hostname, brickinfo->path,
+ volinfo->volname);
+ sub_volume = (pos / volinfo->dist_leaf_count);
+ subvols[sub_volume]++;
+ break;
+ }
+
+}
+
+static int
+subvol_matcher_verify (int *subvols, glusterd_volinfo_t *volinfo, char *err_str,
+ size_t err_len, char *vol_type, int replica_count)
+{
+ int i = 0;
+ int ret = 0;
+ int count = volinfo->replica_count-replica_count;
+
+ if (replica_count) {
+ for (i = 0; i < volinfo->subvol_count; i++) {
+ if (subvols[i] != count) {
+ ret = -1;
+ snprintf (err_str, err_len, "Remove exactly %d"
+ " brick(s) from each subvolume.", count);
+ break;
+ }
+ }
+ return ret;
+ }
+
+ do {
+
+ if (subvols[i] % volinfo->dist_leaf_count == 0) {
+ continue;
+ } else {
+ ret = -1;
+ snprintf (err_str, err_len,
+ "Bricks not from same subvol for %s", vol_type);
+ break;
+ }
+ } while (++i < volinfo->subvol_count);
+
+ return ret;
+}
+
+static void
+subvol_matcher_destroy (int *subvols)
+{
+ GF_FREE (subvols);
+}
int
-glusterd_handle_remove_brick (rpcsvc_request_t *req)
+__glusterd_handle_remove_brick (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
@@ -550,33 +637,30 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req)
int i = 1;
glusterd_volinfo_t *volinfo = NULL;
glusterd_brickinfo_t *brickinfo = NULL;
- int32_t pos = 0;
- int32_t sub_volume = 0;
- int32_t sub_volume_start = 0;
- int32_t sub_volume_end = 0;
- glusterd_brickinfo_t *tmp = NULL;
+ int *subvols = NULL;
char err_str[2048] = {0};
gf_cli_rsp rsp = {0,};
void *cli_rsp = NULL;
char vol_type[256] = {0,};
int32_t replica_count = 0;
- int32_t brick_index = 0;
- int32_t tmp_brick_idx = 0;
- int found = 0;
- int diff_count = 0;
char *volname = 0;
+ xlator_t *this = NULL;
GF_ASSERT (req);
+ this = THIS;
+ GF_ASSERT (this);
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req,
+ (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
+ snprintf (err_str, sizeof (err_str), "Received garbage args");
goto out;
}
- gf_log ("glusterd", GF_LOG_INFO, "Received rem brick req");
+ gf_log (this->name, GF_LOG_INFO, "Received rem brick req");
if (cli_req.dict.dict_len) {
/* Unserialize the dictionary */
@@ -586,36 +670,42 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to "
"unserialize req-buffer to dictionary");
+ snprintf (err_str, sizeof (err_str), "Unable to decode "
+ "the command");
goto out;
}
}
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Unable to get volname");
+ snprintf (err_str, sizeof (err_str), "Unable to get volume "
+ "name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
ret = dict_get_int32 (dict, "count", &count);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get count");
+ snprintf (err_str, sizeof (err_str), "Unable to get brick "
+ "count");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- snprintf (err_str, 2048, "Volume %s does not exist",
- volname);
- gf_log ("", GF_LOG_ERROR, "%s", err_str);
+ snprintf (err_str, sizeof (err_str),"Volume %s does not exist",
+ volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
ret = dict_get_int32 (dict, "replica-count", &replica_count);
if (!ret) {
- gf_log (THIS->name, GF_LOG_INFO,
+ gf_log (this->name, GF_LOG_INFO,
"request to change replica-count to %d", replica_count);
ret = gd_rmbr_validate_replica_count (volinfo, replica_count,
count, err_str,
@@ -632,7 +722,7 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req)
ret = dict_set_int32 (dict, "replica-count",
replica_count);
if (ret) {
- gf_log (THIS->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_WARNING,
"failed to set the replica_count "
"in dict");
goto out;
@@ -651,12 +741,12 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req)
strcpy (vol_type, "distribute");
}
- /* Do not allow remove-brick if the volume is plain stripe */
+ /* Do not allow remove-brick if the volume is a stripe volume*/
if ((volinfo->type == GF_CLUSTER_TYPE_STRIPE) &&
(volinfo->brick_count == volinfo->stripe_count)) {
- snprintf (err_str, 2048,
- "Removing brick from a plain stripe is not allowed");
- gf_log ("glusterd", GF_LOG_ERROR, "%s", err_str);
+ snprintf (err_str, sizeof (err_str),
+ "Removing brick from a stripe volume is not allowed");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
ret = -1;
goto out;
}
@@ -664,11 +754,11 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req)
if (!replica_count &&
(volinfo->type == GF_CLUSTER_TYPE_STRIPE_REPLICATE) &&
(volinfo->brick_count == volinfo->dist_leaf_count)) {
- snprintf (err_str, 2048,
+ snprintf (err_str, sizeof(err_str),
"Removing bricks from stripe-replicate"
" configuration is not allowed without reducing "
"replica or stripe count explicitly.");
- gf_log (THIS->name, GF_LOG_ERROR, "%s", err_str);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
ret = -1;
goto out;
}
@@ -676,11 +766,11 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req)
if (!replica_count &&
(volinfo->type == GF_CLUSTER_TYPE_REPLICATE) &&
(volinfo->brick_count == volinfo->dist_leaf_count)) {
- snprintf (err_str, 2048,
+ snprintf (err_str, sizeof (err_str),
"Removing bricks from replicate configuration "
"is not allowed without reducing replica count "
"explicitly.");
- gf_log (THIS->name, GF_LOG_ERROR, "%s", err_str);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
ret = -1;
goto out;
}
@@ -690,10 +780,10 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req)
if (!replica_count && (volinfo->type != GF_CLUSTER_TYPE_NONE)) {
if (volinfo->dist_leaf_count &&
(count % volinfo->dist_leaf_count)) {
- snprintf (err_str, 2048, "Remove brick incorrect"
- " brick count of %d for %s %d",
+ snprintf (err_str, sizeof (err_str), "Remove brick "
+ "incorrect brick count of %d for %s %d",
count, vol_type, volinfo->dist_leaf_count);
- gf_log ("", GF_LOG_ERROR, "%s", err_str);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
ret = -1;
goto out;
}
@@ -707,22 +797,32 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req)
}
strcpy (brick_list, " ");
+
+ if ((volinfo->type != GF_CLUSTER_TYPE_NONE) &&
+ (volinfo->subvol_count > 1)) {
+ ret = subvol_matcher_init (&subvols, volinfo->subvol_count);
+ if (ret)
+ goto out;
+ }
+
while ( i <= count) {
- snprintf (key, 256, "brick%d", i);
+ snprintf (key, sizeof (key), "brick%d", i);
ret = dict_get_str (dict, key, &brick);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get %s", key);
+ snprintf (err_str, sizeof (err_str), "Unable to get %s",
+ key);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
- gf_log ("", GF_LOG_DEBUG, "Remove brick count %d brick: %s",
- i, brick);
+ gf_log (this->name, GF_LOG_DEBUG, "Remove brick count %d brick:"
+ " %s", i, brick);
ret = glusterd_volume_brickinfo_get_by_brick(brick, volinfo,
&brickinfo);
if (ret) {
- snprintf(err_str, 2048,"Incorrect brick %s for volume"
- " %s", brick, volname);
- gf_log ("", GF_LOG_ERROR, "%s", err_str);
+ snprintf (err_str, sizeof (err_str), "Incorrect brick "
+ "%s for volume %s", brick, volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
strcat(brick_list, brick);
@@ -733,74 +833,18 @@ glusterd_handle_remove_brick (rpcsvc_request_t *req)
(volinfo->brick_count <= volinfo->dist_leaf_count))
continue;
- if (replica_count) {
- /* do the validation of bricks here */
- /* -2 because i++ is already done, and i starts with 1,
- instead of 0 */
- diff_count = (volinfo->replica_count - replica_count);
- brick_index = (((i -2) / diff_count) * volinfo->replica_count);
- tmp_brick_idx = 0;
- found = 0;
- list_for_each_entry (tmp, &volinfo->bricks, brick_list) {
- tmp_brick_idx++;
- gf_log (THIS->name, GF_LOG_TRACE,
- "validate brick %s:%s (%d %d %d)",
- tmp->hostname, tmp->path, tmp_brick_idx,
- brick_index, volinfo->replica_count);
- if (tmp_brick_idx <= brick_index)
- continue;
- if (tmp_brick_idx >
- (brick_index + volinfo->replica_count))
- break;
- if ((!strcmp (tmp->hostname,brickinfo->hostname)) &&
- !strcmp (tmp->path, brickinfo->path)) {
- found = 1;
- break;
- }
- }
- if (found)
- continue;
+ /* Find which subvolume the brick belongs to */
+ subvol_matcher_update (subvols, volinfo, brickinfo);
+ }
- snprintf(err_str, 2048,"Bricks are from same subvol");
- gf_log (THIS->name, GF_LOG_INFO,
- "failed to validate brick %s:%s (%d %d %d)",
- tmp->hostname, tmp->path, tmp_brick_idx,
- brick_index, volinfo->replica_count);
- ret = -1;
- /* brick order is not valid */
+ /* Check if the bricks belong to the same subvolumes.*/
+ if ((volinfo->type != GF_CLUSTER_TYPE_NONE) &&
+ (volinfo->subvol_count > 1)) {
+ ret = subvol_matcher_verify (subvols, volinfo,
+ err_str, sizeof(err_str),
+ vol_type, replica_count);
+ if (ret)
goto out;
- }
-
- pos = 0;
- list_for_each_entry (tmp, &volinfo->bricks, brick_list) {
-
- if (strcmp (tmp->hostname,brickinfo->hostname) ||
- strcmp (tmp->path, brickinfo->path)) {
- pos++;
- continue;
- }
-
- gf_log ("", GF_LOG_INFO, "Found brick");
- if (!sub_volume && (volinfo->dist_leaf_count > 1)) {
- sub_volume = (pos / volinfo->dist_leaf_count) + 1;
- sub_volume_start = (volinfo->dist_leaf_count *
- (sub_volume - 1));
- sub_volume_end = (volinfo->dist_leaf_count *
- sub_volume) - 1;
- } else {
- if (pos < sub_volume_start ||
- pos >sub_volume_end) {
- ret = -1;
- snprintf(err_str, 2048,"Bricks not from"
- " same subvol for %s",
- vol_type);
- gf_log ("", GF_LOG_ERROR,
- "%s", err_str);
- goto out;
- }
- }
- break;
- }
}
ret = glusterd_op_begin_synctask (req, GD_OP_REMOVE_BRICK, dict);
@@ -810,24 +854,125 @@ out:
rsp.op_ret = -1;
rsp.op_errno = 0;
if (err_str[0] == '\0')
- snprintf (err_str, sizeof (err_str), "Operation failed");
- gf_log ("", GF_LOG_ERROR, "%s", err_str);
+ snprintf (err_str, sizeof (err_str),
+ "Operation failed");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
rsp.op_errstr = err_str;
cli_rsp = &rsp;
glusterd_to_cli (req, cli_rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_cli_rsp, dict);
- if (dict)
- dict_unref (dict);
ret = 0; //sent error to cli, prevent second reply
}
+
GF_FREE (brick_list);
+ subvol_matcher_destroy (subvols);
free (cli_req.dict.dict_val); //its malloced by xdr
return ret;
}
+int
+glusterd_handle_remove_brick (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_remove_brick);
+}
+
+static int
+_glusterd_restart_gsync_session (dict_t *this, char *key,
+ data_t *value, void *data)
+{
+ char *slave = NULL;
+ char *slave_buf = NULL;
+ char *path_list = NULL;
+ char *slave_vol = NULL;
+ char *slave_ip = NULL;
+ char *conf_path = NULL;
+ char **errmsg = NULL;
+ int ret = -1;
+ glusterd_gsync_status_temp_t *param = NULL;
+ gf_boolean_t is_running = _gf_false;
+
+ param = (glusterd_gsync_status_temp_t *)data;
+
+ GF_ASSERT (param);
+ GF_ASSERT (param->volinfo);
+
+ slave = strchr(value->data, ':');
+ if (slave) {
+ slave++;
+ slave_buf = gf_strdup (slave);
+ if (!slave_buf) {
+ gf_log ("", GF_LOG_ERROR,
+ "Failed to gf_strdup");
+ ret = -1;
+ goto out;
+ }
+ }
+ else
+ return 0;
+
+ ret = dict_set_dynstr (param->rsp_dict, "slave", slave_buf);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to store slave");
+ if (slave_buf)
+ GF_FREE(slave_buf);
+ goto out;
+ }
+
+ ret = glusterd_get_slave_details_confpath (param->volinfo,
+ param->rsp_dict,
+ &slave_ip, &slave_vol,
+ &conf_path, errmsg);
+ if (ret) {
+ if (*errmsg)
+ gf_log ("", GF_LOG_ERROR, "%s", *errmsg);
+ else
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave or confpath details.");
+ goto out;
+ }
+
+ /* In cases that gsyncd is not running, we will not invoke it
+ * because of add-brick. */
+ ret = glusterd_check_gsync_running_local (param->volinfo->volname,
+ slave, conf_path,
+ &is_running);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "gsync running validation failed.");
+ goto out;
+ }
+ if (_gf_false == is_running) {
+ gf_log ("", GF_LOG_DEBUG, "gsync session for %s and %s is"
+ " not running on this node. Hence not restarting.",
+ param->volinfo->volname, slave);
+ ret = 0;
+ goto out;
+ }
+
+ ret = glusterd_get_local_brickpaths (param->volinfo, &path_list);
+ if (!path_list) {
+ gf_log ("", GF_LOG_DEBUG, "This node not being part of"
+ " volume should not be running gsyncd. Hence"
+ " no gsyncd process to restart.");
+ ret = 0;
+ goto out;
+ }
+
+ ret = glusterd_check_restart_gsync_session (param->volinfo, slave,
+ param->rsp_dict, path_list,
+ conf_path, 0);
+ if (ret)
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to restart gsync session.");
+
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d.", ret);
+ return ret;
+}
/* op-sm */
@@ -835,17 +980,22 @@ int
glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count,
char *bricks, dict_t *dict)
{
- glusterd_brickinfo_t *brickinfo = NULL;
- char *brick = NULL;
- int32_t i = 1;
- char *brick_list = NULL;
- char *free_ptr1 = NULL;
- char *free_ptr2 = NULL;
- char *saveptr = NULL;
- int32_t ret = -1;
- int32_t stripe_count = 0;
- int32_t replica_count = 0;
- int32_t type = 0;
+ char *brick = NULL;
+ int32_t i = 1;
+ char *brick_list = NULL;
+ char *free_ptr1 = NULL;
+ char *free_ptr2 = NULL;
+ char *saveptr = NULL;
+ int32_t ret = -1;
+ int32_t stripe_count = 0;
+ int32_t replica_count = 0;
+ int32_t type = 0;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_gsync_status_temp_t param = {0, };
+ gf_boolean_t restart_needed = 0;
+ char msg[1024] __attribute__((unused)) = {0, };
+ int caps = 0;
+ int brickid = 0;
GF_ASSERT (volinfo);
@@ -873,11 +1023,17 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count,
"type is set %d, need to change it", type);
}
+ brickid = glusterd_get_next_available_brickid (volinfo);
+ if (brickid < 0)
+ goto out;
while ( i <= count) {
ret = glusterd_brickinfo_new_from_brick (brick, &brickinfo);
if (ret)
goto out;
+ GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO (brickinfo, volinfo,
+ brickid++);
+
ret = glusterd_resolve_brick (brickinfo);
if (ret)
goto out;
@@ -903,13 +1059,15 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count,
if (stripe_count) {
volinfo->stripe_count = stripe_count;
}
- volinfo->dist_leaf_count = (volinfo->stripe_count *
- volinfo->replica_count);
+ volinfo->dist_leaf_count = glusterd_get_dist_leaf_count (volinfo);
/* backward compatibility */
volinfo->sub_count = ((volinfo->dist_leaf_count == 1) ? 0:
volinfo->dist_leaf_count);
+ volinfo->subvol_count = (volinfo->brick_count /
+ volinfo->dist_leaf_count);
+
ret = glusterd_create_volfiles_and_notify_services (volinfo);
if (ret)
goto out;
@@ -924,13 +1082,40 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count,
if (count)
brick = strtok_r (brick_list+1, " \n", &saveptr);
+#ifdef HAVE_BD_XLATOR
+ if (brickinfo->vg[0])
+ caps = CAPS_BD | CAPS_THIN |
+ CAPS_OFFLOAD_COPY | CAPS_OFFLOAD_SNAPSHOT;
+#endif
while (i <= count) {
-
ret = glusterd_volume_brickinfo_get_by_brick (brick, volinfo,
&brickinfo);
if (ret)
goto out;
+#ifdef HAVE_BD_XLATOR
+ /* Check for VG/thin pool if its BD volume */
+ if (brickinfo->vg[0]) {
+ ret = glusterd_is_valid_vg (brickinfo, 0, msg);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_CRITICAL, "%s", msg);
+ goto out;
+ }
+ /* if anyone of the brick does not have thin support,
+ disable it for entire volume */
+ caps &= brickinfo->caps;
+ } else
+ caps = 0;
+#endif
+
+ if (uuid_is_null (brickinfo->uuid)) {
+ ret = glusterd_resolve_brick (brickinfo);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, FMTSTR_RESOLVE_BRICK,
+ brickinfo->hostname, brickinfo->path);
+ goto out;
+ }
+ }
ret = glusterd_brick_start (volinfo, brickinfo,
_gf_true);
@@ -938,8 +1123,27 @@ glusterd_op_perform_add_bricks (glusterd_volinfo_t *volinfo, int32_t count,
goto out;
i++;
brick = strtok_r (NULL, " \n", &saveptr);
+
+ /* Check if the brick is added in this node, and set
+ * the restart_needed flag. */
+ if ((!uuid_compare (brickinfo->uuid, MY_UUID)) &&
+ !restart_needed) {
+ restart_needed = 1;
+ gf_log ("", GF_LOG_DEBUG,
+ "Restart gsyncd session, if it's already "
+ "running.");
+ }
}
+ /* If the restart_needed flag is set, restart gsyncd sessions for that
+ * particular master with all the slaves. */
+ if (restart_needed) {
+ param.rsp_dict = dict;
+ param.volinfo = volinfo;
+ dict_foreach (volinfo->gsync_slaves,
+ _glusterd_restart_gsync_session, &param);
+ }
+ volinfo->caps = caps;
out:
GF_FREE (free_ptr1);
GF_FREE (free_ptr2);
@@ -981,14 +1185,11 @@ glusterd_op_perform_remove_brick (glusterd_volinfo_t *volinfo, char *brick,
}
if (force) {
- if (GLUSTERD_STATUS_STARTED == volinfo->status) {
- ret = glusterd_brick_stop (volinfo, brickinfo,
- _gf_true);
- if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Unable to stop "
- "glusterfs, ret: %d", ret);
- goto out;
- }
+ ret = glusterd_brick_stop (volinfo, brickinfo,
+ _gf_true);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Unable to stop "
+ "glusterfs, ret: %d", ret);
}
goto out;
}
@@ -1006,6 +1207,7 @@ glusterd_op_stage_add_brick (dict_t *dict, char **op_errstr)
int ret = 0;
char *volname = NULL;
int count = 0;
+ int replica_count = 0;
int i = 0;
char *bricks = NULL;
char *brick_list = NULL;
@@ -1014,16 +1216,31 @@ glusterd_op_stage_add_brick (dict_t *dict, char **op_errstr)
char *brick = NULL;
glusterd_brickinfo_t *brickinfo = NULL;
glusterd_volinfo_t *volinfo = NULL;
- glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
char msg[2048] = {0,};
gf_boolean_t brick_alloc = _gf_false;
char *all_bricks = NULL;
char *str_ret = NULL;
+ gf_boolean_t is_force = _gf_false;
- priv = THIS->private;
- if (!priv)
- goto out;
+ this = THIS;
+ GF_ASSERT (this);
+ ret = dict_get_int32 (dict, "replica-count", &replica_count);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Unable to get replica count");
+ }
+
+ if (replica_count > 0) {
+ ret = op_version_check (this, GD_OP_VER_PERSISTENT_AFR_XATTRS,
+ msg, sizeof(msg));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+ *op_errstr = gf_strdup (msg);
+ goto out;
+ }
+ }
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
gf_log (THIS->name, GF_LOG_ERROR,
@@ -1072,6 +1289,8 @@ glusterd_op_stage_add_brick (dict_t *dict, char **op_errstr)
goto out;
}
+ is_force = dict_get_str_boolean (dict, "force", _gf_false);
+
if (bricks) {
brick_list = gf_strdup (bricks);
all_bricks = gf_strdup (bricks);
@@ -1113,10 +1332,21 @@ glusterd_op_stage_add_brick (dict_t *dict, char **op_errstr)
}
if (!uuid_compare (brickinfo->uuid, MY_UUID)) {
- ret = glusterd_brick_create_path (brickinfo->hostname,
- brickinfo->path,
+#ifdef HAVE_BD_XLATOR
+ if (brickinfo->vg[0]) {
+ ret = glusterd_is_valid_vg (brickinfo, 1, msg);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "%s",
+ msg);
+ *op_errstr = gf_strdup (msg);
+ goto out;
+ }
+ }
+#endif
+
+ ret = glusterd_validate_and_create_brickpath (brickinfo,
volinfo->volume_id,
- op_errstr);
+ op_errstr, is_force);
if (ret)
goto out;
}
@@ -1143,25 +1373,42 @@ out:
int
glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr)
{
- int ret = -1;
- char *volname = NULL;
- glusterd_volinfo_t *volinfo = NULL;
- char *errstr = NULL;
- int32_t brick_count = 0;
- char msg[2048] = {0,};
- int32_t flag = 0;
- gf1_op_commands cmd = GF_OP_CMD_NONE;
+ int ret = -1;
+ char *volname = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ char *errstr = NULL;
+ int32_t brick_count = 0;
+ char msg[2048] = {0,};
+ int32_t flag = 0;
+ gf1_op_commands cmd = GF_OP_CMD_NONE;
+ char *task_id_str = NULL;
+ xlator_t *this = NULL;
+ int i = 1;
+ char key[256] = {0,};
+ char *brick = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ ret = op_version_check (this, GD_OP_VER_PERSISTENT_AFR_XATTRS,
+ msg, sizeof(msg));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+ *op_errstr = gf_strdup (msg);
+ goto out;
+ }
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Volume %s does not exist", volname);
+ gf_log (this->name, GF_LOG_ERROR, "Volume %s does not exist", volname);
goto out;
}
@@ -1173,7 +1420,7 @@ glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr)
snprintf (msg, sizeof (msg), "Replace brick is in progress on "
"volume %s. Please retry after replace-brick "
"operation is committed or aborted", volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
*op_errstr = gf_strdup (msg);
ret = -1;
goto out;
@@ -1181,11 +1428,26 @@ glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr)
ret = dict_get_int32 (dict, "command", &flag);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get brick count");
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get brick command");
goto out;
}
cmd = flag;
+ ret = dict_get_int32 (dict, "count", &brick_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get brick count");
+ goto out;
+ }
+
+ ret = 0;
+ if (volinfo->brick_count == brick_count) {
+ errstr = gf_strdup ("Deleting all the bricks of the "
+ "volume is not allowed");
+ ret = -1;
+ goto out;
+ }
+
ret = -1;
switch (cmd) {
case GF_OP_CMD_NONE:
@@ -1198,21 +1460,60 @@ glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr)
case GF_OP_CMD_START:
{
+ if ((volinfo->type == GF_CLUSTER_TYPE_REPLICATE) &&
+ dict_get (dict, "replica-count")) {
+ snprintf (msg, sizeof(msg), "Migration of data is not "
+ "needed when reducing replica count. Use the"
+ " 'force' option");
+ errstr = gf_strdup (msg);
+ gf_log (this->name, GF_LOG_ERROR, "%s", errstr);
+ goto out;
+ }
+
if (GLUSTERD_STATUS_STARTED != volinfo->status) {
- snprintf (msg, sizeof (msg), "Volume %s needs to be started "
- "before remove-brick (you can use 'force' or "
- "'commit' to override this behavior)",
+ snprintf (msg, sizeof (msg), "Volume %s needs to be "
+ "started before remove-brick (you can use "
+ "'force' or 'commit' to override this "
+ "behavior)", volinfo->volname);
+ errstr = gf_strdup (msg);
+ gf_log (this->name, GF_LOG_ERROR, "%s", errstr);
+ goto out;
+ }
+ if (!gd_is_remove_brick_committed (volinfo)) {
+ snprintf (msg, sizeof (msg), "An earlier remove-brick "
+ "task exists for volume %s. Either commit it"
+ " or stop it before starting a new task.",
volinfo->volname);
errstr = gf_strdup (msg);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", errstr);
+ gf_log (this->name, GF_LOG_ERROR, "Earlier remove-brick"
+ " task exists for volume %s.",
+ volinfo->volname);
goto out;
}
if (glusterd_is_defrag_on(volinfo)) {
- errstr = gf_strdup("Rebalance is in progress. Please retry"
- " after completion");
- gf_log ("glusterd", GF_LOG_ERROR, "%s", errstr);
+ errstr = gf_strdup("Rebalance is in progress. Please "
+ "retry after completion");
+ gf_log (this->name, GF_LOG_ERROR, "%s", errstr);
goto out;
}
+
+ if (is_origin_glusterd (dict)) {
+ ret = glusterd_generate_and_set_task_id
+ (dict, GF_REMOVE_BRICK_TID_KEY);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to generate task-id");
+ goto out;
+ }
+ } else {
+ ret = dict_get_str (dict, GF_REMOVE_BRICK_TID_KEY,
+ &task_id_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Missing remove-brick-id");
+ ret = 0;
+ }
+ }
break;
}
@@ -1226,28 +1527,47 @@ glusterd_op_stage_remove_brick (dict_t *dict, char **op_errstr)
"is in progress");
goto out;
}
+
+ /* Do not allow commit if the bricks are not decommissioned */
+ for ( i = 1; i <= brick_count; i++ ) {
+ snprintf (key, sizeof (key), "brick%d", i);
+ ret = dict_get_str (dict, key, &brick);
+ if (ret) {
+ snprintf (msg, sizeof (msg),
+ "Unable to get %s", key);
+ errstr = gf_strdup (msg);
+ goto out;
+ }
+
+ ret =
+ glusterd_volume_brickinfo_get_by_brick(brick, volinfo,
+ &brickinfo);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "Incorrect brick "
+ "%s for volume %s", brick, volname);
+ errstr = gf_strdup (msg);
+ goto out;
+ }
+ if ( !brickinfo->decommissioned ) {
+ snprintf (msg, sizeof (msg), "Brick %s "
+ "is not decommissioned. "
+ "Use start or force option",
+ brick);
+ errstr = gf_strdup (msg);
+ ret = -1;
+ goto out;
+ }
+ }
+
break;
case GF_OP_CMD_COMMIT_FORCE:
break;
}
-
- ret = dict_get_int32 (dict, "count", &brick_count);
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get brick count");
- goto out;
- }
-
ret = 0;
- if (volinfo->brick_count == brick_count) {
- errstr = gf_strdup ("Deleting all the bricks of the "
- "volume is not allowed");
- ret = -1;
- goto out;
- }
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
if (ret && errstr) {
if (op_errstr)
*op_errstr = errstr;
@@ -1383,15 +1703,6 @@ glusterd_op_add_brick (dict_t *dict, char **op_errstr)
goto out;
}
- /* Need to reset the defrag/rebalance status accordingly */
- switch (volinfo->defrag_status) {
- case GF_DEFRAG_STATUS_FAILED:
- case GF_DEFRAG_STATUS_COMPLETE:
- volinfo->defrag_status = 0;
- default:
- break;
- }
-
ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
if (ret)
goto out;
@@ -1406,42 +1717,74 @@ out:
int
glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
{
- int ret = -1;
- char *volname = NULL;
- glusterd_volinfo_t *volinfo = NULL;
- char *brick = NULL;
- int32_t count = 0;
- int32_t i = 1;
- char key[256] = {0,};
- int32_t flag = 0;
- char err_str[4096] = {0,};
- int need_rebalance = 0;
- int force = 0;
- gf1_op_commands cmd = 0;
- int32_t replica_count = 0;
- glusterd_brickinfo_t *brickinfo = NULL;
- glusterd_brickinfo_t *tmp = NULL;
+ int ret = -1;
+ char *volname = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ char *brick = NULL;
+ int32_t count = 0;
+ int32_t i = 1;
+ char key[256] = {0,};
+ int32_t flag = 0;
+ char err_str[4096] = {0,};
+ int need_rebalance = 0;
+ int force = 0;
+ gf1_op_commands cmd = 0;
+ int32_t replica_count = 0;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_brickinfo_t *tmp = NULL;
+ char *task_id_str = NULL;
+ xlator_t *this = NULL;
+ dict_t *bricks_dict = NULL;
+ char *brick_tmpstr = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to allocate memory");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to allocate memory");
goto out;
}
ret = dict_get_int32 (dict, "command", &flag);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get brick count");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get command");
goto out;
}
cmd = flag;
+ /* Set task-id, if available, in ctx dict for operations other than
+ * start
+ */
+ if (is_origin_glusterd (dict) && (cmd != GF_OP_CMD_START)) {
+ if (!uuid_is_null (volinfo->rebal.rebalance_id)) {
+ ret = glusterd_copy_uuid_to_dict
+ (volinfo->rebal.rebalance_id, dict,
+ GF_REMOVE_BRICK_TID_KEY);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set remove-brick-id");
+ goto out;
+ }
+ }
+ }
+
+ /* Clear task-id, rebal.op and stored bricks on commmitting/stopping
+ * remove-brick */
+ if ((cmd != GF_OP_CMD_START) || (cmd != GF_OP_CMD_STATUS)) {
+ uuid_clear (volinfo->rebal.rebalance_id);
+ volinfo->rebal.op = GD_OP_NONE;
+ dict_unref (volinfo->rebal.dict);
+ volinfo->rebal.dict = NULL;
+ }
+
ret = -1;
switch (cmd) {
case GF_OP_CMD_NONE:
@@ -1462,7 +1805,7 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
}
ret = glusterd_create_volfiles_and_notify_services (volinfo);
if (ret) {
- gf_log (THIS->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_WARNING,
"failed to create volfiles");
goto out;
}
@@ -1470,7 +1813,7 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
ret = glusterd_store_volinfo (volinfo,
GLUSTERD_VOLINFO_VER_AC_INCREMENT);
if (ret) {
- gf_log (THIS->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_WARNING,
"failed to store volinfo");
goto out;
}
@@ -1480,6 +1823,20 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
}
case GF_OP_CMD_START:
+ /* Reset defrag status to 'NOT STARTED' whenever a
+ * remove-brick/rebalance command is issued to remove
+ * stale information from previous run.
+ */
+ volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_NOT_STARTED;
+ ret = dict_get_str (dict, GF_REMOVE_BRICK_TID_KEY, &task_id_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Missing remove-brick-id");
+ ret = 0;
+ } else {
+ uuid_parse (task_id_str, volinfo->rebal.rebalance_id) ;
+ volinfo->rebal.op = GD_OP_REMOVE_BRICK;
+ }
force = 0;
break;
@@ -1490,13 +1847,14 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
case GF_OP_CMD_COMMIT_FORCE:
if (volinfo->decommission_in_progress) {
- if (volinfo->defrag) {
- LOCK (&volinfo->defrag->lock);
+ if (volinfo->rebal.defrag) {
+ LOCK (&volinfo->rebal.defrag->lock);
/* Fake 'rebalance-complete' so the graph change
happens right away */
- volinfo->defrag_status = GF_DEFRAG_STATUS_COMPLETE;
+ volinfo->rebal.defrag_status =
+ GF_DEFRAG_STATUS_COMPLETE;
- UNLOCK (&volinfo->defrag->lock);
+ UNLOCK (&volinfo->rebal.defrag->lock);
}
/* Graph change happens in rebalance _cbk function,
no need to do anything here */
@@ -1514,30 +1872,70 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
goto out;
}
-
+ /* Save the list of bricks for later usage only on starting a
+ * remove-brick. Right now this is required for displaying the task
+ * parameters with task status in volume status.
+ */
+ if (GF_OP_CMD_START == cmd) {
+ bricks_dict = dict_new ();
+ if (!bricks_dict) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_int32 (bricks_dict, "count", count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to save remove-brick count");
+ goto out;
+ }
+ }
while ( i <= count) {
snprintf (key, 256, "brick%d", i);
ret = dict_get_str (dict, key, &brick);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get %s", key);
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get %s",
+ key);
goto out;
}
+ if (GF_OP_CMD_START == cmd) {
+ brick_tmpstr = gf_strdup (brick);
+ if (!brick_tmpstr) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to duplicate brick name");
+ goto out;
+ }
+ ret = dict_set_dynstr (bricks_dict, key, brick_tmpstr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add brick to dict");
+ goto out;
+ }
+ brick_tmpstr = NULL;
+ }
+
ret = glusterd_op_perform_remove_brick (volinfo, brick, force,
&need_rebalance);
if (ret)
goto out;
i++;
}
+ if (GF_OP_CMD_START == cmd)
+ volinfo->rebal.dict = dict_ref (bricks_dict);
+
ret = dict_get_int32 (dict, "replica-count", &replica_count);
if (!ret) {
- gf_log (THIS->name, GF_LOG_INFO,
+ gf_log (this->name, GF_LOG_INFO,
"changing replica count %d to %d on volume %s",
volinfo->replica_count, replica_count,
volinfo->volname);
volinfo->replica_count = replica_count;
- volinfo->dist_leaf_count = (volinfo->stripe_count *
- replica_count);
+ volinfo->sub_count = replica_count;
+ volinfo->dist_leaf_count = glusterd_get_dist_leaf_count (volinfo);
+ volinfo->subvol_count = (volinfo->brick_count /
+ volinfo->dist_leaf_count);
+
if (replica_count == 1) {
if (volinfo->type == GF_CLUSTER_TYPE_REPLICATE) {
volinfo->type = GF_CLUSTER_TYPE_NONE;
@@ -1553,34 +1951,46 @@ glusterd_op_remove_brick (dict_t *dict, char **op_errstr)
ret = glusterd_create_volfiles_and_notify_services (volinfo);
if (ret) {
- gf_log (THIS->name, GF_LOG_WARNING, "failed to create volfiles");
+ gf_log (this->name, GF_LOG_WARNING, "failed to create volfiles");
goto out;
}
ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
if (ret) {
- gf_log (THIS->name, GF_LOG_WARNING, "failed to store volinfo");
+ gf_log (this->name, GF_LOG_WARNING, "failed to store volinfo");
goto out;
}
+ if (GF_OP_CMD_START == cmd &&
+ volinfo->status == GLUSTERD_STATUS_STARTED) {
+ ret = glusterd_nodesvcs_handle_reconfigure (volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unable to reconfigure NFS-Server");
+ goto out;
+ }
+ }
+
/* Need to reset the defrag/rebalance status accordingly */
- switch (volinfo->defrag_status) {
+ switch (volinfo->rebal.defrag_status) {
case GF_DEFRAG_STATUS_FAILED:
case GF_DEFRAG_STATUS_COMPLETE:
- volinfo->defrag_status = 0;
+ volinfo->rebal.defrag_status = 0;
default:
break;
}
if (!force && need_rebalance) {
/* perform the rebalance operations */
- ret = glusterd_handle_defrag_start (volinfo, err_str, 4096,
- GF_DEFRAG_CMD_START_FORCE,
- glusterd_remove_brick_migrate_cbk);
+ ret = glusterd_handle_defrag_start
+ (volinfo, err_str, sizeof (err_str),
+ GF_DEFRAG_CMD_START_FORCE,
+ glusterd_remove_brick_migrate_cbk, GD_OP_REMOVE_BRICK);
+
if (!ret)
volinfo->decommission_in_progress = 1;
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to start the rebalance");
}
} else {
@@ -1592,5 +2002,9 @@ out:
if (ret && err_str[0] && op_errstr)
*op_errstr = gf_strdup (err_str);
+ GF_FREE (brick_tmpstr);
+ if (bricks_dict)
+ dict_unref (bricks_dict);
+
return ret;
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-etcd.c b/xlators/mgmt/glusterd/src/glusterd-etcd.c
new file mode 100644
index 000000000..656ea3b9b
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-etcd.c
@@ -0,0 +1,87 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include "glusterfs.h"
+#include "run.h"
+#include "glusterd-etcd.h"
+
+#define GLUSTERD_ETCD_DIR "/var/lib/glusterd/etcd"
+#define GLUSTERD_ETCD_CMD "/root/etcd/bin/etcd"
+
+pid_t
+start_etcd (char *this_host, char *other_host)
+{
+ runner_t runner;
+ char me[256];
+
+ if (gethostname(me,sizeof(me)-1) != 0) {
+ gf_log (__func__, GF_LOG_ERROR, "gethostname failed?!?");
+ return -1;
+ }
+ me[sizeof(me)-1] = '\0';
+
+ if ((mkdir(GLUSTERD_ETCD_DIR,0700) < 0) && (errno != EEXIST)) {
+ gf_log (__func__, GF_LOG_ERROR,
+ "failed to create %s", GLUSTERD_ETCD_DIR);
+ return -1;
+ }
+
+ runinit (&runner);
+ runner_add_args (&runner, GLUSTERD_ETCD_CMD,
+ "-name", this_host,
+ "-data-dir", GLUSTERD_ETCD_DIR,
+ "-bind-addr", NULL);
+ runner_argprintf( &runner, "%s:4001", me);
+ runner_add_arg (&runner, "-peer-addr");
+ runner_argprintf (&runner, "%s:7001", me);
+ if (other_host) {
+ runner_add_arg (&runner, "-peers");
+ runner_argprintf (&runner, "%s:7001", other_host);
+ gf_log (__func__, GF_LOG_INFO, "starting etcd via %s", other_host);
+ } else {
+ gf_log (__func__, GF_LOG_INFO, "starting etcd standalone");
+ }
+
+ /*
+ * Runner_run would wait for it. Runner_run_nowait would not wait,
+ * but would detach it so thoroughly that it won't die when we do.
+ * Also, runner->chpid would be the PID of the transient middle
+ * process, not the one we might actually need to kill later. This
+ * seems to do exactly what we need.
+ */
+ if (runner_start(&runner) != 0) {
+ gf_log (__func__, GF_LOG_ERROR,
+ "failed to start %s", GLUSTERD_ETCD_CMD);
+ return -1;
+ }
+
+ return runner.chpid;
+}
+
+void
+stop_etcd (pid_t pid)
+{
+ if (pid > 0) {
+ gf_log (__func__, GF_LOG_INFO, "killing etcd %d", pid);
+ (void)kill(pid,SIGKILL);
+ (void)waitpid(pid,NULL,0);
+ }
+}
+
+void
+nuke_etcd_dir (void)
+{
+ (void)runcmd("rm","-rf",GLUSTERD_ETCD_DIR,NULL);
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-etcd.h b/xlators/mgmt/glusterd/src/glusterd-etcd.h
new file mode 100644
index 000000000..9459f6bbd
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-etcd.h
@@ -0,0 +1,23 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _GLUSTERD_ETCD_H_
+#define _GLUSTERD_ETCD_H_
+
+#include <sys/types.h>
+#include "glusterfs.h"
+
+pid_t start_etcd (char *this_host, char *other_host);
+
+void stop_etcd (pid_t pid);
+
+void nuke_etcd_dir (void);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c
index 3dcb12f39..9433a128e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c
+++ b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c
@@ -25,16 +25,198 @@
#include <signal.h>
+static int
+dict_get_param (dict_t *dict, char *key, char **param);
+
+struct gsync_config_opt_vals_ gsync_confopt_vals[] = {
+ {.op_name = "change_detector",
+ .no_of_pos_vals = 2,
+ .case_sensitive = _gf_true,
+ .values = {"xsync", "changelog"},
+ },
+ {.op_name = "special_sync_mode",
+ .no_of_pos_vals = 2,
+ .case_sensitive = _gf_true,
+ .values = {"partial", "recover"}
+ },
+ {.op_name = "log-level",
+ .no_of_pos_vals = 5,
+ .case_sensitive = _gf_false,
+ .values = {"critical", "error", "warning", "info", "debug"}
+ },
+ {.op_name = "use-tarssh",
+ .no_of_pos_vals = 6,
+ .case_sensitive = _gf_false,
+ .values = {"true", "false", "0", "1", "yes", "no"}
+ },
+ {.op_name = NULL,
+ },
+};
+
static char *gsync_reserved_opts[] = {
"gluster-command-dir",
"pid-file",
+ "remote-gsyncd"
"state-file",
"session-owner",
+ "state-socket-unencoded",
+ "socketdir",
+ "ignore-deletes",
+ "local-id",
+ "local-path",
+ "slave-id",
+ NULL
+};
+
+static char *gsync_no_restart_opts[] = {
+ "checkpoint",
NULL
};
int
-glusterd_handle_gsync_set (rpcsvc_request_t *req)
+__glusterd_handle_sys_exec (rpcsvc_request_t *req)
+{
+ int32_t ret = 0;
+ dict_t *dict = NULL;
+ gf_cli_req cli_req = {{0},};
+ glusterd_op_t cli_op = GD_OP_SYS_EXEC;
+ glusterd_conf_t *priv = NULL;
+ char *host_uuid = NULL;
+ char err_str[2048] = {0,};
+ xlator_t *this = NULL;
+
+ GF_ASSERT (req);
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = xdr_to_generic (req->msg[0], &cli_req,
+ (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ if (cli_req.dict.dict_len) {
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+
+ ret = dict_unserialize (cli_req.dict.dict_val,
+ cli_req.dict.dict_len,
+ &dict);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
+ "unserialize req-buffer to dictionary");
+ snprintf (err_str, sizeof (err_str), "Unable to decode "
+ "the command");
+ goto out;
+ } else {
+ dict->extra_stdfree = cli_req.dict.dict_val;
+ }
+
+ host_uuid = gf_strdup (uuid_utoa(MY_UUID));
+ if (host_uuid == NULL) {
+ snprintf (err_str, sizeof (err_str), "Failed to get "
+ "the uuid of local glusterd");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_dynstr (dict, "host-uuid", host_uuid);
+ if (ret)
+ goto out;
+ }
+
+ ret = glusterd_op_begin_synctask (req, cli_op, dict);
+
+out:
+ if (ret) {
+ if (err_str[0] == '\0')
+ snprintf (err_str, sizeof (err_str),
+ "Operation failed");
+ ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
+ dict, err_str);
+ }
+ return ret;
+}
+
+int
+__glusterd_handle_copy_file (rpcsvc_request_t *req)
+{
+ int32_t ret = 0;
+ dict_t *dict = NULL;
+ gf_cli_req cli_req = {{0},};
+ glusterd_op_t cli_op = GD_OP_COPY_FILE;
+ glusterd_conf_t *priv = NULL;
+ char *host_uuid = NULL;
+ char err_str[2048] = {0,};
+ xlator_t *this = NULL;
+
+ GF_ASSERT (req);
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = xdr_to_generic (req->msg[0], &cli_req,
+ (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ if (cli_req.dict.dict_len) {
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+
+ ret = dict_unserialize (cli_req.dict.dict_val,
+ cli_req.dict.dict_len,
+ &dict);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
+ "unserialize req-buffer to dictionary");
+ snprintf (err_str, sizeof (err_str), "Unable to decode "
+ "the command");
+ goto out;
+ } else {
+ dict->extra_stdfree = cli_req.dict.dict_val;
+ }
+
+ host_uuid = gf_strdup (uuid_utoa(MY_UUID));
+ if (host_uuid == NULL) {
+ snprintf (err_str, sizeof (err_str), "Failed to get "
+ "the uuid of local glusterd");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_dynstr (dict, "host-uuid", host_uuid);
+ if (ret)
+ goto out;
+ }
+
+ ret = glusterd_op_begin_synctask (req, cli_op, dict);
+
+out:
+ if (ret) {
+ if (err_str[0] == '\0')
+ snprintf (err_str, sizeof (err_str),
+ "Operation failed");
+ ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
+ dict, err_str);
+ }
+ return ret;
+}
+
+int
+__glusterd_handle_gsync_set (rpcsvc_request_t *req)
{
int32_t ret = 0;
dict_t *dict = NULL;
@@ -46,15 +228,19 @@ glusterd_handle_gsync_set (rpcsvc_request_t *req)
int type = 0;
glusterd_conf_t *priv = NULL;
char *host_uuid = NULL;
+ char err_str[2048] = {0,};
+ xlator_t *this = NULL;
GF_ASSERT (req);
- GF_ASSERT (THIS);
- GF_ASSERT (THIS->private);
- priv = THIS->private;
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req,
+ (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
req->rpc_err = GARBAGE_ARGS;
goto out;
}
@@ -68,8 +254,10 @@ glusterd_handle_gsync_set (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR, "failed to "
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
"unserialize req-buffer to dictionary");
+ snprintf (err_str, sizeof (err_str), "Unable to decode "
+ "the command");
goto out;
} else {
dict->extra_stdfree = cli_req.dict.dict_val;
@@ -77,8 +265,8 @@ glusterd_handle_gsync_set (rpcsvc_request_t *req)
host_uuid = gf_strdup (uuid_utoa(MY_UUID));
if (host_uuid == NULL) {
- gf_log ("glusterd", GF_LOG_ERROR, "failed to get"
- "the uuid of the host machine");
+ snprintf (err_str, sizeof (err_str), "Failed to get "
+ "the uuid of local glusterd");
ret = -1;
goto out;
}
@@ -90,26 +278,31 @@ glusterd_handle_gsync_set (rpcsvc_request_t *req)
ret = dict_get_str (dict, "master", &master);
if (ret < 0) {
- gf_log ("", GF_LOG_INFO, "master not found, while handling"
- GEOREP" options");
+ gf_log (this->name, GF_LOG_INFO, "master not found, while "
+ "handling "GEOREP" options");
master = "(No Master)";
}
ret = dict_get_str (dict, "slave", &slave);
if (ret < 0) {
- gf_log ("", GF_LOG_INFO, "slave not not found, while"
+ gf_log (this->name, GF_LOG_INFO, "slave not found, while "
"handling "GEOREP" options");
slave = "(No Slave)";
}
ret = dict_get_int32 (dict, "type", &type);
if (ret < 0) {
- gf_log ("", GF_LOG_WARNING, "command type not found, while"
- "handling "GEOREP" options");
+ snprintf (err_str, sizeof (err_str), "Command type not found "
+ "while handling "GEOREP" options");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
switch (type) {
+ case GF_GSYNC_OPTION_TYPE_CREATE:
+ strncpy (operation, "create", sizeof (operation));
+ cli_op = GD_OP_GSYNC_CREATE;
+ break;
case GF_GSYNC_OPTION_TYPE_START:
strncpy (operation, "start", sizeof (operation));
@@ -126,26 +319,38 @@ glusterd_handle_gsync_set (rpcsvc_request_t *req)
case GF_GSYNC_OPTION_TYPE_STATUS:
strncpy (operation, "status", sizeof (operation));
break;
- case GF_GSYNC_OPTION_TYPE_ROTATE:
- strncpy (operation, "rotate", sizeof(operation));
- break;
}
- ret = glusterd_op_begin (req, GD_OP_GSYNC_SET, dict);
+ ret = glusterd_op_begin_synctask (req, cli_op, dict);
out:
- glusterd_friend_sm ();
- glusterd_op_sm ();
-
if (ret) {
+ if (err_str[0] == '\0')
+ snprintf (err_str, sizeof (err_str),
+ "Operation failed");
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
- dict, "operation failed");
- if (dict)
- dict_unref (dict);
+ dict, err_str);
}
return ret;
}
+int
+glusterd_handle_sys_exec (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_sys_exec);
+}
+
+int
+glusterd_handle_copy_file (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_copy_file);
+}
+
+int
+glusterd_handle_gsync_set (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_gsync_set);
+}
/*****
*
@@ -430,7 +635,7 @@ _fcbk_conftodict (char *resbuf, size_t blen, FILE *fp, void *data)
}
static int
-glusterd_gsync_get_config (char *master, char *slave, char *gl_workdir, dict_t *dict)
+glusterd_gsync_get_config (char *master, char *slave, char *conf_path, dict_t *dict)
{
/* key + value, where value must be able to accommodate a path */
char resbuf[256 + PATH_MAX] = {0,};
@@ -438,7 +643,7 @@ glusterd_gsync_get_config (char *master, char *slave, char *gl_workdir, dict_t *
runinit (&runner);
runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, gl_workdir);
+ runner_argprintf (&runner, "%s", conf_path);
runner_argprintf (&runner, ":%s", master);
runner_add_args (&runner, slave, "--config-get-all", NULL);
@@ -448,13 +653,13 @@ glusterd_gsync_get_config (char *master, char *slave, char *gl_workdir, dict_t *
static int
glusterd_gsync_get_param_file (char *prmfile, const char *param, char *master,
- char *slave, char *gl_workdir)
+ char *slave, char *conf_path)
{
runner_t runner = {0,};
runinit (&runner);
runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, gl_workdir);
+ runner_argprintf (&runner, "%s", conf_path);
runner_argprintf (&runner, ":%s", master);
runner_add_args (&runner, slave, "--config-get", NULL);
runner_argprintf (&runner, "%s-file", param);
@@ -463,83 +668,14 @@ glusterd_gsync_get_param_file (char *prmfile, const char *param, char *master,
}
static int
-glusterd_gsync_get_session_owner (char *master, char *slave, char *session_owner,
- char *gl_workdir)
-{
- runner_t runner = {0,};
-
- runinit(&runner);
- runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, gl_workdir);
- runner_argprintf (&runner, ":%s", master);
- runner_add_args (&runner, slave, "--config-get", "session-owner",
- NULL);
-
- return glusterd_query_extutil (session_owner, &runner);
-}
-
-/* check whether @slave is local or remote. normalized
- * urls starting with ssh are considered to be remote
- * @returns
- * 1 if slave is remote
- * 0 is slave is local
- */
-static int
-glusterd_gsync_slave_is_remote (char *slave)
-{
- int ret = 0;
- char *ssh_pos = NULL;
-
- ssh_pos = strstr(slave, "ssh://");
- if ( ssh_pos && ((ssh_pos - slave) == 0) )
- ret = 1;
-
- return ret;
-}
-
-static int
-glusterd_gsync_get_slave_log_file (char *master, char *slave, char *log_file)
-{
- int ret = -1;
- runner_t runner = {0,};
- char uuid_str[64] = {0,};
- glusterd_conf_t *priv = NULL;
- char *gl_workdir = NULL;
-
- GF_ASSERT(THIS);
- GF_ASSERT(THIS->private);
-
- priv = THIS->private;
-
- GF_VALIDATE_OR_GOTO("gsyncd", master, out);
- GF_VALIDATE_OR_GOTO("gsyncd", slave, out);
-
- gl_workdir = priv->workdir;
-
- /* get the session owner for the master-slave session */
- ret = glusterd_gsync_get_session_owner (master, slave, uuid_str,
- gl_workdir);
- if (ret)
- goto out;
-
- /* get the log file for the slave */
- runinit(&runner);
- runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, gl_workdir);
- runner_argprintf (&runner, "--session-owner=%s", uuid_str);
- runner_add_args (&runner, slave, "--config-get", "log-file", NULL);
-
- ret = glusterd_query_extutil (log_file, &runner);
-
- out:
- return ret;
-}
-
-static int
-gsyncd_getpidfile (char *master, char *slave, char *pidfile)
+gsyncd_getpidfile (char *master, char *slave, char *pidfile, char *conf_path)
{
int ret = -1;
glusterd_conf_t *priv = NULL;
+ char *confpath = NULL;
+ char conf_buf[PATH_MAX] = "";
+ struct stat stbuf = {0,};
+
GF_ASSERT (THIS);
GF_ASSERT (THIS->private);
@@ -549,8 +685,22 @@ gsyncd_getpidfile (char *master, char *slave, char *pidfile)
GF_VALIDATE_OR_GOTO ("gsync", master, out);
GF_VALIDATE_OR_GOTO ("gsync", slave, out);
+ ret = lstat (conf_path, &stbuf);
+ if (!ret) {
+ gf_log ("", GF_LOG_DEBUG, "Using passed config template(%s).",
+ conf_path);
+ confpath = conf_path;
+ } else {
+ ret = snprintf (conf_buf, sizeof(conf_buf) - 1,
+ "%s/"GSYNC_CONF_TEMPLATE, priv->workdir);
+ conf_buf[ret] = '\0';
+ confpath = conf_buf;
+ gf_log ("", GF_LOG_DEBUG, "Using default config template(%s).",
+ confpath);
+ }
+
ret = glusterd_gsync_get_param_file (pidfile, "pid", master,
- slave, priv->workdir);
+ slave, confpath);
if (ret == -1) {
ret = -2;
gf_log ("", GF_LOG_WARNING, "failed to create the pidfile string");
@@ -564,32 +714,6 @@ gsyncd_getpidfile (char *master, char *slave, char *pidfile)
}
static int
-glusterd_gsyncd_getlogfile (char *master, char *slave, char *log_file)
-{
- int ret = -1;
- glusterd_conf_t *priv = NULL;
-
- GF_ASSERT (THIS);
- GF_ASSERT (THIS->private);
-
- priv = THIS->private;
-
- GF_VALIDATE_OR_GOTO ("gsync", master, out);
- GF_VALIDATE_OR_GOTO ("gsync", slave, out);
-
- ret = glusterd_gsync_get_param_file (log_file, "log", master,
- slave, priv->workdir);
- if (ret == -1) {
- ret = -2;
- gf_log ("", GF_LOG_WARNING, "failed to gsyncd logfile");
- goto out;
- }
-
- out:
- return ret;
-}
-
-static int
gsync_status_byfd (int fd)
{
GF_ASSERT (fd >= -1);
@@ -606,12 +730,12 @@ gsync_status_byfd (int fd)
* return -1 when not running
*/
int
-gsync_status (char *master, char *slave, int *status)
+gsync_status (char *master, char *slave, char *conf_path, int *status)
{
char pidfile[PATH_MAX] = {0,};
int fd = -1;
- fd = gsyncd_getpidfile (master, slave, pidfile);
+ fd = gsyncd_getpidfile (master, slave, pidfile, conf_path);
if (fd == -2)
return -1;
@@ -648,16 +772,48 @@ out:
}
static int
-gsync_verify_config_options (dict_t *dict, char **op_errstr)
+glusterd_verify_gsyncd_spawn (char *master, char *slave)
+{
+ int ret = 0;
+ runner_t runner = {0,};
+
+ runinit (&runner);
+ runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd",
+ "--verify", "spawning", NULL);
+ runner_argprintf (&runner, ":%s", master);
+ runner_add_args (&runner, slave, NULL);
+ runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+ ret = runner_start (&runner);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "spawning child failed");
+ ret = -1;
+ goto out;
+ }
+
+ if (runner_end (&runner) != 0)
+ ret = -1;
+
+out:
+ gf_log ("", GF_LOG_DEBUG, "returning %d", ret);
+ return ret;
+}
+
+static int
+gsync_verify_config_options (dict_t *dict, char **op_errstr, char *volname)
{
- char **resopt = NULL;
- int i = 0;
- char *subop = NULL;
- char *slave = NULL;
- char *op_name = NULL;
- char *op_value = NULL;
- char *t = NULL;
- gf_boolean_t banned = _gf_true;
+ char **resopt = NULL;
+ int i = 0;
+ int ret = -1;
+ char *subop = NULL;
+ char *slave = NULL;
+ char *op_name = NULL;
+ char *op_value = NULL;
+ char *t = NULL;
+ char errmsg[PATH_MAX] = "";
+ gf_boolean_t banned = _gf_true;
+ gf_boolean_t op_match = _gf_true;
+ gf_boolean_t val_match = _gf_true;
+ struct gsync_config_opt_vals_ *conf_vals = NULL;
if (dict_get_str (dict, "subop", &subop) != 0) {
gf_log ("", GF_LOG_WARNING, "missing subop");
@@ -681,6 +837,12 @@ gsync_verify_config_options (dict_t *dict, char **op_errstr)
}
if (runcmd (GSYNCD_PREFIX"/gsyncd", "--config-check", op_name, NULL)) {
+ ret = glusterd_verify_gsyncd_spawn (volname, slave);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to spawn gsyncd");
+ return 0;
+ }
+
gf_log ("", GF_LOG_WARNING, "Invalid option %s", op_name);
*op_errstr = gf_strdup ("Invalid option");
@@ -725,34 +887,113 @@ gsync_verify_config_options (dict_t *dict, char **op_errstr)
}
}
+ /* Check options in gsync_confopt_vals for invalid values */
+ for (conf_vals = gsync_confopt_vals; conf_vals->op_name; conf_vals++) {
+ op_match = _gf_true;
+ for (i = 0; conf_vals->op_name[i] && op_name[i]; i++) {
+ if (conf_vals->op_name[i] == op_name[i] ||
+ (conf_vals->op_name[i] == '_' && op_name[i] == '-'))
+ continue;
+ op_match = _gf_false;
+ }
+
+ if (op_match) {
+ if (!op_value)
+ goto out;
+ val_match = _gf_false;
+ for (i = 0; i < conf_vals->no_of_pos_vals; i++) {
+ if(conf_vals->case_sensitive){
+ if (!strcmp (conf_vals->values[i], op_value))
+ val_match = _gf_true;
+ } else {
+ if (!strcasecmp (conf_vals->values[i], op_value))
+ val_match = _gf_true;
+ }
+ }
+
+ if (!val_match) {
+ ret = snprintf (errmsg, sizeof(errmsg) - 1,
+ "Invalid value(%s) for"
+ " option %s", op_value,
+ op_name);
+ errmsg[ret] = '\0';
+
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ *op_errstr = gf_strdup (errmsg);
+ return -1;
+ }
+ }
+ }
+out:
return 0;
}
static int
glusterd_get_gsync_status_mst_slv (glusterd_volinfo_t *volinfo,
- char *slave, dict_t *rsp_dict);
+ char *slave, char *conf_path,
+ dict_t *rsp_dict, char *node);
static int
_get_status_mst_slv (dict_t *this, char *key, data_t *value, void *data)
{
glusterd_gsync_status_temp_t *param = NULL;
char *slave = NULL;
- int ret = 0;
+ char *slave_buf = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
+ char *errmsg = NULL;
+ char conf_path[PATH_MAX] = "";
+ int ret = -1;
+ glusterd_conf_t *priv = NULL;
param = (glusterd_gsync_status_temp_t *)data;
GF_ASSERT (param);
GF_ASSERT (param->volinfo);
+ if (THIS)
+ priv = THIS->private;
+ if (priv == NULL) {
+ gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+ goto out;
+ }
+
slave = strchr(value->data, ':');
- if (slave)
- slave ++;
- else
+ if (!slave)
return 0;
+ slave++;
+
+ ret = glusterd_get_slave_info (slave, &slave_ip, &slave_vol, &errmsg);
+ if (ret) {
+ if (errmsg)
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch "
+ "slave details. Error: %s", errmsg);
+ else
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave details.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = snprintf (conf_path, sizeof(conf_path) - 1,
+ "%s/"GEOREP"/%s_%s_%s/gsyncd.conf",
+ priv->workdir, param->volinfo->volname,
+ slave_ip, slave_vol);
+ conf_path[ret] = '\0';
ret = glusterd_get_gsync_status_mst_slv(param->volinfo,
- slave, param->rsp_dict);
- return 0;
+ slave, conf_path,
+ param->rsp_dict,
+ param->node);
+out:
+
+ GF_FREE (errmsg);
+
+ if (slave_buf)
+ GF_FREE(slave_buf);
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d.", ret);
+ return ret;
}
@@ -773,19 +1014,22 @@ static int
glusterd_remove_slave_in_info (glusterd_volinfo_t *volinfo, char *slave,
char **op_errstr)
{
+ int zero_slave_entries = _gf_true;
int ret = 0;
char *slavekey = NULL;
GF_ASSERT (volinfo);
GF_ASSERT (slave);
- ret = glusterd_get_slave (volinfo, slave, &slavekey);
- if (ret < 0) {
- ret++;
- goto out;
- }
-
- dict_del (volinfo->gsync_slaves, slavekey);
+ do {
+ ret = glusterd_get_slave (volinfo, slave, &slavekey);
+ if (ret < 0 && zero_slave_entries) {
+ ret++;
+ goto out;
+ }
+ zero_slave_entries = _gf_false;
+ dict_del (volinfo->gsync_slaves, slavekey);
+ } while (ret >= 0);
ret = glusterd_store_volinfo (volinfo,
GLUSTERD_VOLINFO_VER_AC_INCREMENT);
@@ -838,8 +1082,9 @@ glusterd_gsync_get_uuid (char *slave, glusterd_volinfo_t *vol,
return ret;
}
-static int
+int
glusterd_check_gsync_running_local (char *master, char *slave,
+ char *conf_path,
gf_boolean_t *is_run)
{
int ret = -1;
@@ -850,7 +1095,7 @@ glusterd_check_gsync_running_local (char *master, char *slave,
GF_ASSERT (is_run);
*is_run = _gf_false;
- ret = gsync_status (master, slave, &ret_status);
+ ret = gsync_status (master, slave, conf_path, &ret_status);
if (ret == 0 && ret_status == 0) {
*is_run = _gf_true;
} else if (ret == -1) {
@@ -867,7 +1112,8 @@ glusterd_check_gsync_running_local (char *master, char *slave,
static int
glusterd_store_slave_in_info (glusterd_volinfo_t *volinfo, char *slave,
- char *host_uuid, char **op_errstr)
+ char *host_uuid, char **op_errstr,
+ gf_boolean_t is_force)
{
int ret = 0;
int maxslv = 0;
@@ -890,7 +1136,8 @@ glusterd_store_slave_in_info (glusterd_volinfo_t *volinfo, char *slave,
case -1:
break;
default:
- GF_ASSERT (ret > 0);
+ if (!is_force)
+ GF_ASSERT (ret > 0);
ret = dict_get_str (volinfo->gsync_slaves, slavekey, &slaveentry);
GF_ASSERT (ret == 0);
@@ -899,13 +1146,23 @@ glusterd_store_slave_in_info (glusterd_volinfo_t *volinfo, char *slave,
* assert an uuid mismatch
*/
t = strtail (slaveentry, host_uuid);
- GF_ASSERT (!t || *t != ':');
+ if (!is_force)
+ GF_ASSERT (!t || *t != ':');
+
+ if (is_force) {
+ gf_log ("", GF_LOG_DEBUG, GEOREP" has already been "
+ "invoked for the %s (master) and %s (slave)."
+ " Allowing without saving info again due to"
+ " force command.", volinfo->volname, slave);
+ ret = 0;
+ goto out;
+ }
gf_log ("", GF_LOG_ERROR, GEOREP" has already been invoked for "
"the %s (master) and %s (slave) "
"from a different machine",
volinfo->volname, slave);
- *op_errstr = gf_strdup (GEOREP" already running in an an"
+ *op_errstr = gf_strdup (GEOREP" already running in "
"another machine");
ret = -1;
goto out;
@@ -938,23 +1195,26 @@ glusterd_store_slave_in_info (glusterd_volinfo_t *volinfo, char *slave,
return ret;
}
-
static int
glusterd_op_verify_gsync_start_options (glusterd_volinfo_t *volinfo,
- char *slave, char **op_errstr)
+ char *slave, char *conf_path,
+ char *statefile, char **op_errstr,
+ gf_boolean_t is_force)
{
int ret = -1;
gf_boolean_t is_running = _gf_false;
char msg[2048] = {0};
uuid_t uuid = {0};
- glusterd_conf_t *priv = NULL;
- xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+ struct stat stbuf = {0,};
this = THIS;
GF_ASSERT (volinfo);
GF_ASSERT (slave);
GF_ASSERT (op_errstr);
+ GF_ASSERT (conf_path);
GF_ASSERT (this && this->private);
priv = this->private;
@@ -964,26 +1224,56 @@ glusterd_op_verify_gsync_start_options (glusterd_volinfo_t *volinfo,
"before "GEOREP" start", volinfo->volname);
goto out;
}
+
+ ret = lstat (statefile, &stbuf);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "Session between %s and %s has"
+ " not been created. Please create session and retry.",
+ volinfo->volname, slave);
+ gf_log ("", GF_LOG_ERROR, "%s", msg);
+ *op_errstr = gf_strdup (msg);
+ goto out;
+ }
+
+ /* Check if the gsync slave info is stored. If not
+ * session has not been created */
+ ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "Session between %s and %s has"
+ " not been created. Please create session and retry.",
+ volinfo->volname, slave);
+ gf_log ("", GF_LOG_ERROR, "%s", msg);
+ goto out;
+ }
+
+ if (is_force) {
+ ret = 0;
+ goto out;
+ }
+
/*Check if the gsync is already started in cmd. inited host
* If so initiate add it into the glusterd's priv*/
- ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
- if ((ret == 0) && (uuid_compare (MY_UUID, uuid) == 0)) {
- ret = glusterd_check_gsync_running_local (volinfo->volname,
- slave, &is_running);
- if (ret) {
- snprintf (msg, sizeof (msg), GEOREP" start option "
- "validation failed ");
- goto out;
- }
- if (_gf_true == is_running) {
- snprintf (msg, sizeof (msg), GEOREP " session between"
- " %s & %s already started", volinfo->volname,
- slave);
- ret = -1;
- goto out;
- }
+ ret = glusterd_check_gsync_running_local (volinfo->volname,
+ slave, conf_path,
+ &is_running);
+ if (ret) {
+ snprintf (msg, sizeof (msg), GEOREP" start option "
+ "validation failed ");
+ goto out;
+ }
+ if (_gf_true == is_running) {
+ snprintf (msg, sizeof (msg), GEOREP " session between"
+ " %s & %s already started", volinfo->volname,
+ slave);
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_verify_gsyncd_spawn (volinfo->volname, slave);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "Unable to spawn gsyncd");
+ gf_log ("", GF_LOG_ERROR, "%s", msg);
}
- ret = 0;
out:
if (ret && (msg[0] != '\0')) {
*op_errstr = gf_strdup (msg);
@@ -1007,13 +1297,168 @@ glusterd_check_gsync_running (glusterd_volinfo_t *volinfo, gf_boolean_t *flag)
return 0;
}
+/*
+ * is_geo_rep_active:
+ * This function reads the state_file and sets is_active to 1 if the
+ * monitor status is neither "Stopped" or "Not Started"
+ *
+ * RETURN VALUE:
+ * 0: On successful read of state_file.
+ * -1: error.
+ */
+
+static int
+is_geo_rep_active (glusterd_volinfo_t *volinfo, char *slave,
+ char *conf_path, int *is_active)
+{
+ dict_t *confd = NULL;
+ char *statefile = NULL;
+ char *master = NULL;
+ char monitor_status[PATH_MAX] = "";
+ int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ master = volinfo->volname;
+
+ confd = dict_new ();
+ if (!confd) {
+ gf_log ("", GF_LOG_ERROR, "Not able to create dict.");
+ goto out;
+ }
+
+ ret = glusterd_gsync_get_config (master, slave, conf_path,
+ confd);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to get configuration data "
+ "for %s(master), %s(slave)", master, slave);
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_param (confd, "state_file", &statefile);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to get state_file's name "
+ "for %s(master), %s(slave). Please check gsync "
+ "config file.", master, slave);
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_gsync_read_frm_status (statefile, monitor_status,
+ sizeof (monitor_status));
+ if (ret <= 0) {
+ gf_log ("", GF_LOG_ERROR, "Unable to read the status "
+ "file for %s(master), %s(slave)", master, slave);
+ strncpy (monitor_status, "defunct", sizeof (monitor_status));
+ }
+
+ if ((!strcmp(monitor_status, "Stopped")) ||
+ (!strcmp(monitor_status, "Not Started"))) {
+ *is_active = 0;
+ } else {
+ *is_active = 1;
+ }
+ ret = 0;
+out:
+ if (confd)
+ dict_destroy (confd);
+ return ret;
+}
+
+/*
+ * _get_slave_status:
+ * Called for each slave in the volume from dict_foreach.
+ * It calls is_geo_rep_active to get the monitor status.
+ *
+ * RETURN VALUE:
+ * 0: On successful read of state_file from is_geo_rep_active.
+ * When it is found geo-rep is already active from previous calls.
+ * When there is no slave.
+ * -1: On error.
+ */
+
+int
+_get_slave_status (dict_t *dict, char *key, data_t *value, void *data)
+{
+ gsync_status_param_t *param = NULL;
+ char *slave = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
+ char *errmsg = NULL;
+ char conf_path[PATH_MAX] = "";
+ int ret = -1;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ param = (gsync_status_param_t *)data;
+
+ GF_ASSERT (param);
+ GF_ASSERT (param->volinfo);
+
+ if (param->is_active) {
+ ret = 0;
+ goto out;
+ }
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ if (this)
+ priv = this->private;
+ if (priv == NULL) {
+ gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+ goto out;
+ }
+
+ slave = strchr(value->data, ':');
+ if (!slave) {
+ ret = 0;
+ goto out;
+ }
+ slave++;
+
+ ret = glusterd_get_slave_info (slave, &slave_ip, &slave_vol, &errmsg);
+ if (ret) {
+ if (errmsg)
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch "
+ "slave details. Error: %s", errmsg);
+ else
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave details.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = snprintf (conf_path, sizeof(conf_path) - 1,
+ "%s/"GEOREP"/%s_%s_%s/gsyncd.conf",
+ priv->workdir, param->volinfo->volname,
+ slave_ip, slave_vol);
+ if (ret < 0) {
+ gf_log ("", GF_LOG_ERROR, "Unable to assign conf_path.");
+ ret = -1;
+ goto out;
+ }
+ conf_path[ret] = '\0';
+
+ ret = is_geo_rep_active (param->volinfo,slave, conf_path,
+ &param->is_active);
+out:
+ GF_FREE(errmsg);
+ return ret;
+}
+
static int
glusterd_op_verify_gsync_running (glusterd_volinfo_t *volinfo,
- char *slave, char **op_errstr)
+ char *slave, char *conf_path,
+ char **op_errstr)
{
- int ret = -1;
- char msg[2048] = {0};
- uuid_t uuid = {0};
+ int pfd = -1;
+ int ret = -1;
+ char msg[2048] = {0};
+ char pidfile[PATH_MAX] = {0,};
GF_ASSERT (THIS && THIS->private);
GF_ASSERT (volinfo);
@@ -1026,13 +1471,26 @@ glusterd_op_verify_gsync_running (glusterd_volinfo_t *volinfo,
goto out;
}
- ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
- if (ret == -1) {
- snprintf (msg, sizeof (msg), GEOREP" session between %s & %s"
- " not active", volinfo->volname, slave);
+
+ pfd = gsyncd_getpidfile (volinfo->volname, slave, pidfile, conf_path);
+ if (pfd == -2) {
+ gf_log ("", GF_LOG_ERROR, GEOREP" stop validation "
+ "failed for %s & %s", volinfo->volname, slave);
+ ret = -1;
+ goto out;
+ }
+ if (gsync_status_byfd (pfd) == -1) {
+ snprintf (msg, sizeof (msg), GEOREP" session b/w %s & %s is not"
+ " running on this node.", volinfo->volname, slave);
+ gf_log ("", GF_LOG_ERROR, "%s", msg);
+ ret = -1;
+ /* monitor gsyncd already dead */
goto out;
}
+ if (pfd < 0)
+ goto out;
+
ret = 0;
out:
if (ret && (msg[0] != '\0')) {
@@ -1051,6 +1509,18 @@ glusterd_verify_gsync_status_opts (dict_t *dict, char **op_errstr)
gf_boolean_t exists = _gf_false;
glusterd_volinfo_t *volinfo = NULL;
int ret = 0;
+ char *conf_path = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ if (THIS)
+ priv = THIS->private;
+ if (priv == NULL) {
+ gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+ *op_errstr = gf_strdup ("glusterd defunct");
+ goto out;
+ }
ret = dict_get_str (dict, "master", &volname);
if (ret < 0) {
@@ -1075,16 +1545,25 @@ glusterd_verify_gsync_status_opts (dict_t *dict, char **op_errstr)
goto out;
}
- out:
+ ret = glusterd_get_slave_details_confpath (volinfo, dict, &slave_ip,
+ &slave_vol, &conf_path,
+ op_errstr);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave or confpath details.");
+ ret = -1;
+ goto out;
+ }
+
+out:
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
-
}
int
glusterd_op_gsync_args_get (dict_t *dict, char **op_errstr,
- char **master, char **slave)
+ char **master, char **slave, char **host_uuid)
{
int ret = -1;
@@ -1109,6 +1588,14 @@ glusterd_op_gsync_args_get (dict_t *dict, char **op_errstr,
}
}
+ if (host_uuid) {
+ ret = dict_get_str (dict, "host-uuid", host_uuid);
+ if (ret < 0) {
+ gf_log ("", GF_LOG_WARNING, "host_uuid not found");
+ *op_errstr = gf_strdup ("host_uuid not found");
+ goto out;
+ }
+ }
ret = 0;
out:
@@ -1117,17 +1604,647 @@ out:
}
int
+glusterd_op_stage_sys_exec (dict_t *dict, char **op_errstr)
+{
+ char errmsg[PATH_MAX] = "";
+ char *command = NULL;
+ char command_path[PATH_MAX] = "";
+ struct stat st = {0,};
+ int ret = -1;
+ glusterd_conf_t *conf = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ if (conf->op_version < 2) {
+ gf_log ("", GF_LOG_ERROR, "Op Version not supported.");
+ snprintf (errmsg, sizeof(errmsg), "One or more nodes do not"
+ " support the required op version.");
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "command", &command);
+ if (ret) {
+ strcpy (errmsg, "internal error");
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to get command from dict");
+ goto out;
+ }
+
+ /* enforce local occurrence of the command */
+ if (strchr (command, '/')) {
+ strcpy (errmsg, "invalid command name");
+ ret = -1;
+ goto out;
+ }
+
+ sprintf (command_path, GSYNCD_PREFIX"/peer_%s", command);
+ /* check if it's executable */
+ ret = access (command_path, X_OK);
+ if (!ret)
+ /* check if it's a regular file */
+ ret = stat (command_path, &st);
+ if (!ret && !S_ISREG (st.st_mode))
+ ret = -1;
+
+out:
+ if (ret) {
+ if (errmsg[0] == '\0')
+ snprintf (errmsg, sizeof (errmsg), "%s not found.",
+ command);
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ }
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_op_stage_copy_file (dict_t *dict, char **op_errstr)
+{
+ char abs_filename[PATH_MAX] = "";
+ char errmsg[PATH_MAX] = "";
+ char *filename = NULL;
+ char *host_uuid = NULL;
+ char uuid_str [64] = {0};
+ int ret = -1;
+ glusterd_conf_t *priv = NULL;
+ struct stat stbuf = {0,};
+
+ if (THIS)
+ priv = THIS->private;
+ if (priv == NULL) {
+ gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+ *op_errstr = gf_strdup ("glusterd defunct");
+ goto out;
+ }
+
+ if (priv->op_version < 2) {
+ gf_log ("", GF_LOG_ERROR, "Op Version not supported.");
+ snprintf (errmsg, sizeof(errmsg), "One or more nodes do not"
+ " support the required op version.");
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "host-uuid", &host_uuid);
+ if (ret < 0) {
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch"
+ " host-uuid from dict.");
+ goto out;
+ }
+
+ uuid_utoa_r (MY_UUID, uuid_str);
+ if (!strcmp (uuid_str, host_uuid)) {
+ ret = dict_get_str (dict, "source", &filename);
+ if (ret < 0) {
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch"
+ " filename from dict.");
+ *op_errstr = gf_strdup ("command unsuccessful");
+ goto out;
+ }
+ snprintf (abs_filename, sizeof(abs_filename),
+ "%s/%s", priv->workdir, filename);
+
+ ret = lstat (abs_filename, &stbuf);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Source file"
+ " does not exist in %s", priv->workdir);
+ *op_errstr = gf_strdup (errmsg);
+ goto out;
+ }
+
+ if (!S_ISREG(stbuf.st_mode)) {
+ snprintf (errmsg, sizeof (errmsg), "Source file"
+ " is not a regular file.");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_get_statefile_name (glusterd_volinfo_t *volinfo, char *slave,
+ char *conf_path, char **statefile)
+{
+ glusterd_conf_t *priv = NULL;
+ int ret = -1;
+ char *master = NULL;
+ char *buf = NULL;
+ dict_t *confd = NULL;
+ char *confpath = NULL;
+ char conf_buf[PATH_MAX] = "";
+ struct stat stbuf = {0,};
+
+ GF_ASSERT (THIS);
+ GF_ASSERT (THIS->private);
+ GF_ASSERT (volinfo);
+
+ master = volinfo->volname;
+
+ confd = dict_new ();
+ if (!confd) {
+ gf_log ("", GF_LOG_ERROR, "Unable to create new dict");
+ goto out;
+ }
+
+ priv = THIS->private;
+
+ ret = lstat (conf_path, &stbuf);
+ if (!ret) {
+ gf_log ("", GF_LOG_INFO, "Using passed config template(%s).",
+ conf_path);
+ confpath = conf_path;
+ } else {
+ ret = snprintf (conf_buf, sizeof(conf_buf) - 1,
+ "%s/"GSYNC_CONF_TEMPLATE, priv->workdir);
+ conf_buf[ret] = '\0';
+ confpath = conf_buf;
+ gf_log ("", GF_LOG_INFO, "Using default config template(%s).",
+ confpath);
+ }
+
+ ret = glusterd_gsync_get_config (master, slave, confpath,
+ confd);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to get configuration data"
+ "for %s(master), %s(slave)", master, slave);
+ goto out;
+
+ }
+
+ ret = dict_get_param (confd, "state_file", &buf);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to get state_file's name.");
+ goto out;
+ }
+
+ *statefile = gf_strdup(buf);
+ if (!*statefile) {
+ gf_log ("", GF_LOG_ERROR, "Unable to gf_strdup.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+ out:
+ if (confd)
+ dict_destroy (confd);
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d ", ret);
+ return ret;
+}
+
+static int
+glusterd_create_status_file (char *master, char *slave, char *slave_ip,
+ char *slave_vol, char *status)
+{
+ int ret = -1;
+ runner_t runner = {0,};
+ glusterd_conf_t *priv = NULL;
+
+ if (THIS)
+ priv = THIS->private;
+ if (priv == NULL) {
+ gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+ goto out;
+ }
+
+ if (!status) {
+ gf_log ("", GF_LOG_ERROR, "Status Empty");
+ goto out;
+ }
+ gf_log ("", GF_LOG_DEBUG, "slave = %s", slave);
+
+ runinit (&runner);
+ runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "--create",
+ status, "-c", NULL);
+ runner_argprintf (&runner, "%s/"GEOREP"/%s_%s_%s/gsyncd.conf",
+ priv->workdir, master, slave_ip, slave_vol);
+ runner_argprintf (&runner, ":%s", master);
+ runner_add_args (&runner, slave, NULL);
+ synclock_unlock (&priv->big_lock);
+ ret = runner_run (&runner);
+ synclock_lock (&priv->big_lock);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Creating status file failed.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ gf_log ("", GF_LOG_DEBUG, "returning %d", ret);
+ return ret;
+}
+
+static int
+glusterd_verify_slave (char *volname, char *slave_ip, char *slave,
+ char **op_errstr, gf_boolean_t *is_force_blocker)
+{
+ int32_t ret = -1;
+ runner_t runner = {0,};
+ char log_file_path[PATH_MAX] = "";
+ char buf[PATH_MAX] = "";
+ char *tmp = NULL;
+ char *save_ptr = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ GF_ASSERT (volname);
+ GF_ASSERT (slave_ip);
+ GF_ASSERT (slave);
+
+ if (THIS)
+ priv = THIS->private;
+ if (priv == NULL) {
+ gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+ goto out;
+ }
+
+ snprintf (log_file_path, sizeof(log_file_path),
+ DEFAULT_LOG_FILE_DIRECTORY"/create_verify_log");
+
+ runinit (&runner);
+ runner_add_args (&runner, GSYNCD_PREFIX"/gverify.sh", NULL);
+ runner_argprintf (&runner, "%s", volname);
+ runner_argprintf (&runner, "%s", slave_ip);
+ runner_argprintf (&runner, "%s", slave);
+ runner_argprintf (&runner, "%s", log_file_path);
+ runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+ synclock_unlock (&priv->big_lock);
+ ret = runner_run (&runner);
+ synclock_lock (&priv->big_lock);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Not a valid slave");
+ ret = glusterd_gsync_read_frm_status (log_file_path,
+ buf, sizeof(buf));
+ if (ret <= 0) {
+ gf_log ("", GF_LOG_ERROR, "Unable to read from %s",
+ log_file_path);
+ goto out;
+ }
+
+ /* Tokenize the error message from gverify.sh to figure out
+ * if the error is a force blocker or not. */
+ tmp = strtok_r (buf, "|", &save_ptr);
+ if (!strcmp (tmp, "FORCE_BLOCKER"))
+ *is_force_blocker = 1;
+ else {
+ /* No FORCE_BLOCKER flag present so all that is
+ * present is the error message. */
+ *is_force_blocker = 0;
+ if (tmp)
+ *op_errstr = gf_strdup (tmp);
+ ret = -1;
+ goto out;
+ }
+
+ /* Copy rest of the error message to op_errstr */
+ tmp = strtok_r (NULL, "|", &save_ptr);
+ if (tmp)
+ *op_errstr = gf_strdup (tmp);
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+out:
+ unlink (log_file_path);
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_mountbroker_check (char **slave_ip, char **op_errstr)
+{
+ int ret = -1;
+ char *tmp = NULL;
+ char *save_ptr = NULL;
+ char *username = NULL;
+ char *host = NULL;
+ char errmsg[PATH_MAX] = "";
+
+ GF_ASSERT (slave_ip);
+ GF_ASSERT (*slave_ip);
+
+ /* Checking if hostname has user specified */
+ host = strstr (*slave_ip, "@");
+ if (!host) {
+ gf_log ("", GF_LOG_DEBUG, "No username provided.");
+ ret = 0;
+ goto out;
+ } else {
+ /* Moving the host past the '@' and checking if the
+ * actual hostname also has '@' */
+ host++;
+ if (strstr (host, "@")) {
+ gf_log ("", GF_LOG_DEBUG, "host = %s", host);
+ ret = snprintf (errmsg, sizeof(errmsg) - 1,
+ "Invalid Hostname (%s).", host);
+ errmsg[ret] = '\0';
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ if (op_errstr)
+ *op_errstr = gf_strdup (errmsg);
+ goto out;
+ }
+
+ /* Fetching the username and hostname
+ * and checking if the username is non-root */
+ username = strtok_r (*slave_ip, "@", &save_ptr);
+ tmp = strtok_r (NULL, "@", &save_ptr);
+ if (strcmp (username, "root")) {
+ ret = snprintf (errmsg, sizeof(errmsg) - 1,
+ "Non-root username (%s@%s) not allowed.",
+ username, tmp);
+ errmsg[ret] = '\0';
+ if (op_errstr)
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR,
+ "Non-Root username not allowed.");
+ ret = -1;
+ goto out;
+ }
+
+ *slave_ip = gf_strdup (tmp);
+ if (!*slave_ip) {
+ gf_log ("", GF_LOG_ERROR, "Out of memory");
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_op_stage_gsync_create (dict_t *dict, char **op_errstr)
+{
+ char *down_peerstr = NULL;
+ char *slave = NULL;
+ char *volname = NULL;
+ char *host_uuid = NULL;
+ char *statefile = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
+ char *conf_path = NULL;
+ char errmsg[PATH_MAX] = "";
+ char common_pem_file[PATH_MAX] = "";
+ char hook_script[PATH_MAX] = "";
+ char uuid_str [64] = "";
+ int ret = -1;
+ int is_pem_push = -1;
+ gf_boolean_t is_force = -1;
+ gf_boolean_t is_force_blocker = -1;
+ gf_boolean_t exists = _gf_false;
+ glusterd_conf_t *conf = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ struct stat stbuf = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ ret = glusterd_op_gsync_args_get (dict, op_errstr, &volname,
+ &slave, &host_uuid);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch arguments");
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return -1;
+ }
+
+ if (conf->op_version < 2) {
+ gf_log ("", GF_LOG_ERROR, "Op Version not supported.");
+ snprintf (errmsg, sizeof(errmsg), "One or more nodes do not"
+ " support the required op version.");
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ goto out;
+ }
+
+ exists = glusterd_check_volume_exists (volname);
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if ((ret) || (!exists)) {
+ gf_log ("", GF_LOG_WARNING, "volume name does not exist");
+ snprintf (errmsg, sizeof(errmsg), "Volume name %s does not"
+ " exist", volname);
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return -1;
+ }
+
+ ret = glusterd_get_slave_details_confpath (volinfo, dict, &slave_ip,
+ &slave_vol, &conf_path,
+ op_errstr);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave or confpath details.");
+ ret = -1;
+ goto out;
+ }
+
+ is_force = dict_get_str_boolean (dict, "force", _gf_false);
+
+ uuid_utoa_r (MY_UUID, uuid_str);
+ if (!strcmp (uuid_str, host_uuid)) {
+ ret = glusterd_are_vol_all_peers_up (volinfo,
+ &conf->peers,
+ &down_peerstr);
+ if ((ret == _gf_false) && !is_force) {
+ snprintf (errmsg, sizeof (errmsg), "Peer %s,"
+ " which is a part of %s volume, is"
+ " down. Please bring up the peer and"
+ " retry.", down_peerstr,
+ volinfo->volname);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ *op_errstr = gf_strdup (errmsg);
+ GF_FREE (down_peerstr);
+ down_peerstr = NULL;
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return -1;
+ } else if (ret == _gf_false) {
+ gf_log ("", GF_LOG_INFO, "Peer %s,"
+ " which is a part of %s volume, is"
+ " down. Force creating geo-rep session."
+ " On bringing up the peer, re-run"
+ " \"gluster system:: execute"
+ " gsec_create\" and \"gluster volume"
+ " geo-replication %s %s create push-pem"
+ " force\"", down_peerstr, volinfo->volname,
+ volinfo->volname, slave);
+ }
+
+ /* Checking if slave host is pingable, has proper passwordless
+ * ssh login setup, slave volume is created, slave vol is empty,
+ * and if it has enough memory and bypass in case of force if
+ * the error is not a force blocker */
+ ret = glusterd_verify_slave (volname, slave_ip, slave_vol,
+ op_errstr, &is_force_blocker);
+ if (ret) {
+ if (is_force && !is_force_blocker) {
+ gf_log ("", GF_LOG_INFO, "%s is not a valid slave"
+ " volume. Error: %s. Force creating geo-rep"
+ " session.", slave, *op_errstr);
+ } else {
+ gf_log ("", GF_LOG_ERROR,
+ "%s is not a valid slave volume. Error: %s",
+ slave, *op_errstr);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = dict_get_int32 (dict, "push_pem", &is_pem_push);
+ if (!ret && is_pem_push) {
+ ret = snprintf (common_pem_file,
+ sizeof(common_pem_file) - 1,
+ "%s"GLUSTERD_COMMON_PEM_PUB_FILE,
+ conf->workdir);
+ common_pem_file[ret] = '\0';
+
+ ret = snprintf (hook_script, sizeof(hook_script) - 1,
+ "%s"GLUSTERD_CREATE_HOOK_SCRIPT,
+ conf->workdir);
+ hook_script[ret] = '\0';
+
+ ret = lstat (common_pem_file, &stbuf);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "%s"
+ " required for push-pem is"
+ " not present. Please run"
+ " \"gluster system:: execute"
+ " gsec_create\"", common_pem_file);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ goto out;
+ }
+
+ ret = lstat (hook_script, &stbuf);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg),
+ "The hook-script (%s) required "
+ "for push-pem is not present. "
+ "Please install the hook-script "
+ "and retry", hook_script);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ goto out;
+ }
+
+ if (!S_ISREG(stbuf.st_mode)) {
+ snprintf (errmsg, sizeof (errmsg), "%s"
+ " required for push-pem is"
+ " not a regular file. Please run"
+ " \"gluster system:: execute"
+ " gsec_create\"", common_pem_file);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+
+ ret = glusterd_get_statefile_name (volinfo, slave, conf_path, &statefile);
+ if (ret) {
+ if (!strstr(slave, "::"))
+ snprintf (errmsg, sizeof (errmsg),
+ "%s is not a valid slave url.", slave);
+ else
+ snprintf (errmsg, sizeof (errmsg), "Please check gsync "
+ "config file. Unable to get statefile's name");
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "statefile", statefile);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to store statefile path");
+ goto out;
+ }
+
+ ret = lstat (statefile, &stbuf);
+ if (!ret && !is_force) {
+ snprintf (errmsg, sizeof (errmsg), "Session between %s"
+ " and %s is already created.",
+ volinfo->volname, slave);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ } else if (!ret)
+ gf_log ("", GF_LOG_INFO, "Session between %s"
+ " and %s is already created. Force"
+ " creating again.", volinfo->volname, slave);
+
+ ret = glusterd_verify_gsyncd_spawn (volinfo->volname, slave);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to spawn gsyncd.");
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
+
+ ret = 0;
+out:
+
+ if (ret && errmsg[0] != '\0')
+ *op_errstr = gf_strdup (errmsg);
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr)
{
int ret = 0;
int type = 0;
char *volname = NULL;
char *slave = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
+ char *down_peerstr = NULL;
+ char *statefile = NULL;
+ char *path_list = NULL;
+ char *conf_path = NULL;
gf_boolean_t exists = _gf_false;
glusterd_volinfo_t *volinfo = NULL;
char errmsg[PATH_MAX] = {0,};
dict_t *ctx = NULL;
+ gf_boolean_t is_force = 0;
+ gf_boolean_t is_force_blocker = -1;
+ gf_boolean_t is_running = _gf_false;
+ uuid_t uuid = {0};
+ char uuid_str [64] = {0};
+ char *host_uuid = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ struct stat stbuf = {0,};
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
ret = dict_get_int32 (dict, "type", &type);
if (ret < 0) {
@@ -1136,25 +2253,26 @@ glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr)
goto out;
}
- switch (type) {
- case GF_GSYNC_OPTION_TYPE_STATUS:
+ if (type == GF_GSYNC_OPTION_TYPE_STATUS) {
ret = glusterd_verify_gsync_status_opts (dict, op_errstr);
-
goto out;
- case GF_GSYNC_OPTION_TYPE_CONFIG:
- ret = gsync_verify_config_options (dict, op_errstr);
+ }
+ ret = glusterd_op_gsync_args_get (dict, op_errstr,
+ &volname, &slave, &host_uuid);
+ if (ret)
goto out;
- case GF_GSYNC_OPTION_TYPE_ROTATE:
- /* checks same as status mode */
- ret = glusterd_verify_gsync_status_opts(dict, op_errstr);
- goto out;
- }
+ uuid_utoa_r (MY_UUID, uuid_str);
- ret = glusterd_op_gsync_args_get (dict, op_errstr, &volname, &slave);
- if (ret)
+ if (conf->op_version < 2) {
+ gf_log ("", GF_LOG_ERROR, "Op Version not supported.");
+ snprintf (errmsg, sizeof(errmsg), "One or more nodes do not"
+ " support the required op version.");
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
goto out;
+ }
exists = glusterd_check_volume_exists (volname);
ret = glusterd_volinfo_find (volname, &volinfo);
@@ -1167,19 +2285,115 @@ glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr)
goto out;
}
+ ret = glusterd_get_slave_details_confpath (volinfo, dict, &slave_ip,
+ &slave_vol, &conf_path,
+ op_errstr);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave or confpath details.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_get_statefile_name (volinfo, slave, conf_path, &statefile);
+ if (ret) {
+ /* Checking if slave host is pingable, has proper passwordless
+ * ssh login setup */
+ ret = glusterd_verify_slave (volname, slave_ip, slave_vol,
+ op_errstr, &is_force_blocker);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "%s is not a valid slave volume. Error: %s",
+ slave, *op_errstr);
+ goto out;
+ }
+
+ if (!strstr(slave, "::"))
+ snprintf (errmsg, sizeof (errmsg),
+ "%s is not a valid slave url.", slave);
+ else
+ snprintf (errmsg, sizeof (errmsg),
+ "Unable to get statefile's name");
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "statefile", statefile);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to store statefile path");
+ goto out;
+ }
+
+ is_force = dict_get_str_boolean (dict, "force", _gf_false);
+
+ /* Allowing stop force to bypass the statefile check
+ * as this command acts as a fail safe method to stop geo-rep
+ * session. */
+ if ((type == GF_GSYNC_OPTION_TYPE_CONFIG) ||
+ ((type == GF_GSYNC_OPTION_TYPE_STOP) && !is_force) ||
+ (type == GF_GSYNC_OPTION_TYPE_DELETE)) {
+ ret = lstat (statefile, &stbuf);
+ if (ret) {
+ snprintf (errmsg, sizeof(errmsg), "Geo-replication"
+ " session between %s and %s does not exist.",
+ volinfo->volname, slave);
+ gf_log ("", GF_LOG_ERROR, "%s. statefile = %s",
+ errmsg, statefile);
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ /* Check if all peers that are a part of the volume are up or not */
+ if ((type == GF_GSYNC_OPTION_TYPE_DELETE) ||
+ ((type == GF_GSYNC_OPTION_TYPE_STOP) && !is_force)) {
+ if (!strcmp (uuid_str, host_uuid)) {
+ ret = glusterd_are_vol_all_peers_up (volinfo,
+ &conf->peers,
+ &down_peerstr);
+ if (ret == _gf_false) {
+ snprintf (errmsg, sizeof (errmsg), "Peer %s,"
+ " which is a part of %s volume, is"
+ " down. Please bring up the peer and"
+ " retry.", down_peerstr,
+ volinfo->volname);
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ GF_FREE (down_peerstr);
+ down_peerstr = NULL;
+ goto out;
+ }
+ }
+ }
+
switch (type) {
case GF_GSYNC_OPTION_TYPE_START:
+ /* don't attempt to start gsync if replace-brick is
+ * in progress */
+ if (glusterd_is_rb_ongoing (volinfo)) {
+ snprintf (errmsg, sizeof(errmsg), "replace-brick is in"
+ " progress, not starting geo-replication");
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ goto out;
+ }
+
ret = glusterd_op_verify_gsync_start_options (volinfo, slave,
- op_errstr);
+ conf_path, statefile,
+ op_errstr, is_force);
if (ret)
goto out;
ctx = glusterd_op_get_ctx();
if (ctx) {
- /*gsyncd does a fuse mount to start the geo-rep session*/
+ /* gsyncd does a fuse mount to start
+ * the geo-rep session */
if (!glusterd_is_fuse_available ()) {
- gf_log ("glusterd", GF_LOG_ERROR, "Unable to open"
- " /dev/fuse (%s), geo-replication start"
- " failed", strerror (errno));
+ gf_log ("glusterd", GF_LOG_ERROR, "Unable to "
+ "open /dev/fuse (%s), geo-replication "
+ "start failed", strerror (errno));
snprintf (errmsg, sizeof(errmsg),
"fuse unvailable");
*op_errstr = gf_strdup (errmsg);
@@ -1190,17 +2404,72 @@ glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr)
break;
case GF_GSYNC_OPTION_TYPE_STOP:
- ret = glusterd_op_verify_gsync_running (volinfo, slave,
- op_errstr);
+ if (!is_force) {
+ ret = glusterd_op_verify_gsync_running (volinfo, slave,
+ conf_path,
+ op_errstr);
+ if (ret) {
+ ret = glusterd_get_local_brickpaths (volinfo,
+ &path_list);
+ if (path_list)
+ ret = -1;
+ }
+ }
+ break;
+
+ case GF_GSYNC_OPTION_TYPE_CONFIG:
+ ret = gsync_verify_config_options (dict, op_errstr, volname);
+ goto out;
+ break;
+
+ case GF_GSYNC_OPTION_TYPE_DELETE:
+ /* Check if the gsync session is still running
+ * If so ask the user to stop geo-replication first.*/
+ ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
+ if (ret) {
+ snprintf (errmsg, sizeof(errmsg), "Geo-replication"
+ " session between %s and %s does not exist.",
+ volinfo->volname, slave);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ goto out;
+ } else {
+ ret = glusterd_check_gsync_running_local (volinfo->volname,
+ slave, conf_path,
+ &is_running);
+ if (_gf_true == is_running) {
+ snprintf (errmsg, sizeof (errmsg), GEOREP
+ " session between %s & %s is "
+ "still active. Please stop the "
+ "session and retry.",
+ volinfo->volname, slave);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ *op_errstr = gf_strdup (errmsg);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = glusterd_verify_gsyncd_spawn (volinfo->volname, slave);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg),
+ "Unable to spawn gsyncd");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ }
+
break;
}
out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
static int
-stop_gsync (char *master, char *slave, char **msg)
+stop_gsync (char *master, char *slave, char **msg,
+ char *conf_path, gf_boolean_t is_force)
{
int32_t ret = 0;
int pfd = -1;
@@ -1212,19 +2481,16 @@ stop_gsync (char *master, char *slave, char **msg)
GF_ASSERT (THIS);
GF_ASSERT (THIS->private);
- pfd = gsyncd_getpidfile (master, slave, pidfile);
- if (pfd == -2) {
+ pfd = gsyncd_getpidfile (master, slave, pidfile, conf_path);
+ if (pfd == -2 && !is_force) {
gf_log ("", GF_LOG_ERROR, GEOREP" stop validation "
" failed for %s & %s", master, slave);
ret = -1;
goto out;
}
- if (gsync_status_byfd (pfd) == -1) {
+ if (gsync_status_byfd (pfd) == -1 && !is_force) {
gf_log ("", GF_LOG_ERROR, "gsyncd b/w %s & %s is not"
" running", master, slave);
- if (msg)
- *msg = gf_strdup ("Warning: "GEOREP" session was "
- "defunct at stop time");
/* monitor gsyncd already dead */
goto out;
}
@@ -1259,16 +2525,100 @@ stop_gsync (char *master, char *slave, char **msg)
out:
sys_close (pfd);
+
+ if (is_force)
+ ret = 0;
return ret;
}
-static int
-glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave,
- dict_t *resp_dict);
+/*
+ * glusterd_gsync_op_already_set:
+ * This funcion checks whether the op_value is same as in the
+ * gsyncd.conf file.
+ *
+ * RETURN VALUE:
+ * 0 : op_value matches the conf file.
+ * 1 : op_value does not matches the conf file or op_param not
+ * found in conf file.
+ * -1 : error
+ */
+
+int
+glusterd_gsync_op_already_set (char* master, char* slave, char* conf_path,
+ char* op_name, char* op_value)
+{
+ dict_t *confd = NULL;
+ char *op_val_buf = NULL;
+ int32_t op_val_conf = 0;
+ int32_t op_val_cli = 0;
+ int32_t ret = -1;
+ gf_boolean_t is_bool = _gf_true;
+
+ confd = dict_new ();
+ if (!confd) {
+ gf_log ("", GF_LOG_ERROR, "Not able to create dict.");
+ return -1;
+ }
+
+ ret = glusterd_gsync_get_config (master, slave, conf_path,
+ confd);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to get configuration data"
+ "for %s(master), %s(slave)", master, slave);
+ goto out;
+ }
+
+ ret = dict_get_param (confd, op_name, &op_val_buf);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to get op_value "
+ "for %s(master), %s(slave). Please check gsync "
+ "config file.", master, slave);
+ ret = 1;
+ goto out;
+ }
+
+ gf_log("",GF_LOG_DEBUG, "val_cli:%s val_conf:%s",op_value,op_val_buf);
+
+ if (!strcmp(op_val_buf,"true") || !strcmp(op_val_buf,"1")
+ || !strcmp(op_val_buf,"yes")) {
+ op_val_conf = 1;
+ } else if(!strcmp(op_val_buf,"false") || !strcmp(op_val_buf,"0")
+ || !strcmp(op_val_buf,"no")) {
+ op_val_conf = 0;
+ } else {
+ is_bool = _gf_false;
+ }
+
+ if (is_bool) {
+ if (!strcmp(op_value,"true") || !strcmp(op_value,"1")
+ || !strcmp(op_value,"yes")) {
+ op_val_cli = 1;
+ } else {
+ op_val_cli = 0;
+ }
+
+ if ( op_val_cli == op_val_conf ) {
+ ret = 0;
+ goto out;
+ }
+ } else {
+ if (!strcmp(op_val_buf,op_value)) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ ret = 1;
+
+out:
+ dict_unref(confd);
+ return ret;
+}
static int
glusterd_gsync_configure (glusterd_volinfo_t *volinfo, char *slave,
- dict_t *dict, dict_t *resp_dict, char **op_errstr)
+ char *path_list, dict_t *dict,
+ dict_t *resp_dict, char **op_errstr)
{
int32_t ret = -1;
char *op_name = NULL;
@@ -1277,6 +2627,13 @@ glusterd_gsync_configure (glusterd_volinfo_t *volinfo, char *slave,
glusterd_conf_t *priv = NULL;
char *subop = NULL;
char *master = NULL;
+ char *conf_path = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
+ struct stat stbuf = {0, };
+ gf_boolean_t restart_required = _gf_true;
+ char **resopt = NULL;
+ gf_boolean_t op_already_set = _gf_false;
GF_ASSERT (slave);
GF_ASSERT (op_errstr);
@@ -1311,10 +2668,17 @@ glusterd_gsync_configure (glusterd_volinfo_t *volinfo, char *slave,
goto out;
}
+ ret = dict_get_str (dict, "conf_path", &conf_path);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch conf file path.");
+ goto out;
+ }
+
master = "";
runinit (&runner);
runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, priv->workdir);
+ runner_argprintf (&runner, "%s", conf_path);
if (volinfo) {
master = volinfo->volname;
runner_argprintf (&runner, ":%s", master);
@@ -1324,7 +2688,27 @@ glusterd_gsync_configure (glusterd_volinfo_t *volinfo, char *slave,
runner_add_arg (&runner, op_name);
if (op_value)
runner_add_arg (&runner, op_value);
+
+ if ( strcmp(op_name,"checkpoint") != 0 ) {
+ ret = glusterd_gsync_op_already_set(master,slave,conf_path,
+ op_name,op_value);
+ if (ret == -1) {
+ gf_log ("", GF_LOG_WARNING,
+ "glusterd_gsync_op_already_set failed.");
+ gf_asprintf (op_errstr, GEOREP" config-%s failed for "
+ "%s %s", subop, master, slave);
+ goto out;
+ }
+ if (ret == 0) {
+ gf_log("", GF_LOG_DEBUG, "op_value is already set");
+ op_already_set = _gf_true;
+ goto out;
+ }
+ }
+
+ synclock_unlock (&priv->big_lock);
ret = runner_run (&runner);
+ synclock_lock (&priv->big_lock);
if (ret) {
gf_log ("", GF_LOG_WARNING, "gsyncd failed to "
"%s %s option for %s %s peers",
@@ -1335,22 +2719,65 @@ glusterd_gsync_configure (glusterd_volinfo_t *volinfo, char *slave,
goto out;
}
+
+ if ((!strcmp (op_name, "state_file")) && (op_value)) {
+
+ ret = lstat (op_value, &stbuf);
+ if (ret) {
+ ret = dict_get_str (dict, "slave_ip", &slave_ip);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave IP.");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "slave_vol", &slave_vol);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave volume name.");
+ goto out;
+ }
+
+ ret = glusterd_create_status_file (volinfo->volname, slave,
+ slave_ip, slave_vol,
+ "Switching Status File");
+ if (ret || lstat (op_value, &stbuf)) {
+ gf_log ("", GF_LOG_ERROR, "Unable to create %s"
+ ". Error : %s", op_value,
+ strerror (errno));
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+
ret = 0;
gf_asprintf (op_errstr, "config-%s successful", subop);
out:
- if (!ret && volinfo) {
+ if (!ret && volinfo && !op_already_set) {
+ for (resopt = gsync_no_restart_opts; *resopt; resopt++) {
+ restart_required = _gf_true;
+ if (!strcmp ((*resopt), op_name)){
+ restart_required = _gf_false;
+ break;
+ }
+ }
+
+ if (restart_required) {
ret = glusterd_check_restart_gsync_session (volinfo, slave,
- resp_dict);
+ resp_dict, path_list,
+ conf_path, 0);
if (ret)
- *op_errstr = gf_strdup ("internal error");
+ *op_errstr = gf_strdup ("internal error");
+ }
}
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-static int
+int
glusterd_gsync_read_frm_status (char *path, char *buf, size_t blen)
{
int ret = 0;
@@ -1374,7 +2801,6 @@ glusterd_gsync_read_frm_status (char *path, char *buf, size_t blen)
char *p = buf + len - 1;
while (isspace (*p))
*p-- = '\0';
- ret = 0;
}
} else if (ret < 0)
gf_log ("", GF_LOG_ERROR, "Status file of gsyncd is corrupt");
@@ -1384,20 +2810,146 @@ glusterd_gsync_read_frm_status (char *path, char *buf, size_t blen)
}
static int
-glusterd_gsync_fetch_status_extra (char *path, char *buf, size_t blen)
+dict_get_param (dict_t *dict, char *key, char **param)
+{
+ char *dk = NULL;
+ char *s = NULL;
+ char x = '\0';
+ int ret = 0;
+
+ if (dict_get_str (dict, key, param) == 0)
+ return 0;
+
+ dk = gf_strdup (key);
+ if (!key)
+ return -1;
+
+ s = strpbrk (dk, "-_");
+ if (!s)
+ return -1;
+ x = (*s == '-') ? '_' : '-';
+ *s++ = x;
+ while ((s = strpbrk (s, "-_")))
+ *s++ = x;
+
+ ret = dict_get_str (dict, dk, param);
+
+ GF_FREE (dk);
+ return ret;
+}
+
+static int
+glusterd_parse_gsync_status (char *buf, gf_gsync_status_t *sts_val)
+{
+ int ret = -1;
+ int i = -1;
+ int num_of_fields = 8;
+ char *token = NULL;
+ char **tokens = NULL;
+ char **ptr = NULL;
+ char *save_ptr = NULL;
+ char na_buf[] = "N/A";
+
+ if (!buf) {
+ gf_log ("", GF_LOG_ERROR, "Empty buf");
+ goto out;
+ }
+
+ tokens = calloc (num_of_fields, sizeof (char *));
+ if (!tokens) {
+ gf_log ("", GF_LOG_ERROR, "Out of memory");
+ goto out;
+ }
+
+ ptr = tokens;
+
+ for (token = strtok_r (buf, ",", &save_ptr); token;
+ token = strtok_r (NULL, ",", &save_ptr)) {
+ *ptr = gf_strdup(token);
+ if (!*ptr) {
+ gf_log ("", GF_LOG_ERROR, "Out of memory");
+ goto out;
+ }
+ ptr++;
+ }
+
+ for (i = 0; i < num_of_fields; i++) {
+ token = strtok_r (tokens[i], ":", &save_ptr);
+ token = strtok_r (NULL, "\0", &save_ptr);
+ token++;
+
+ /* token NULL check */
+ if (!token && (i != 0) &&
+ (i != 5) && (i != 7))
+ token = na_buf;
+
+ if (i == 0) {
+ if (!token)
+ token = na_buf;
+ else {
+ token++;
+ if (!token)
+ token = na_buf;
+ else
+ token[strlen(token) - 1] = '\0';
+ }
+ memcpy (sts_val->slave_node, token, strlen(token));
+ }
+ if (i == 1)
+ memcpy (sts_val->files_syncd, token, strlen(token));
+ if (i == 2)
+ memcpy (sts_val->purges_remaining, token, strlen(token));
+ if (i == 3)
+ memcpy (sts_val->total_files_skipped, token, strlen(token));
+ if (i == 4)
+ memcpy (sts_val->files_remaining, token, strlen(token));
+ if (i == 5) {
+ if (!token)
+ token = na_buf;
+ else {
+ token++;
+ if (!token)
+ token = na_buf;
+ else
+ token[strlen(token) - 1] = '\0';
+ }
+ memcpy (sts_val->worker_status, token, strlen(token));
+ }
+ if (i == 6)
+ memcpy (sts_val->bytes_remaining, token, strlen(token));
+ if (i == 7) {
+ if (!token)
+ token = na_buf;
+ else {
+ token++;
+ if (!token)
+ token = na_buf;
+ else
+ token[strlen(token) - 2] = '\0';
+ }
+ memcpy (sts_val->crawl_status, token, strlen(token));
+ }
+ }
+
+ ret = 0;
+out:
+ for (i = 0; i< num_of_fields; i++)
+ if (tokens[i])
+ GF_FREE(tokens[i]);
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+static int
+glusterd_gsync_fetch_status_extra (char *path, gf_gsync_status_t *sts_val)
{
char sockpath[PATH_MAX] = {0,};
struct sockaddr_un sa = {0,};
- size_t l = 0;
int s = -1;
struct pollfd pfd = {0,};
int ret = 0;
- l = strlen (buf);
- /* seek to end of data in buf */
- buf += l;
- blen -= l;
-
glusterd_set_socket_filepath (path, sockpath, sizeof (sockpath));
strncpy(sa.sun_path, sockpath, sizeof(sa.sun_path));
@@ -1408,6 +2960,11 @@ glusterd_gsync_fetch_status_extra (char *path, char *buf, size_t blen)
s = socket(AF_UNIX, SOCK_STREAM, 0);
if (s == -1)
return -1;
+ ret = fcntl (s, F_GETFL);
+ if (ret != -1)
+ ret = fcntl (s, F_SETFL, ret | O_NONBLOCK);
+ if (ret == -1)
+ goto out;
ret = connect (s, (struct sockaddr *)&sa, sizeof (sa));
if (ret == -1)
@@ -1420,72 +2977,56 @@ glusterd_gsync_fetch_status_extra (char *path, char *buf, size_t blen)
ret = -1;
goto out;
}
- ret = read(s, buf, blen);
+ ret = read(s, sts_val->checkpoint_status,
+ sizeof(sts_val->checkpoint_status));
/* we expect a terminating 0 byte */
- if (ret == 0 || (ret > 0 && buf[ret - 1]))
+ if (ret == 0 || (ret > 0 && sts_val->checkpoint_status[ret - 1]))
ret = -1;
- if (ret > 0)
+ if (ret > 0) {
ret = 0;
+ }
- out:
+out:
close (s);
return ret;
}
-static int
-dict_get_param (dict_t *dict, char *key, char **param)
-{
- char *dk = NULL;
- char *s = NULL;
- char x = '\0';
- int ret = 0;
-
- if (dict_get_str (dict, key, param) == 0)
- return 0;
-
- dk = gf_strdup (key);
- if (!key)
- return -1;
-
- s = strpbrk (dk, "-_");
- if (!s)
- return -1;
- x = (*s == '-') ? '_' : '-';
- *s++ = x;
- while ((s = strpbrk (s, "-_")))
- *s++ = x;
-
- ret = dict_get_str (dict, dk, param);
-
- GF_FREE (dk);
- return ret;
-}
-
-static int
-glusterd_read_status_file (char *master, char *slave,
- dict_t *dict)
+int
+glusterd_read_status_file (glusterd_volinfo_t *volinfo, char *slave,
+ char *conf_path, dict_t *dict, char *node)
{
- glusterd_conf_t *priv = NULL;
- int ret = 0;
- char *statefile = NULL;
- char buf[1024] = {0, };
- char mst[1024] = {0, };
- char slv[1024] = {0, };
- char sts[1024] = {0, };
- char *bufp = NULL;
- dict_t *confd = NULL;
- int gsync_count = 0;
- int status = 0;
+ char brick_state_file[PATH_MAX] = "";
+ char brick_path[PATH_MAX] = "";
+ char *georep_session_wrkng_dir = NULL;
+ char *master = NULL;
+ char tmp[1024] = "";
+ char sts_val_name[1024] = "";
+ char monitor_status[NAME_MAX] = "";
+ char *statefile = NULL;
+ char *socketfile = NULL;
+ dict_t *confd = NULL;
+ int gsync_count = 0;
+ int i = 0;
+ int ret = 0;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ gf_gsync_status_t *sts_val = NULL;
+ glusterd_conf_t *priv = NULL;
GF_ASSERT (THIS);
GF_ASSERT (THIS->private);
+ GF_ASSERT (volinfo);
+
+ master = volinfo->volname;
confd = dict_new ();
- if (!dict)
+ if (!dict) {
+ gf_log ("", GF_LOG_ERROR, "Not able to create dict.");
return -1;
+ }
priv = THIS->private;
- ret = glusterd_gsync_get_config (master, slave, priv->workdir,
+
+ ret = glusterd_gsync_get_config (master, slave, conf_path,
confd);
if (ret) {
gf_log ("", GF_LOG_ERROR, "Unable to get configuration data"
@@ -1494,106 +3035,188 @@ glusterd_read_status_file (char *master, char *slave,
}
- ret = gsync_status (master, slave, &status);
- if (ret == 0 && status == -1) {
- strncpy (buf, "defunct", sizeof (buf));
- goto done;
- } else if (ret == -1)
- goto out;
-
ret = dict_get_param (confd, "state_file", &statefile);
- if (ret)
- goto out;
- ret = glusterd_gsync_read_frm_status (statefile, buf, sizeof (buf));
if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to get state_file's name "
+ "for %s(master), %s(slave). Please check gsync "
+ "config file.", master, slave);
+ goto out;
+ }
+
+ ret = glusterd_gsync_read_frm_status (statefile, monitor_status,
+ sizeof (monitor_status));
+ if (ret <= 0) {
gf_log ("", GF_LOG_ERROR, "Unable to read the status"
"file for %s(master), %s(slave)", master, slave);
- strncpy (buf, "defunct", sizeof (buf));
- goto done;
+ strncpy (monitor_status, "defunct", sizeof (monitor_status));
}
- if (strcmp (buf, "OK") != 0)
- goto done;
- ret = dict_get_param (confd, "state_socket_unencoded", &statefile);
- if (ret)
+ ret = dict_get_param (confd, "georep_session_working_dir",
+ &georep_session_wrkng_dir);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to get geo-rep session's "
+ "working directory name for %s(master), %s(slave). "
+ "Please check gsync config file.", master, slave);
goto out;
- ret = glusterd_gsync_fetch_status_extra (statefile, buf, sizeof (buf));
+ }
+
+ ret = dict_get_param (confd, "state_socket_unencoded", &socketfile);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to fetch extra status"
- "for %s(master), %s(slave)", master, slave);
- /* there is a slight chance that this occurs due to race
- * -- in that case, the following options all seem bad:
- *
- * - suppress irregurlar behavior by just leaving status
- * on "OK"
- * - freak out users with a misleading "defunct"
- * - overload the meaning of the regular error signal
- * mechanism of gsyncd, that is, when status is "faulty"
- *
- * -- so we just come up with something new...
- */
- strncpy (buf, "N/A", sizeof (buf));
- goto done;
+ gf_log ("", GF_LOG_ERROR, "Unable to get socket file's name "
+ "for %s(master), %s(slave). Please check gsync "
+ "config file.", master, slave);
+ goto out;
}
- done:
ret = dict_get_int32 (dict, "gsync-count", &gsync_count);
-
if (ret)
- gsync_count = 1;
- else
- gsync_count++;
+ gsync_count = 0;
- snprintf (mst, sizeof (mst), "master%d", gsync_count);
- master = gf_strdup (master);
- if (!master)
- goto out;
- ret = dict_set_dynstr (dict, mst, master);
- if (ret) {
- GF_FREE (master);
- goto out;
- }
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ if (uuid_compare (brickinfo->uuid, MY_UUID))
+ continue;
- snprintf (slv, sizeof (slv), "slave%d", gsync_count);
- slave = gf_strdup (slave);
- if (!slave)
- goto out;
- ret = dict_set_dynstr (dict, slv, slave);
- if (ret) {
- GF_FREE (slave);
- goto out;
- }
+ sts_val = GF_CALLOC (1, sizeof(gf_gsync_status_t),
+ gf_common_mt_gsync_status_t);
+ if (!sts_val) {
+ gf_log ("", GF_LOG_ERROR, "Out Of Memory");
+ goto out;
+ }
- snprintf (sts, sizeof (slv), "status%d", gsync_count);
- bufp = gf_strdup (buf);
- if (!bufp)
- goto out;
- ret = dict_set_dynstr (dict, sts, bufp);
- if (ret) {
- GF_FREE (bufp);
- goto out;
+ /* Creating the brick state file's path */
+ memset(brick_state_file, '\0', PATH_MAX);
+ memcpy (brick_path, brickinfo->path, PATH_MAX - 1);
+ for (i = 0; i < strlen(brick_path) - 1; i++)
+ if (brick_path[i] == '/')
+ brick_path[i] = '_';
+ ret = snprintf(brick_state_file, PATH_MAX - 1, "%s%s.status",
+ georep_session_wrkng_dir, brick_path);
+ brick_state_file[ret] = '\0';
+
+ gf_log ("", GF_LOG_DEBUG, "brick_state_file = %s", brick_state_file);
+
+ memset (tmp, '\0', sizeof(tmp));
+
+ ret = glusterd_gsync_read_frm_status (brick_state_file,
+ tmp, sizeof (tmp));
+ if (ret <= 0) {
+ gf_log ("", GF_LOG_ERROR, "Unable to read the status"
+ "file for %s brick for %s(master), %s(slave) "
+ "session", brickinfo->path, master, slave);
+ memcpy (sts_val->slave_node, slave, strlen(slave));
+ sts_val->slave_node[strlen(slave)] = '\0';
+ ret = snprintf (sts_val->worker_status, sizeof(sts_val->worker_status), "N/A");
+ sts_val->worker_status[ret] = '\0';
+ ret = snprintf (sts_val->checkpoint_status, sizeof(sts_val->checkpoint_status), "N/A");
+ sts_val->checkpoint_status[ret] = '\0';
+ ret = snprintf (sts_val->crawl_status, sizeof(sts_val->crawl_status), "N/A");
+ sts_val->crawl_status[ret] = '\0';
+ ret = snprintf (sts_val->files_syncd, sizeof(sts_val->files_syncd), "N/A");
+ sts_val->files_syncd[ret] = '\0';
+ ret = snprintf (sts_val->purges_remaining, sizeof(sts_val->purges_remaining), "N/A");
+ sts_val->purges_remaining[ret] = '\0';
+ ret = snprintf (sts_val->total_files_skipped, sizeof(sts_val->total_files_skipped), "N/A");
+ sts_val->total_files_skipped[ret] = '\0';
+ ret = snprintf (sts_val->files_remaining, sizeof(sts_val->files_remaining), "N/A");
+ sts_val->files_remaining[ret] = '\0';
+ ret = snprintf (sts_val->bytes_remaining, sizeof(sts_val->bytes_remaining), "N/A");
+ sts_val->bytes_remaining[ret] = '\0';
+ goto store_status;
+ }
+
+ ret = glusterd_gsync_fetch_status_extra (socketfile, sts_val);
+ if (ret || strlen(sts_val->checkpoint_status) == 0) {
+ gf_log ("", GF_LOG_DEBUG, "No checkpoint status"
+ "for %s(master), %s(slave)", master, slave);
+ ret = snprintf (sts_val->checkpoint_status, sizeof(sts_val->checkpoint_status), "N/A");
+ sts_val->checkpoint_status[ret] = '\0';
+ }
+
+ ret = glusterd_parse_gsync_status (tmp, sts_val);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to parse the gsync status for %s",
+ brickinfo->path);
+ memcpy (sts_val->slave_node, slave, strlen(slave));
+ sts_val->slave_node[strlen(slave)] = '\0';
+ ret = snprintf (sts_val->worker_status, sizeof(sts_val->worker_status), "N/A");
+ sts_val->worker_status[ret] = '\0';
+ ret = snprintf (sts_val->checkpoint_status, sizeof(sts_val->checkpoint_status), "N/A");
+ sts_val->checkpoint_status[ret] = '\0';
+ ret = snprintf (sts_val->crawl_status, sizeof(sts_val->crawl_status), "N/A");
+ sts_val->crawl_status[ret] = '\0';
+ ret = snprintf (sts_val->files_syncd, sizeof(sts_val->files_syncd), "N/A");
+ sts_val->files_syncd[ret] = '\0';
+ ret = snprintf (sts_val->purges_remaining, sizeof(sts_val->purges_remaining), "N/A");
+ sts_val->purges_remaining[ret] = '\0';
+ ret = snprintf (sts_val->total_files_skipped, sizeof(sts_val->total_files_skipped), "N/A");
+ sts_val->total_files_skipped[ret] = '\0';
+ ret = snprintf (sts_val->files_remaining, sizeof(sts_val->files_remaining), "N/A");
+ sts_val->files_remaining[ret] = '\0';
+ ret = snprintf (sts_val->bytes_remaining, sizeof(sts_val->bytes_remaining), "N/A");
+ sts_val->bytes_remaining[ret] = '\0';
+ }
+
+store_status:
+ if ((strcmp (monitor_status, "Stable"))) {
+ memcpy (sts_val->worker_status, monitor_status, strlen(monitor_status));
+ sts_val->worker_status[strlen(monitor_status)] = '\0';
+ ret = snprintf (sts_val->crawl_status, sizeof(sts_val->crawl_status), "N/A");
+ sts_val->crawl_status[ret] = '\0';
+ ret = snprintf (sts_val->checkpoint_status, sizeof(sts_val->checkpoint_status), "N/A");
+ sts_val->checkpoint_status[ret] = '\0';
+ }
+
+ if (strcmp (sts_val->worker_status, "Active")) {
+ ret = snprintf (sts_val->checkpoint_status, sizeof(sts_val->checkpoint_status), "N/A");
+ sts_val->checkpoint_status[ret] = '\0';
+ ret = snprintf (sts_val->crawl_status, sizeof(sts_val->crawl_status), "N/A");
+ sts_val->crawl_status[ret] = '\0';
+ }
+
+ if (!strcmp (sts_val->slave_node, "N/A")) {
+ memcpy (sts_val->slave_node, slave, strlen(slave));
+ sts_val->slave_node[strlen(slave)] = '\0';
+ }
+
+ memcpy (sts_val->node, node, strlen(node));
+ sts_val->node[strlen(node)] = '\0';
+ memcpy (sts_val->brick, brickinfo->path, strlen(brickinfo->path));
+ sts_val->brick[strlen(brickinfo->path)] = '\0';
+ memcpy (sts_val->master, master, strlen(master));
+ sts_val->master[strlen(master)] = '\0';
+
+ snprintf (sts_val_name, sizeof (sts_val_name), "status_value%d", gsync_count);
+ ret = dict_set_bin (dict, sts_val_name, sts_val, sizeof(gf_gsync_status_t));
+ if (ret) {
+ GF_FREE (sts_val);
+ goto out;
+ }
+
+ gsync_count++;
+ sts_val = NULL;
}
+
ret = dict_set_int32 (dict, "gsync-count", gsync_count);
if (ret)
goto out;
- ret = 0;
- out:
+out:
dict_destroy (confd);
- gf_log ("", GF_LOG_DEBUG, "Returning %d ", ret);
- return ret;
+ return 0;
}
-static int
+int
glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave,
- dict_t *resp_dict)
+ dict_t *resp_dict, char *path_list,
+ char *conf_path, gf_boolean_t is_force)
{
int ret = 0;
- uuid_t uuid = {0, };
glusterd_conf_t *priv = NULL;
char *status_msg = NULL;
+ gf_boolean_t is_running = _gf_false;
GF_ASSERT (volinfo);
GF_ASSERT (slave);
@@ -1602,18 +3225,22 @@ glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave,
priv = THIS->private;
- if (glusterd_gsync_get_uuid (slave, volinfo, uuid))
- /* session does not exist, nothing to do */
+ ret = glusterd_check_gsync_running_local (volinfo->volname,
+ slave, conf_path,
+ &is_running);
+ if (!ret && (_gf_true != is_running))
+ /* gsynd not running, nothing to do */
goto out;
- if (uuid_compare (MY_UUID, uuid) == 0) {
- ret = stop_gsync (volinfo->volname, slave, &status_msg);
- if (ret == 0 && status_msg)
- ret = dict_set_str (resp_dict, "gsync-status",
- status_msg);
- if (ret == 0)
- ret = glusterd_start_gsync (volinfo, slave,
- uuid_utoa(MY_UUID), NULL);
- }
+
+ ret = stop_gsync (volinfo->volname, slave, &status_msg,
+ conf_path, is_force);
+ if (ret == 0 && status_msg)
+ ret = dict_set_str (resp_dict, "gsync-status",
+ status_msg);
+ if (ret == 0)
+ ret = glusterd_start_gsync (volinfo, slave, path_list,
+ conf_path, uuid_utoa(MY_UUID),
+ NULL);
out:
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
@@ -1621,7 +3248,7 @@ glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave,
}
static int32_t
-glusterd_marker_create_volfile (glusterd_volinfo_t *volinfo)
+glusterd_marker_changelog_create_volfile (glusterd_volinfo_t *volinfo)
{
int32_t ret = 0;
@@ -1645,57 +3272,82 @@ out:
}
static int
-glusterd_set_marker_gsync (glusterd_volinfo_t *volinfo)
+glusterd_set_gsync_knob (glusterd_volinfo_t *volinfo, char *key, int *vc)
{
- int ret = -1;
- int marker_set = _gf_false;
- char *gsync_status = NULL;
+ int ret = -1;
+ int conf_enabled = _gf_false;
+ char *knob_on = NULL;
GF_ASSERT (THIS);
GF_ASSERT (THIS->private);
- marker_set = glusterd_volinfo_get_boolean (volinfo, VKEY_MARKER_XTIME);
- if (marker_set == -1) {
- gf_log ("", GF_LOG_ERROR, "failed to get the marker status");
- ret = -1;
+ conf_enabled = glusterd_volinfo_get_boolean (volinfo, key);
+ if (conf_enabled == -1) {
+ gf_log ("", GF_LOG_ERROR,
+ "failed to get key %s from volinfo", key);
goto out;
}
- if (marker_set == _gf_false) {
- gsync_status = gf_strdup ("on");
- if (gsync_status == NULL) {
+ ret = 0;
+ if (conf_enabled == _gf_false) {
+ *vc = 1;
+ knob_on = gf_strdup ("on");
+ if (knob_on == NULL) {
ret = -1;
goto out;
}
ret = glusterd_gsync_volinfo_dict_set (volinfo,
- VKEY_MARKER_XTIME, gsync_status);
- if (ret < 0)
- goto out;
-
- ret = glusterd_marker_create_volfile (volinfo);
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "Setting dict failed");
- goto out;
- }
+ key, knob_on);
}
- ret = 0;
-out:
+ out:
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
+static int
+glusterd_set_gsync_confs (glusterd_volinfo_t *volinfo)
+{
+ int ret = -1;
+ int volfile_changed = 0;
+
+ ret = glusterd_set_gsync_knob (volinfo,
+ VKEY_MARKER_XTIME, &volfile_changed);
+ if (ret)
+ goto out;
+
+ /**
+ * enable ignore-pid-check blindly as it could be needed for
+ * cascading setups.
+ */
+ ret = glusterd_set_gsync_knob (volinfo, VKEY_MARKER_XTIME_FORCE,
+ &volfile_changed);
+ if (ret)
+ goto out;
+
+ ret = glusterd_set_gsync_knob (volinfo,
+ VKEY_CHANGELOG, &volfile_changed);
+ if (ret)
+ goto out;
+ if (volfile_changed)
+ ret = glusterd_marker_changelog_create_volfile (volinfo);
+ out:
+ return ret;
+}
static int
glusterd_get_gsync_status_mst_slv (glusterd_volinfo_t *volinfo,
- char *slave, dict_t *rsp_dict)
+ char *slave, char *conf_path,
+ dict_t *rsp_dict, char *node)
{
+ char *statefile = NULL;
uuid_t uuid = {0, };
glusterd_conf_t *priv = NULL;
int ret = 0;
+ struct stat stbuf = {0, };
GF_ASSERT (volinfo);
GF_ASSERT (slave);
@@ -1705,24 +3357,45 @@ glusterd_get_gsync_status_mst_slv (glusterd_volinfo_t *volinfo,
priv = THIS->private;
ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
- if ((ret == 0) && (uuid_compare (MY_UUID, uuid) != 0))
- goto out;
-
if (ret) {
- ret = 0;
gf_log ("", GF_LOG_INFO, "geo-replication status %s %s :"
"session is not active", volinfo->volname, slave);
- goto out;
+
+ ret = glusterd_get_statefile_name (volinfo, slave,
+ conf_path, &statefile);
+ if (ret) {
+ if (!strstr(slave, "::"))
+ gf_log ("", GF_LOG_INFO,
+ "%s is not a valid slave url.", slave);
+ else
+ gf_log ("", GF_LOG_INFO, "Unable to get"
+ " statefile's name");
+ ret = 0;
+ goto out;
+ }
+
+ ret = lstat (statefile, &stbuf);
+ if (ret) {
+ gf_log ("", GF_LOG_INFO, "%s statefile not present.",
+ statefile);
+ ret = 0;
+ goto out;
+ }
}
- ret = glusterd_read_status_file (volinfo->volname, slave, rsp_dict);
- out:
+ ret = glusterd_read_status_file (volinfo, slave, conf_path,
+ rsp_dict, node);
+out:
+ if (statefile)
+ GF_FREE (statefile);
+
gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
}
static int
-glusterd_get_gsync_status_mst (glusterd_volinfo_t *volinfo, dict_t *rsp_dict)
+glusterd_get_gsync_status_mst (glusterd_volinfo_t *volinfo, dict_t *rsp_dict,
+ char *node)
{
glusterd_gsync_status_temp_t param = {0, };
@@ -1730,13 +3403,14 @@ glusterd_get_gsync_status_mst (glusterd_volinfo_t *volinfo, dict_t *rsp_dict)
param.rsp_dict = rsp_dict;
param.volinfo = volinfo;
+ param.node = node;
dict_foreach (volinfo->gsync_slaves, _get_status_mst_slv, &param);
return 0;
}
static int
-glusterd_get_gsync_status_all ( dict_t *rsp_dict)
+glusterd_get_gsync_status_all (dict_t *rsp_dict, char *node)
{
int32_t ret = 0;
@@ -1749,7 +3423,7 @@ glusterd_get_gsync_status_all ( dict_t *rsp_dict)
GF_ASSERT (priv);
list_for_each_entry (volinfo, &priv->volumes, vol_list) {
- ret = glusterd_get_gsync_status_mst (volinfo, rsp_dict);
+ ret = glusterd_get_gsync_status_mst (volinfo, rsp_dict, node);
if (ret)
goto out;
}
@@ -1765,15 +3439,22 @@ glusterd_get_gsync_status (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
{
char *slave = NULL;
char *volname = NULL;
+ char *conf_path = NULL;
char errmsg[PATH_MAX] = {0, };
gf_boolean_t exists = _gf_false;
glusterd_volinfo_t *volinfo = NULL;
int ret = 0;
+ char my_hostname[256] = {0,};
+ ret = gethostname(my_hostname, 256);
+ if (ret) {
+ /* stick to N/A */
+ (void) strcpy (my_hostname, "N/A");
+ }
ret = dict_get_str (dict, "master", &volname);
if (ret < 0){
- ret = glusterd_get_gsync_status_all (rsp_dict);
+ ret = glusterd_get_gsync_status_all (rsp_dict, my_hostname);
goto out;
}
@@ -1791,11 +3472,20 @@ glusterd_get_gsync_status (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
ret = dict_get_str (dict, "slave", &slave);
if (ret < 0) {
- ret = glusterd_get_gsync_status_mst (volinfo, rsp_dict);
+ ret = glusterd_get_gsync_status_mst (volinfo,
+ rsp_dict, my_hostname);
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "conf_path", &conf_path);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch conf file path.");
goto out;
}
- ret = glusterd_get_gsync_status_mst_slv (volinfo, slave, rsp_dict);
+ ret = glusterd_get_gsync_status_mst_slv (volinfo, slave, conf_path,
+ rsp_dict, my_hostname);
out:
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
@@ -1803,319 +3493,431 @@ glusterd_get_gsync_status (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
}
static int
-glusterd_send_sigstop (pid_t pid)
+glusterd_gsync_delete (glusterd_volinfo_t *volinfo, char *slave, char *slave_ip,
+ char *slave_vol, char *path_list, dict_t *dict,
+ dict_t *resp_dict, char **op_errstr)
{
- int ret = 0;
- ret = kill (pid, SIGSTOP);
- if (ret)
- gf_log ("", GF_LOG_ERROR, GEOREP"failed to send SIGSTOP signal");
- return ret;
-}
-
-static int
-glusterd_send_sigcont (pid_t pid)
-{
- int ret = 0;
- ret = kill (pid, SIGCONT);
- if (ret)
- gf_log ("", GF_LOG_ERROR, GEOREP"failed to send SIGCONT signal");
- return ret;
-}
-
-/*
- * Log rotations flow is something like this:
- * - Send SIGSTOP to process group (this will stop monitor/worker process
- * and also the slave if it's local)
- * - Rotate log file for monitor/worker
- * - Rotate log file for slave if it's local
- * - Send SIGCONT to the process group. Monitor wakes up, kills the worker
- * (this is done in the SIGCONT handler), which results in the termination
- * of the slave (local/remote). After returning from signal handler,
- * monitor detects absence of worker and starts it again, which in-turn
- * starts the slave.
- */
-static int
-glusterd_send_log_rotate_signal (pid_t pid, char *logfile1, char *logfile2)
-{
- int ret = 0;
- char rlogfile[PATH_MAX] = {0,};
- time_t rottime = 0;
-
- ret = glusterd_send_sigstop (-pid);
- rottime = time (NULL);
+ int32_t ret = -1;
+ runner_t runner = {0,};
+ glusterd_conf_t *priv = NULL;
+ char *master = NULL;
+ char *gl_workdir = NULL;
+ char geo_rep_dir[PATH_MAX] = "";
+ char *conf_path = NULL;
- snprintf (rlogfile, sizeof (rlogfile), "%s.%"PRIu64, logfile1,
- (uint64_t) rottime);
- ret = rename (logfile1, rlogfile);
- if (ret)
- gf_log ("", GF_LOG_ERROR, "rename failed for geo-rep log file");
+ GF_ASSERT (slave);
+ GF_ASSERT (slave_ip);
+ GF_ASSERT (slave_vol);
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (dict);
+ GF_ASSERT (resp_dict);
- if (!*logfile2) {
- gf_log ("", GF_LOG_DEBUG, "Slave is not local,"
- " skipping rotation");
- ret = 0;
+ if (THIS)
+ priv = THIS->private;
+ if (priv == NULL) {
+ gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+ *op_errstr = gf_strdup ("glusterd defunct");
goto out;
}
- (void) snprintf (rlogfile, sizeof (rlogfile), "%s.%"PRIu64, logfile2,
- (uint64_t) rottime);
- ret = rename (logfile2, rlogfile);
- if (ret)
- gf_log ("", GF_LOG_ERROR, "rename failed for geo-rep slave"
- " log file");
+ ret = dict_get_str (dict, "conf_path", &conf_path);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch conf file path.");
+ goto out;
+ }
- out:
- ret = glusterd_send_sigcont (-pid);
+ gl_workdir = priv->workdir;
+ master = "";
+ runinit (&runner);
+ runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd",
+ "--delete", "-c", NULL);
+ runner_argprintf (&runner, "%s", conf_path);
- return ret;
-}
+ if (volinfo) {
+ master = volinfo->volname;
+ runner_argprintf (&runner, ":%s", master);
+ }
+ runner_add_arg (&runner, slave);
+ runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+ synclock_unlock (&priv->big_lock);
+ ret = runner_run (&runner);
+ synclock_lock (&priv->big_lock);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "gsyncd failed to "
+ "delete session info for %s and %s peers",
+ master, slave);
-static int
-glusterd_get_pid_from_file (char *master, char *slave, pid_t *pid)
-{
- int ret = -1;
- int pfd = 0;
- char pidfile[PATH_MAX] = {0,};
- char buff[1024] = {0,};
+ gf_asprintf (op_errstr, "gsyncd failed to "
+ "delete session info for %s and %s peers",
+ master, slave);
- pfd = gsyncd_getpidfile (master, slave, pidfile);
- if (pfd == -2) {
- gf_log ("", GF_LOG_ERROR, GEOREP" log-rotate validation "
- " failed for %s & %s", master, slave);
- goto out;
- }
- if (gsync_status_byfd (pfd) == -1) {
- gf_log ("", GF_LOG_ERROR, "gsyncd b/w %s & %s is not"
- " running", master, slave);
goto out;
}
- if (pfd < 0)
- goto out;
+ ret = snprintf (geo_rep_dir, sizeof(geo_rep_dir) - 1,
+ "%s/"GEOREP"/%s_%s_%s", gl_workdir,
+ volinfo->volname, slave_ip, slave_vol);
+ geo_rep_dir[ret] = '\0';
- ret = read (pfd, buff, 1024);
- if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, GEOREP" cannot read pid from pid-file");
- goto out;
+ ret = rmdir (geo_rep_dir);
+ if (ret) {
+ if (errno == ENOENT)
+ gf_log ("", GF_LOG_DEBUG, "Geo Rep Dir(%s) Not Present.",
+ geo_rep_dir);
+ else {
+ gf_log ("", GF_LOG_ERROR, "Unable to delete "
+ "Geo Rep Dir(%s). Error: %s", geo_rep_dir,
+ strerror (errno));
+ goto out;
+ }
}
-
- *pid = strtol (buff, NULL, 10);
ret = 0;
+ gf_asprintf (op_errstr, "delete successful");
+
out:
- sys_close(pfd);
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-static int
-glusterd_do_gsync_log_rotate (char *master, char *slave, uuid_t *uuid, char **op_errstr)
+int
+glusterd_op_sys_exec (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
{
- int ret = 0;
- glusterd_conf_t *priv = NULL;
- pid_t pid = 0;
- char log_file1[PATH_MAX] = {0,};
- char log_file2[PATH_MAX] = {0,};
+ char buf[PATH_MAX] = "";
+ char cmd_arg_name[PATH_MAX] = "";
+ char output_name[PATH_MAX] = "";
+ char errmsg[PATH_MAX] = "";
+ char *ptr = NULL;
+ char *bufp = NULL;
+ char *command = NULL;
+ char **cmd_args = NULL;
+ int ret = -1;
+ int i = -1;
+ int cmd_args_count = 0;
+ int output_count = 0;
+ glusterd_conf_t *priv = NULL;
+ runner_t runner = {0,};
- GF_ASSERT (THIS);
- GF_ASSERT (THIS->private);
-
- priv = THIS->private;
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (rsp_dict);
- ret = glusterd_get_pid_from_file (master, slave, &pid);
- if (ret)
+ if (THIS)
+ priv = THIS->private;
+ if (priv == NULL) {
+ gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+ *op_errstr = gf_strdup ("glusterd defunct");
goto out;
+ }
- /* log file */
- ret = glusterd_gsyncd_getlogfile (master, slave, log_file1);
- if (ret)
+ ret = dict_get_str (dict, "command", &command);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to get command from dict");
goto out;
+ }
- /* check if slave is local or remote */
- ret = glusterd_gsync_slave_is_remote (slave);
- if (ret)
- goto do_rotate;
-
- /* slave log file - slave is local and it's log can be rotated */
- ret = glusterd_gsync_get_slave_log_file (master, slave, log_file2);
+ ret = dict_get_int32 (dict, "cmd_args_count", &cmd_args_count);
if (ret)
- goto out;
-
- do_rotate:
- ret = glusterd_send_log_rotate_signal (pid, log_file1, log_file2);
-
- out:
- if (ret && op_errstr)
- *op_errstr = gf_strdup("Error rotating log file");
- return ret;
-}
-
-static int
-glusterd_do_gsync_log_rotation_mst_slv (glusterd_volinfo_t *volinfo, char *slave,
- char **op_errstr)
-{
- uuid_t uuid = {0, };
- glusterd_conf_t *priv = NULL;
- int ret = 0;
- char errmsg[1024] = {0,};
- xlator_t *this = NULL;
+ gf_log ("", GF_LOG_INFO, "No cmd_args_count");
+
+ if (cmd_args_count) {
+ cmd_args = GF_CALLOC (cmd_args_count, sizeof (char*),
+ gf_common_mt_char);
+ if (!cmd_args) {
+ gf_log ("", GF_LOG_ERROR, "Unable to calloc. "
+ "Errno = %s", strerror(errno));
+ goto out;
+ }
- GF_ASSERT (volinfo);
- GF_ASSERT (slave);
- GF_ASSERT (THIS);
- this = THIS;
- GF_ASSERT (this->private);
- priv = this->private;
+ for (i=1; i <= cmd_args_count; i++) {
+ memset (cmd_arg_name, '\0', sizeof(cmd_arg_name));
+ snprintf (cmd_arg_name, sizeof(cmd_arg_name),
+ "cmd_arg_%d", i);
+ ret = dict_get_str (dict, cmd_arg_name, &cmd_args[i-1]);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to get %s in dict",
+ cmd_arg_name);
+ goto out;
+ }
+ }
+ }
- ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
- if ((ret == 0) && (uuid_compare (MY_UUID, uuid) != 0))
+ runinit (&runner);
+ runner_argprintf (&runner, GSYNCD_PREFIX"/peer_%s", command);
+ for (i=0; i < cmd_args_count; i++)
+ runner_add_arg (&runner, cmd_args[i]);
+ runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+ synclock_unlock (&priv->big_lock);
+ ret = runner_start (&runner);
+ if (ret == -1) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to "
+ "execute command. Error : %s",
+ strerror (errno));
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ synclock_lock (&priv->big_lock);
goto out;
+ }
+
+ do {
+ ptr = fgets(buf, sizeof(buf), runner_chio (&runner, STDOUT_FILENO));
+ if (ptr) {
+ ret = dict_get_int32 (rsp_dict, "output_count", &output_count);
+ if (ret)
+ output_count = 1;
+ else
+ output_count++;
+ memset (output_name, '\0', sizeof (output_name));
+ snprintf (output_name, sizeof (output_name),
+ "output_%d", output_count);
+ if (buf[strlen(buf) - 1] == '\n')
+ buf[strlen(buf) - 1] = '\0';
+ bufp = gf_strdup (buf);
+ if (!bufp)
+ gf_log ("", GF_LOG_ERROR, "gf_strdup failed.");
+ ret = dict_set_dynstr (rsp_dict, output_name, bufp);
+ if (ret) {
+ GF_FREE (bufp);
+ gf_log ("", GF_LOG_ERROR, "output set failed.");
+ }
+ ret = dict_set_int32 (rsp_dict, "output_count", output_count);
+ if (ret)
+ gf_log ("", GF_LOG_ERROR, "output_count set failed.");
+ }
+ } while (ptr);
+ ret = runner_end (&runner);
if (ret) {
- snprintf(errmsg, sizeof(errmsg), "geo-replication session b/w %s %s not active",
- volinfo->volname, slave);
- gf_log (this->name, GF_LOG_WARNING, "%s", errmsg);
- if (op_errstr)
- *op_errstr = gf_strdup(errmsg);
+ snprintf (errmsg, sizeof (errmsg), "Unable to "
+ "end. Error : %s",
+ strerror (errno));
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ synclock_lock (&priv->big_lock);
goto out;
}
+ synclock_lock (&priv->big_lock);
- ret = glusterd_do_gsync_log_rotate (volinfo->volname, slave, &uuid, op_errstr);
+ ret = 0;
+out:
+ if (cmd_args) {
+ GF_FREE (cmd_args);
+ cmd_args = NULL;
+ }
- out:
- gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-static int
-_iterate_log_rotate_mst_slv (dict_t *this, char *key, data_t *value, void *data)
+int
+glusterd_op_copy_file (dict_t *dict, char **op_errstr)
{
- glusterd_gsync_status_temp_t *param = NULL;
- char *slave = NULL;
-
- param = (glusterd_gsync_status_temp_t *) data;
+ char abs_filename[PATH_MAX] = "";
+ char errmsg[PATH_MAX] = "";
+ char *filename = NULL;
+ char *host_uuid = NULL;
+ char uuid_str [64] = {0};
+ char *contents = NULL;
+ char buf[1024] = "";
+ int ret = -1;
+ int fd = -1;
+ int bytes_writen = 0;
+ int bytes_read = 0;
+ int contents_size = -1;
+ int file_mode = -1;
+ glusterd_conf_t *priv = NULL;
+ struct stat stbuf = {0,};
- GF_ASSERT (param);
- GF_ASSERT (param->volinfo);
- slave = strchr (value->data, ':');
- if (slave)
- slave++;
- else {
- gf_log ("", GF_LOG_ERROR, "geo-replication log-rotate: slave (%s) "
- "not conforming to format", slave);
- return -1;
+ if (THIS)
+ priv = THIS->private;
+ if (priv == NULL) {
+ gf_log ("", GF_LOG_ERROR, "priv of glusterd not present");
+ *op_errstr = gf_strdup ("glusterd defunct");
+ goto out;
}
- (void) glusterd_do_gsync_log_rotation_mst_slv (param->volinfo, slave, NULL);
- return 0;
-}
+ ret = dict_get_str (dict, "host-uuid", &host_uuid);
+ if (ret < 0)
+ goto out;
-static int
-glusterd_do_gsync_log_rotation_mst (glusterd_volinfo_t *volinfo)
-{
- glusterd_gsync_status_temp_t param = {0, };
+ ret = dict_get_str (dict, "source", &filename);
+ if (ret < 0) {
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch"
+ " filename from dict.");
+ *op_errstr = gf_strdup ("command unsuccessful");
+ goto out;
+ }
+ snprintf (abs_filename, sizeof(abs_filename),
+ "%s/%s", priv->workdir, filename);
- GF_ASSERT (volinfo);
+ uuid_utoa_r (MY_UUID, uuid_str);
+ if (!strcmp (uuid_str, host_uuid)) {
+ ret = lstat (abs_filename, &stbuf);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Source file"
+ " does not exist in %s", priv->workdir);
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
- param.volinfo = volinfo;
- dict_foreach (volinfo->gsync_slaves, _iterate_log_rotate_mst_slv, &param);
- return 0;
-}
+ contents = GF_CALLOC(1, stbuf.st_size+1, gf_common_mt_char);
+ if (!contents) {
+ snprintf (errmsg, sizeof (errmsg),
+ "Unable to allocate memory");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
-static int
-glusterd_rotate_gsync_all ()
-{
- int32_t ret = 0;
- glusterd_conf_t *priv = NULL;
- glusterd_volinfo_t *volinfo = NULL;
+ fd = open (abs_filename, O_RDONLY);
+ if (fd < 0) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to open %s",
+ abs_filename);
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
- GF_ASSERT (THIS);
- priv = THIS->private;
+ do {
+ ret = read (fd, buf, sizeof(buf));
+ if (ret > 0) {
+ memcpy (contents+bytes_read, buf, ret);
+ bytes_read += ret;
+ memset (buf, '\0', sizeof(buf));
+ }
+ } while (ret > 0);
- GF_ASSERT (priv);
+ if (bytes_read != stbuf.st_size) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to read all "
+ "the data from %s", abs_filename);
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
- list_for_each_entry (volinfo, &priv->volumes, vol_list) {
- ret = glusterd_do_gsync_log_rotation_mst (volinfo);
- if (ret)
+ ret = dict_set_int32 (dict, "contents_size", stbuf.st_size);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to set"
+ " contents size in dict.");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
goto out;
- }
+ }
- out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
- return ret;
-}
+ ret = dict_set_int32 (dict, "file_mode",
+ (int32_t)stbuf.st_mode);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to set"
+ " file mode in dict.");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
-static int
-glusterd_rotate_gsync_logs (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
-{
- char *slave = NULL;
- char *volname = NULL;
- char errmsg[1024] = {0,};
- gf_boolean_t exists = _gf_false;
- glusterd_volinfo_t *volinfo = NULL;
- char **linearr = NULL;
- int ret = 0;
+ ret = dict_set_bin (dict, "common_pem_contents",
+ contents, stbuf.st_size);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to set"
+ " pem contents in dict.");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
+ close (fd);
+ } else {
+ ret = dict_get_bin (dict, "common_pem_contents",
+ (void **) &contents);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to get"
+ " pem contents in dict.");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
- ret = dict_get_str (dict, "master", &volname);
- if (ret < 0) {
- ret = glusterd_rotate_gsync_all ();
- goto out;
- }
+ ret = dict_get_int32 (dict, "contents_size", &contents_size);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to set"
+ " contents size in dict.");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
- exists = glusterd_check_volume_exists (volname);
- ret = glusterd_volinfo_find (volname, &volinfo);
- if ((ret) || (!exists)) {
- snprintf (errmsg, sizeof(errmsg), "Volume %s does not"
- " exist", volname);
- gf_log ("", GF_LOG_WARNING, "%s", errmsg);
- *op_errstr = gf_strdup (errmsg);
- ret = -1;
- goto out;
- }
+ ret = dict_get_int32 (dict, "file_mode", &file_mode);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to get"
+ " file mode in dict.");
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
- ret = dict_get_str (dict, "slave", &slave);
- if (ret < 0) {
- ret = glusterd_do_gsync_log_rotation_mst (volinfo);
- goto out;
- }
+ fd = open (abs_filename, O_WRONLY | O_TRUNC | O_CREAT, 0600);
+ if (fd < 0) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to open %s",
+ abs_filename);
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
- /* for the given slave use the normalized url */
- ret = glusterd_urltransform_single (slave, "normalize", &linearr);
- if (ret == -1)
- goto out;
+ bytes_writen = write (fd, contents, contents_size);
- ret = glusterd_do_gsync_log_rotation_mst_slv (volinfo, linearr[0],
- op_errstr);
- if (ret)
- gf_log ("gsyncd", GF_LOG_ERROR, "gsyncd log-rotate failed for"
- " %s & %s", volname, slave);
+ if (bytes_writen != contents_size) {
+ snprintf (errmsg, sizeof (errmsg), "Failed to write"
+ " to %s", abs_filename);
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
- glusterd_urltransform_free (linearr, 1);
- out:
+ fchmod (fd, file_mode);
+ close (fd);
+ }
+
+ ret = 0;
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-
int
glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
{
int32_t ret = -1;
int32_t type = -1;
- dict_t *ctx = NULL;
- dict_t *resp_dict = NULL;
char *host_uuid = NULL;
char *slave = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
char *volname = NULL;
+ char *path_list = NULL;
glusterd_volinfo_t *volinfo = NULL;
glusterd_conf_t *priv = NULL;
+ gf_boolean_t is_force = _gf_false;
char *status_msg = NULL;
- uuid_t uuid = {0, };
+ gf_boolean_t is_running = _gf_false;
+ char *conf_path = NULL;
GF_ASSERT (THIS);
GF_ASSERT (THIS->private);
GF_ASSERT (dict);
GF_ASSERT (op_errstr);
+ GF_ASSERT (rsp_dict);
priv = THIS->private;
@@ -2127,37 +3929,67 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
if (ret < 0)
goto out;
- ctx = glusterd_op_get_ctx ();
- resp_dict = ctx ? ctx : rsp_dict;
- GF_ASSERT (resp_dict);
-
if (type == GF_GSYNC_OPTION_TYPE_STATUS) {
- ret = glusterd_get_gsync_status (dict, op_errstr, resp_dict);
+ ret = glusterd_get_gsync_status (dict, op_errstr, rsp_dict);
goto out;
}
- if (type == GF_GSYNC_OPTION_TYPE_ROTATE) {
- ret = glusterd_rotate_gsync_logs (dict, op_errstr, resp_dict);
+ ret = dict_get_str (dict, "slave", &slave);
+ if (ret < 0)
+ goto out;
+
+ ret = dict_get_str (dict, "slave_ip", &slave_ip);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch slave volume name.");
goto out;
+ }
+ ret = dict_get_str (dict, "slave_vol", &slave_vol);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch slave volume name.");
+ goto out;
}
- ret = dict_get_str (dict, "slave", &slave);
- if (ret < 0)
+ ret = dict_get_str (dict, "conf_path", &conf_path);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch conf file path.");
goto out;
+ }
if (dict_get_str (dict, "master", &volname) == 0) {
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- gf_log ("", GF_LOG_WARNING, "Volinfo for %s (master) not found",
- volname);
+ gf_log ("", GF_LOG_WARNING, "Volinfo for %s (master)"
+ " not found", volname);
goto out;
}
+
+ ret = glusterd_get_local_brickpaths (volinfo, &path_list);
}
if (type == GF_GSYNC_OPTION_TYPE_CONFIG) {
- ret = glusterd_gsync_configure (volinfo, slave, dict, resp_dict,
- op_errstr);
+ ret = glusterd_gsync_configure (volinfo, slave, path_list,
+ dict, rsp_dict, op_errstr);
+ if (!ret) {
+ ret = dict_set_str (rsp_dict, "conf_path", conf_path);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to store conf_file_path.");
+ goto out;
+ }
+ }
+ goto out;
+ }
+
+ if (type == GF_GSYNC_OPTION_TYPE_DELETE) {
+ ret = glusterd_remove_slave_in_info(volinfo, slave, op_errstr);
+ if (ret && !is_force && path_list)
+ goto out;
+
+ ret = glusterd_gsync_delete (volinfo, slave, slave_ip,
+ slave_vol, path_list, dict,
+ rsp_dict, op_errstr);
goto out;
}
@@ -2166,48 +3998,671 @@ glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
goto out;
}
+ is_force = dict_get_str_boolean (dict, "force", _gf_false);
+
if (type == GF_GSYNC_OPTION_TYPE_START) {
- ret = glusterd_set_marker_gsync (volinfo);
+ ret = glusterd_set_gsync_confs (volinfo);
if (ret != 0) {
- gf_log ("", GF_LOG_WARNING, "marker start failed");
- *op_errstr = gf_strdup ("failed to initialize indexing");
+ gf_log ("", GF_LOG_WARNING, "marker/changelog"
+ " start failed");
+ *op_errstr = gf_strdup ("Index initialization failed");
ret = -1;
goto out;
}
- ret = glusterd_store_slave_in_info(volinfo, slave,
- host_uuid, op_errstr);
- if (ret)
- goto out;
- ret = glusterd_start_gsync (volinfo, slave, host_uuid,
- op_errstr);
+ ret = glusterd_start_gsync (volinfo, slave, path_list,
+ conf_path, host_uuid, op_errstr);
}
if (type == GF_GSYNC_OPTION_TYPE_STOP) {
-
- ret = glusterd_gsync_get_uuid (slave, volinfo, uuid);
- if (ret) {
+ ret = glusterd_check_gsync_running_local (volinfo->volname,
+ slave, conf_path,
+ &is_running);
+ if (!ret && !is_force && path_list &&
+ (_gf_true != is_running)) {
gf_log ("", GF_LOG_WARNING, GEOREP" is not set up for"
"%s(master) and %s(slave)", volname, slave);
*op_errstr = strdup (GEOREP" is not set up");
goto out;
}
- ret = glusterd_remove_slave_in_info(volinfo, slave, op_errstr);
- if (ret)
+ ret = stop_gsync (volname, slave, &status_msg,
+ conf_path, is_force);
+ if (ret == 0 && status_msg)
+ ret = dict_set_str (rsp_dict, "gsync-status",
+ status_msg);
+ if (ret != 0 && !is_force && path_list)
+ *op_errstr = gf_strdup ("internal error");
+
+ if (!ret) {
+ ret = glusterd_create_status_file (volinfo->volname,
+ slave, slave_ip,
+ slave_vol,"Stopped");
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to update"
+ "state_file. Error : %s",
+ strerror (errno));
+ }
+ }
+ }
+
+out:
+ if (path_list) {
+ GF_FREE (path_list);
+ path_list = NULL;
+ }
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_get_slave_details_confpath (glusterd_volinfo_t *volinfo, dict_t *dict,
+ char **slave_ip, char **slave_vol,
+ char **conf_path, char **op_errstr)
+{
+ int ret = -1;
+ char confpath[PATH_MAX] = "";
+ glusterd_conf_t *priv = NULL;
+ char *slave = NULL;
+
+ GF_ASSERT (THIS);
+ priv = THIS->private;
+ GF_ASSERT (priv);
+
+ ret = dict_get_str (dict, "slave", &slave);
+ if (ret || !slave) {
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch slave from dict");
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_get_slave_info (slave, slave_ip, slave_vol, op_errstr);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave details.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "slave_ip", *slave_ip);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to store slave IP.");
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "slave_vol", *slave_vol);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to store slave volume name.");
+ goto out;
+ }
+
+ ret = snprintf (confpath, sizeof(confpath) - 1,
+ "%s/"GEOREP"/%s_%s_%s/gsyncd.conf",
+ priv->workdir, volinfo->volname,
+ *slave_ip, *slave_vol);
+ confpath[ret] = '\0';
+ *conf_path = gf_strdup (confpath);
+ if (!(*conf_path)) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to gf_strdup. Error: %s", strerror (errno));
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "conf_path", *conf_path);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to store conf_path");
+ goto out;
+ }
+
+out:
+ gf_log ("", GF_LOG_DEBUG,"Returning %d", ret);
+ return ret;
+
+}
+
+int
+glusterd_get_slave_info (char *slave, char **slave_ip,
+ char **slave_vol, char **op_errstr)
+{
+ char *tmp = NULL;
+ char *save_ptr = NULL;
+ char **linearr = NULL;
+ int32_t ret = -1;
+ char errmsg[PATH_MAX] = "";
+
+ ret = glusterd_urltransform_single (slave, "normalize",
+ &linearr);
+ if (ret == -1) {
+ ret = snprintf (errmsg, sizeof(errmsg) - 1,
+ "Invalid Url: %s", slave);
+ errmsg[ret] = '\0';
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "Failed to normalize url");
+ goto out;
+ }
+
+ tmp = strtok_r (linearr[0], "/", &save_ptr);
+ tmp = strtok_r (NULL, "/", &save_ptr);
+ slave = strtok_r (tmp, ":", &save_ptr);
+ if (slave) {
+ ret = glusterd_mountbroker_check (&slave, op_errstr);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Invalid slave url: %s", *op_errstr);
+ goto out;
+ }
+
+ *slave_ip = gf_strdup (slave);
+ if (!*slave_ip) {
+ gf_log ("", GF_LOG_ERROR,
+ "Failed to gf_strdup");
+ ret = -1;
+ goto out;
+ }
+ gf_log ("", GF_LOG_DEBUG, "Slave IP : %s", *slave_ip);
+ ret = 0;
+ } else {
+ gf_log ("", GF_LOG_ERROR, "Invalid slave name");
+ goto out;
+ }
+
+ slave = strtok_r (NULL, ":", &save_ptr);
+ if (slave) {
+ *slave_vol = gf_strdup (slave);
+ if (!*slave_vol) {
+ gf_log ("", GF_LOG_ERROR,
+ "Failed to gf_strdup");
+ ret = -1;
goto out;
+ }
+ gf_log ("", GF_LOG_DEBUG, "Slave Vol : %s", *slave_vol);
+ ret = 0;
+ } else {
+ gf_log ("", GF_LOG_ERROR, "Invalid slave name");
+ goto out;
+ }
+
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+static void
+runinit_gsyncd_setrx (runner_t *runner, char *conf_path)
+{
+ runinit (runner);
+ runner_add_args (runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
+ runner_argprintf (runner, "%s", conf_path);
+ runner_add_arg (runner, "--config-set-rx");
+}
+
+static int
+glusterd_check_gsync_present (int *valid_state)
+{
+ char buff[PATH_MAX] = {0, };
+ runner_t runner = {0,};
+ char *ptr = NULL;
+ int ret = 0;
+
+ runinit (&runner);
+ runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "--version", NULL);
+ runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+ ret = runner_start (&runner);
+ if (ret == -1) {
+ if (errno == ENOENT) {
+ gf_log ("glusterd", GF_LOG_INFO, GEOREP
+ " module not installed in the system");
+ *valid_state = 0;
+ }
+ else {
+ gf_log ("glusterd", GF_LOG_ERROR, GEOREP
+ " module not working as desired");
+ *valid_state = -1;
+ }
+ goto out;
+ }
- if (uuid_compare (MY_UUID, uuid) != 0) {
+ ptr = fgets(buff, sizeof(buff), runner_chio (&runner, STDOUT_FILENO));
+ if (ptr) {
+ if (!strstr (buff, "gsyncd")) {
+ ret = -1;
+ gf_log ("glusterd", GF_LOG_ERROR, GEOREP" module not "
+ "working as desired");
+ *valid_state = -1;
goto out;
}
+ } else {
+ ret = -1;
+ gf_log ("glusterd", GF_LOG_ERROR, GEOREP" module not "
+ "working as desired");
+ *valid_state = -1;
+ goto out;
+ }
- ret = stop_gsync (volname, slave, &status_msg);
- if (ret == 0 && status_msg)
- ret = dict_set_str (resp_dict, "gsync-status",
- status_msg);
- if (ret != 0)
- *op_errstr = gf_strdup ("internal error");
+ ret = 0;
+ out:
+
+ runner_end (&runner);
+
+ gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+
+}
+
+static int
+create_conf_file (glusterd_conf_t *conf, char *conf_path)
+#define RUN_GSYNCD_CMD do { \
+ ret = runner_run_reuse (&runner); \
+ if (ret == -1) { \
+ runner_log (&runner, "glusterd", GF_LOG_ERROR, "command failed"); \
+ runner_end (&runner); \
+ goto out; \
+ } \
+ runner_end (&runner); \
+} while (0)
+{
+ int ret = 0;
+ runner_t runner = {0,};
+ char georepdir[PATH_MAX] = {0,};
+ int valid_state = 0;
+
+ valid_state = -1;
+ ret = glusterd_check_gsync_present (&valid_state);
+ if (-1 == ret) {
+ ret = valid_state;
+ goto out;
+ }
+
+ ret = snprintf (georepdir, sizeof(georepdir) - 1, "%s/"GEOREP,
+ conf->workdir);
+ georepdir[ret] = '\0';
+
+ /************
+ * master pre-configuration
+ ************/
+
+ /* remote-gsyncd */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args (&runner, "remote-gsyncd", GSYNCD_PREFIX"/gsyncd", ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args (&runner, "remote-gsyncd", "/nonexistent/gsyncd",
+ ".", "^ssh:", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* gluster-command-dir */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args (&runner, "gluster-command-dir", SBIN_DIR"/",
+ ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* gluster-params */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args (&runner, "gluster-params",
+ "aux-gfid-mount",
+ ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* ssh-command */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_arg (&runner, "ssh-command");
+ runner_argprintf (&runner,
+ "ssh -oPasswordAuthentication=no "
+ "-oStrictHostKeyChecking=no "
+ "-i %s/secret.pem", georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* ssh-command tar */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_arg (&runner, "ssh-command-tar");
+ runner_argprintf (&runner,
+ "ssh -oPasswordAuthentication=no "
+ "-oStrictHostKeyChecking=no "
+ "-i %s/tar_ssh.pem", georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* pid-file */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_arg (&runner, "pid-file");
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.pid", georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* geo-rep-working-dir */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_arg (&runner, "georep-session-working-dir");
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/", georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* state-file */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_arg (&runner, "state-file");
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.status", georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* state-detail-file */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_arg (&runner, "state-detail-file");
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}-detail.status", georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* state-socket */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_arg (&runner, "state-socket-unencoded");
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.socket", georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* socketdir */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args (&runner, "socketdir", GLUSTERD_SOCK_DIR, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* log-file */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args (&runner,
+ "log-file",
+ DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/${mastervol}/${eSlave}.log",
+ ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* gluster-log-file */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args (&runner,
+ "gluster-log-file",
+ DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/${mastervol}/${eSlave}${local_id}.gluster.log",
+ ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* ignore-deletes */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args (&runner, "ignore-deletes", "true", ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* special-sync-mode */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args (&runner, "special-sync-mode", "partial", ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* change-detector == changelog */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args(&runner, "change-detector", "changelog", ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_arg(&runner, "working-dir");
+ runner_argprintf(&runner, "%s/${mastervol}/${eSlave}",
+ DEFAULT_VAR_RUN_DIRECTORY);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /************
+ * slave pre-configuration
+ ************/
+
+ /* gluster-command-dir */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args (&runner, "gluster-command-dir", SBIN_DIR"/",
+ ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* gluster-params */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args (&runner, "gluster-params",
+ "aux-gfid-mount",
+ ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* log-file */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args (&runner,
+ "log-file",
+ DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"-slaves/${session_owner}:${eSlave}.log",
+ ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* MountBroker log-file */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args (&runner,
+ "log-file-mbr",
+ DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"-slaves/mbr/${session_owner}:${eSlave}.log",
+ ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* gluster-log-file */
+ runinit_gsyncd_setrx (&runner, conf_path);
+ runner_add_args (&runner,
+ "gluster-log-file",
+ DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"-slaves/${session_owner}:${eSlave}.gluster.log",
+ ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ out:
+ return ret ? -1 : 0;
+}
+
+static int
+glusterd_create_essential_dir_files (glusterd_volinfo_t *volinfo, dict_t *dict,
+ char *slave, char *slave_ip,
+ char *slave_vol, char **op_errstr)
+{
+ int ret = -1;
+ char *conf_path = NULL;
+ char *statefile = NULL;
+ char buf[PATH_MAX] = "";
+ char errmsg[PATH_MAX] = "";
+ glusterd_conf_t *conf = NULL;
+ struct stat stbuf = {0,};
+
+ GF_ASSERT (THIS);
+ conf = THIS->private;
+
+ ret = dict_get_str (dict, "conf_path", &conf_path);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg),
+ "Unable to fetch conf file path.");
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "statefile", &statefile);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg),
+ "Unable to fetch statefile path.");
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
+
+ ret = snprintf (buf, sizeof(buf) - 1, "%s/"GEOREP"/%s_%s_%s",
+ conf->workdir, volinfo->volname, slave_ip, slave_vol);
+ buf[ret] = '\0';
+ ret = mkdir_p (buf, 0777, _gf_true);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to create %s"
+ ". Error : %s", buf, strerror (errno));
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
+
+ ret = snprintf (buf, PATH_MAX, DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/%s",
+ volinfo->volname);
+ buf[ret] = '\0';
+ ret = mkdir_p (buf, 0777, _gf_true);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to create %s"
+ ". Error : %s", buf, strerror (errno));
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
+
+ ret = lstat (conf_path, &stbuf);
+ if (!ret) {
+ gf_log ("", GF_LOG_DEBUG, "Session already running."
+ " Not creating config file again.");
+ } else {
+ ret = create_conf_file (conf, conf_path);
+ if (ret || lstat (conf_path, &stbuf)) {
+ snprintf (errmsg, sizeof (errmsg), "Failed to create"
+ " config file(%s).", conf_path);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
+ }
+
+ ret = lstat (statefile, &stbuf);
+ if (!ret) {
+ gf_log ("", GF_LOG_DEBUG, "Session already running."
+ " Not creating status file again.");
+ goto out;
+ } else {
+ ret = glusterd_create_status_file (volinfo->volname, slave,
+ slave_ip, slave_vol,
+ "Not Started");
+ if (ret || lstat (statefile, &stbuf)) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to create %s"
+ ". Error : %s", statefile, strerror (errno));
+ *op_errstr = gf_strdup (errmsg);
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
+ }
+
+out:
+ gf_log ("", GF_LOG_DEBUG,"Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_op_gsync_create (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+ char common_pem_file[PATH_MAX] = "";
+ char errmsg[PATH_MAX] = "";
+ char hooks_args[PATH_MAX] = "";
+ char uuid_str [64] = "";
+ char *host_uuid = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
+ char *arg_buf = NULL;
+ char *volname = NULL;
+ char *slave = NULL;
+ int32_t ret = -1;
+ int32_t is_pem_push = -1;
+ gf_boolean_t is_force = -1;
+ glusterd_conf_t *conf = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+
+ GF_ASSERT (THIS);
+ conf = THIS->private;
+ GF_ASSERT (conf);
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+
+ ret = glusterd_op_gsync_args_get (dict, op_errstr,
+ &volname, &slave, &host_uuid);
+ if (ret)
+ goto out;
+
+ snprintf (common_pem_file, sizeof(common_pem_file),
+ "%s"GLUSTERD_COMMON_PEM_PUB_FILE, conf->workdir);
+
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Volinfo for %s"
+ " (master) not found", volname);
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "slave_vol", &slave_vol);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg),
+ "Unable to fetch slave volume name.");
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "slave_ip", &slave_ip);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg),
+ "Unable to fetch slave IP.");
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ ret = -1;
+ goto out;
+ }
+
+ is_force = dict_get_str_boolean (dict, "force", _gf_false);
+
+ uuid_utoa_r (MY_UUID, uuid_str);
+ if (!strcmp (uuid_str, host_uuid)) {
+ ret = dict_get_int32 (dict, "push_pem", &is_pem_push);
+ if (!ret && is_pem_push) {
+ gf_log ("", GF_LOG_DEBUG, "Trying to setup"
+ " pem files in slave");
+ is_pem_push = 1;
+ } else
+ is_pem_push = 0;
+
+ snprintf(hooks_args, sizeof(hooks_args),
+ "is_push_pem=%d pub_file=%s slave_ip=%s",
+ is_pem_push, common_pem_file, slave_ip);
+
+ } else
+ snprintf(hooks_args, sizeof(hooks_args),
+ "This argument will stop the hooks script");
+
+ arg_buf = gf_strdup (hooks_args);
+ if (!arg_buf) {
+ gf_log ("", GF_LOG_ERROR, "Failed to"
+ " gf_strdup");
+ if (is_force) {
+ ret = 0;
+ goto create_essentials;
+ }
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_str (dict, "hooks_args", arg_buf);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Failed to set"
+ " hooks_args in dict.");
+ if (is_force) {
+ ret = 0;
+ goto create_essentials;
+ }
+ goto out;
+ }
+
+create_essentials:
+
+ ret = glusterd_create_essential_dir_files (volinfo, dict, slave,
+ slave_ip, slave_vol,
+ op_errstr);
+ if (ret)
+ goto out;
+
+ ret = glusterd_store_slave_in_info (volinfo, slave,
+ host_uuid, op_errstr,
+ is_force);
+ if (ret) {
+ snprintf (errmsg, sizeof (errmsg), "Unable to store"
+ " slave info.");
+ gf_log ("", GF_LOG_ERROR, "%s", errmsg);
+ goto out;
}
out:
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
index a3106342d..e0373c774 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
@@ -33,6 +33,7 @@
#include "glusterd-op-sm.h"
#include "glusterd-utils.h"
#include "glusterd-store.h"
+#include "glusterd-locks.h"
#include "glusterd1-xdr.h"
#include "cli1-xdr.h"
@@ -49,6 +50,37 @@
#include "globals.h"
#include "glusterd-syncop.h"
+#include "glusterd-etcd.h"
+
+#ifdef HAVE_BD_XLATOR
+#include <lvm2app.h>
+#endif
+
+extern glusterd_op_info_t opinfo;
+
+int glusterd_big_locked_notify (struct rpc_clnt *rpc, void *mydata,
+ rpc_clnt_event_t event,
+ void *data, rpc_clnt_notify_t notify_fn)
+{
+ glusterd_conf_t *priv = THIS->private;
+ int ret = -1;
+ synclock_lock (&priv->big_lock);
+ ret = notify_fn (rpc, mydata, event, data);
+ synclock_unlock (&priv->big_lock);
+ return ret;
+}
+
+int glusterd_big_locked_handler (rpcsvc_request_t *req, rpcsvc_actor actor_fn)
+{
+ glusterd_conf_t *priv = THIS->private;
+ int ret = -1;
+
+ synclock_lock (&priv->big_lock);
+ ret = actor_fn (req);
+ synclock_unlock (&priv->big_lock);
+
+ return ret;
+}
static int
glusterd_handle_friend_req (rpcsvc_request_t *req, uuid_t uuid,
@@ -71,8 +103,8 @@ glusterd_handle_friend_req (rpcsvc_request_t *req, uuid_t uuid,
ret = glusterd_friend_find (uuid, rhost, &peerinfo);
if (ret) {
- ret = glusterd_xfer_friend_add_resp (req, rhost, port, -1,
- GF_PROBE_UNKNOWN_PEER);
+ ret = glusterd_xfer_friend_add_resp (req, hostname, rhost, port,
+ -1, GF_PROBE_UNKNOWN_PEER);
if (friend_req->vols.vols_val) {
free (friend_req->vols.vols_val);
friend_req->vols.vols_val = NULL;
@@ -226,13 +258,14 @@ glusterd_add_peer_detail_to_dict (glusterd_peerinfo_t *peerinfo,
int ret = -1;
char key[256] = {0, };
+ char *peer_uuid_str = NULL;
GF_ASSERT (peerinfo);
GF_ASSERT (friends);
snprintf (key, 256, "friend%d.uuid", count);
- uuid_utoa_r (peerinfo->uuid, peerinfo->uuid_str);
- ret = dict_set_str (friends, key, peerinfo->uuid_str);
+ peer_uuid_str = gd_peer_uuid_str (peerinfo);
+ ret = dict_set_str (friends, key, peer_uuid_str);
if (ret)
goto out;
@@ -278,8 +311,23 @@ _build_option_key (dict_t *d, char *k, data_t *v, void *tmp)
char reconfig_key[256] = {0, };
struct args_pack *pack = NULL;
int ret = -1;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
pack = tmp;
+ if (strcmp (k, GLUSTERD_GLOBAL_OPT_VERSION) == 0)
+ return 0;
+
+ if (priv->op_version > GD_OP_VERSION_MIN) {
+ if ((strcmp (k, "features.limit-usage") == 0) ||
+ (strcmp (k, "features.soft-limit") == 0))
+ return 0;
+ }
snprintf (reconfig_key, 256, "volume%d.option.%s",
pack->vol_count, k);
ret = dict_set_str (pack->dict, reconfig_key, v->data);
@@ -303,12 +351,14 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo,
glusterd_conf_t *priv = NULL;
char *volume_id_str = NULL;
struct args_pack pack = {0,};
-
+ xlator_t *this = NULL;
+ GF_UNUSED int caps = 0;
GF_ASSERT (volinfo);
GF_ASSERT (volumes);
- priv = THIS->private;
+ this = THIS;
+ priv = this->private;
GF_ASSERT (priv);
@@ -327,6 +377,21 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo,
if (ret)
goto out;
+ /* As of now, the snap volumes are also displayed as part of
+ volume info command. So this change is to display whether
+ the volume is original volume or the snap_volume. If
+ displaying of snap volumes in volume info o/p is not needed
+ this should be removed.
+ */
+ snprintf (key, 256, "volume%d.snap_volume", count);
+ ret = dict_set_int32 (volumes, key, volinfo->is_snap_volume);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "failed to set whether "
+ "the volume is a snap volume or actual volume (%s)",
+ volinfo->volname);
+ goto out;
+ }
+
snprintf (key, 256, "volume%d.brick_count", count);
ret = dict_set_int32 (volumes, key, volinfo->brick_count);
if (ret)
@@ -362,12 +427,97 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo,
goto out;
snprintf (key, 256, "volume%d.rebalance", count);
- ret = dict_set_int32 (volumes, key, volinfo->defrag_cmd);
+ ret = dict_set_int32 (volumes, key, volinfo->rebal.defrag_cmd);
if (ret)
goto out;
+#ifdef HAVE_BD_XLATOR
+ if (volinfo->caps) {
+ caps = 0;
+ snprintf (key, 256, "volume%d.xlator0", count);
+ buf = GF_MALLOC (256, gf_common_mt_char);
+ if (!buf) {
+ ret = ENOMEM;
+ goto out;
+ }
+ if (volinfo->caps & CAPS_BD)
+ snprintf (buf, 256, "BD");
+ ret = dict_set_dynstr (volumes, key, buf);
+ if (ret) {
+ GF_FREE (buf);
+ goto out;
+ }
+
+ if (volinfo->caps & CAPS_THIN) {
+ snprintf (key, 256, "volume%d.xlator0.caps%d", count,
+ caps++);
+ buf = GF_MALLOC (256, gf_common_mt_char);
+ if (!buf) {
+ ret = ENOMEM;
+ goto out;
+ }
+ snprintf (buf, 256, "thin");
+ ret = dict_set_dynstr (volumes, key, buf);
+ if (ret) {
+ GF_FREE (buf);
+ goto out;
+ }
+ }
+
+ if (volinfo->caps & CAPS_OFFLOAD_COPY) {
+ snprintf (key, 256, "volume%d.xlator0.caps%d", count,
+ caps++);
+ buf = GF_MALLOC (256, gf_common_mt_char);
+ if (!buf) {
+ ret = ENOMEM;
+ goto out;
+ }
+ snprintf (buf, 256, "offload_copy");
+ ret = dict_set_dynstr (volumes, key, buf);
+ if (ret) {
+ GF_FREE (buf);
+ goto out;
+ }
+ }
+
+ if (volinfo->caps & CAPS_OFFLOAD_SNAPSHOT) {
+ snprintf (key, 256, "volume%d.xlator0.caps%d", count,
+ caps++);
+ buf = GF_MALLOC (256, gf_common_mt_char);
+ if (!buf) {
+ ret = ENOMEM;
+ goto out;
+ }
+ snprintf (buf, 256, "offload_snapshot");
+ ret = dict_set_dynstr (volumes, key, buf);
+ if (ret) {
+ GF_FREE (buf);
+ goto out;
+ }
+ }
+
+ if (volinfo->caps & CAPS_OFFLOAD_ZERO) {
+ snprintf (key, 256, "volume%d.xlator0.caps%d", count,
+ caps++);
+ buf = GF_MALLOC (256, gf_common_mt_char);
+ if (!buf) {
+ ret = ENOMEM;
+ goto out;
+ }
+ snprintf (buf, 256, "offload_zerofill");
+ ret = dict_set_dynstr (volumes, key, buf);
+ if (ret) {
+ GF_FREE (buf);
+ goto out;
+ }
+ }
+
+ }
+#endif
+
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
char brick[1024] = {0,};
+ char brick_uuid[64] = {0,};
snprintf (key, 256, "volume%d.brick%d", count, i);
snprintf (brick, 1024, "%s:%s", brickinfo->hostname,
brickinfo->path);
@@ -375,6 +525,25 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo,
ret = dict_set_dynstr (volumes, key, buf);
if (ret)
goto out;
+ snprintf (key, 256, "volume%d.brick%d.uuid", count, i);
+ snprintf (brick_uuid, 64, "%s", uuid_utoa (brickinfo->uuid));
+ buf = gf_strdup (brick_uuid);
+ if (!buf)
+ goto out;
+ ret = dict_set_dynstr (volumes, key, buf);
+ if (ret)
+ goto out;
+
+#ifdef HAVE_BD_XLATOR
+ if (volinfo->caps & CAPS_BD) {
+ snprintf (key, 256, "volume%d.vg%d", count, i);
+ snprintf (brick, 1024, "%s", brickinfo->vg);
+ buf = gf_strdup (brick);
+ ret = dict_set_dynstr (volumes, key, buf);
+ if (ret)
+ goto out;
+ }
+#endif
i++;
}
@@ -388,6 +557,7 @@ glusterd_add_volume_detail_to_dict (glusterd_volinfo_t *volinfo,
pack.vol_count = count;
pack.opt_count = 0;
dict_foreach (dict, _build_option_key, (void *) &pack);
+ dict_foreach (priv->opts, _build_option_key, &pack);
snprintf (key, 256, "volume%d.opt_count", pack.vol_count);
ret = dict_set_int32 (volumes, key, pack.opt_count);
@@ -400,13 +570,18 @@ glusterd_friend_find (uuid_t uuid, char *hostname,
glusterd_peerinfo_t **peerinfo)
{
int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
if (uuid) {
ret = glusterd_friend_find_by_uuid (uuid, peerinfo);
if (ret) {
- gf_log ("glusterd", GF_LOG_INFO,
- "Unable to find peer by uuid");
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Unable to find peer by uuid: %s",
+ uuid_utoa (uuid));
} else {
goto out;
}
@@ -417,7 +592,7 @@ glusterd_friend_find (uuid_t uuid, char *hostname,
ret = glusterd_friend_find_by_hostname (hostname, peerinfo);
if (ret) {
- gf_log ("glusterd", GF_LOG_INFO,
+ gf_log (this->name, GF_LOG_DEBUG,
"Unable to find hostname: %s", hostname);
} else {
goto out;
@@ -429,73 +604,186 @@ out:
}
int32_t
-glusterd_op_txn_begin (rpcsvc_request_t *req, glusterd_op_t op, void *ctx)
+glusterd_op_txn_begin (rpcsvc_request_t *req, glusterd_op_t op, void *ctx,
+ char *err_str, size_t err_len)
{
- int32_t ret = -1;
- xlator_t *this = NULL;
- glusterd_conf_t *priv = NULL;
- int32_t locked = 0;
+ int32_t ret = -1;
+ dict_t *dict = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ int32_t locked = 0;
+ char *tmp = NULL;
+ char *volname = NULL;
+ uuid_t *txn_id = NULL;
+ glusterd_op_info_t txn_op_info = {{0},};
+ glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
GF_ASSERT (req);
GF_ASSERT ((op > GD_OP_NONE) && (op < GD_OP_MAX));
GF_ASSERT (NULL != ctx);
this = THIS;
+ GF_ASSERT (this);
priv = this->private;
GF_ASSERT (priv);
- ret = glusterd_lock (MY_UUID);
+ dict = ctx;
+ /* Generate a transaction-id for this operation and
+ * save it in the dict. This transaction id distinguishes
+ * each transaction, and helps separate opinfos in the
+ * op state machine. */
+ ret = glusterd_generate_txn_id (dict, &txn_id);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "Unable to acquire local lock, ret: %d", ret);
+ "Failed to generate transaction id");
goto out;
}
+ /* Save the MY_UUID as the originator_uuid. This originator_uuid
+ * will be used by is_origin_glusterd() to determine if a node
+ * is the originator node for a command. */
+ ret = glusterd_set_originator_uuid (dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set originator_uuid.");
+ goto out;
+ }
+
+ /* Based on the op_version, acquire a cluster or mgmt_v3 lock */
+ if (priv->op_version < GD_OP_VERSION_4) {
+ ret = glusterd_lock (MY_UUID);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to acquire lock on localhost, ret: %d",
+ ret);
+ snprintf (err_str, err_len,
+ "Another transaction is in progress. "
+ "Please try again after sometime.");
+ goto out;
+ }
+ } else {
+ /* If no volname is given as a part of the command, locks will
+ * not be held */
+ ret = dict_get_str (dict, "volname", &tmp);
+ if (ret) {
+ gf_log ("", GF_LOG_INFO,
+ "No Volume name present. "
+ "Locks not being held.");
+ goto local_locking_done;
+ } else {
+ /* Use a copy of volname, as cli response will be
+ * sent before the unlock, and the volname in the
+ * dict, might be removed */
+ volname = gf_strdup (tmp);
+ if (!volname)
+ goto out;
+ }
+
+ ret = glusterd_mgmt_v3_lock (volname, MY_UUID, "vol");
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to acquire lock for %s", volname);
+ snprintf (err_str, err_len,
+ "Another transaction is in progress for %s. "
+ "Please try again after sometime.", volname);
+ goto out;
+ }
+ }
+
locked = 1;
- gf_log (this->name, GF_LOG_INFO, "Acquired local lock");
+ gf_log (this->name, GF_LOG_DEBUG, "Acquired lock on localhost");
+
+local_locking_done:
+
+ /* If no volname is given as a part of the command, locks will
+ * not be held, hence sending stage event. */
+ if (volname || (priv->op_version < GD_OP_VERSION_4))
+ event_type = GD_OP_EVENT_START_LOCK;
+ else {
+ txn_op_info.state.state = GD_OP_STATE_LOCK_SENT;
+ event_type = GD_OP_EVENT_ALL_ACC;
+ }
+
+ /* Save opinfo for this transaction with the transaction id */
+ glusterd_txn_opinfo_init (&txn_op_info, NULL, &op, ctx, req);
+
+ ret = glusterd_set_txn_opinfo (txn_id, &txn_op_info);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set transaction's opinfo");
+ if (ctx)
+ dict_unref (ctx);
+ goto out;
+ }
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_START_LOCK, NULL);
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, ctx);
if (ret) {
gf_log (this->name, GF_LOG_ERROR, "Failed to acquire cluster"
" lock.");
goto out;
}
- glusterd_op_set_op (op);
- glusterd_op_set_ctx (ctx);
- glusterd_op_set_req (req);
-
-
out:
- if (locked && ret)
- glusterd_unlock (MY_UUID);
+ if (locked && ret) {
+ /* Based on the op-version, we release the
+ * cluster or mgmt_v3 lock */
+ if (priv->op_version < GD_OP_VERSION_4)
+ glusterd_unlock (MY_UUID);
+ else {
+ ret = glusterd_mgmt_v3_unlock (volname, MY_UUID,
+ "vol");
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to release lock for %s",
+ volname);
+ ret = -1;
+ }
+ }
+
+ if (volname)
+ GF_FREE (volname);
gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
int
-glusterd_handle_cluster_lock (rpcsvc_request_t *req)
+__glusterd_handle_cluster_lock (rpcsvc_request_t *req)
{
- gd1_mgmt_cluster_lock_req lock_req = {{0},};
- int32_t ret = -1;
- glusterd_op_lock_ctx_t *ctx = NULL;
- glusterd_peerinfo_t *peerinfo = NULL;
+ dict_t *op_ctx = NULL;
+ int32_t ret = -1;
+ gd1_mgmt_cluster_lock_req lock_req = {{0},};
+ glusterd_op_lock_ctx_t *ctx = NULL;
+ glusterd_op_t op = GD_OP_EVENT_LOCK;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_op_info_t txn_op_info = {{0},};
+ glusterd_conf_t *priv = NULL;
+ uuid_t *txn_id = NULL;
+ xlator_t *this = NULL;
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &lock_req, (xdrproc_t)xdr_gd1_mgmt_cluster_lock_req)) {
- //failed to decode msg;
+ txn_id = &priv->global_txn_id;
+
+ ret = xdr_to_generic (req->msg[0], &lock_req,
+ (xdrproc_t)xdr_gd1_mgmt_cluster_lock_req);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to decode lock "
+ "request received from peer");
req->rpc_err = GARBAGE_ARGS;
goto out;
}
- gf_log ("glusterd", GF_LOG_INFO,
- "Received LOCK from uuid: %s", uuid_utoa (lock_req.uuid));
+ gf_log (this->name, GF_LOG_DEBUG, "Received LOCK from uuid: %s",
+ uuid_utoa (lock_req.uuid));
if (glusterd_friend_find_by_uuid (lock_req.uuid, &peerinfo)) {
- gf_log (THIS->name, GF_LOG_WARNING, "%s doesn't "
+ gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
"belong to the cluster. Ignoring request.",
uuid_utoa (lock_req.uuid));
ret = -1;
@@ -511,11 +799,32 @@ glusterd_handle_cluster_lock (rpcsvc_request_t *req)
uuid_copy (ctx->uuid, lock_req.uuid);
ctx->req = req;
+ ctx->dict = NULL;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_LOCK, ctx);
+ op_ctx = dict_new ();
+ if (!op_ctx) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set new dict");
+ goto out;
+ }
+
+ glusterd_txn_opinfo_init (&txn_op_info, NULL, &op, op_ctx, req);
+
+ ret = glusterd_set_txn_opinfo (txn_id, &txn_op_info);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set transaction's opinfo");
+ dict_unref (txn_op_info.op_ctx);
+ goto out;
+ }
+
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_LOCK, txn_id, ctx);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to inject event GD_OP_EVENT_LOCK");
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
glusterd_friend_sm ();
glusterd_op_sm ();
@@ -524,6 +833,13 @@ out:
}
int
+glusterd_handle_cluster_lock (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cluster_lock);
+}
+
+int
glusterd_req_ctx_create (rpcsvc_request_t *rpc_req,
glusterd_op_t op, uuid_t uuid,
char *buf_val, size_t buf_len,
@@ -534,10 +850,13 @@ glusterd_req_ctx_create (rpcsvc_request_t *rpc_req,
char str[50] = {0,};
glusterd_req_ctx_t *req_ctx = NULL;
dict_t *dict = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
uuid_unparse (uuid, str);
- gf_log ("glusterd", GF_LOG_INFO,
- "Received op from uuid: %s", str);
+ gf_log (this->name, GF_LOG_DEBUG, "Received op from uuid %s", str);
dict = dict_new ();
if (!dict)
@@ -552,7 +871,7 @@ glusterd_req_ctx_create (rpcsvc_request_t *rpc_req,
req_ctx->op = op;
ret = dict_unserialize (buf_val, buf_len, &dict);
if (ret) {
- gf_log ("", GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_WARNING,
"failed to unserialize the dictionary");
goto out;
}
@@ -571,22 +890,37 @@ out:
}
int
-glusterd_handle_stage_op (rpcsvc_request_t *req)
+__glusterd_handle_stage_op (rpcsvc_request_t *req)
{
int32_t ret = -1;
glusterd_req_ctx_t *req_ctx = NULL;
gd1_mgmt_stage_op_req op_req = {{0},};
glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ glusterd_op_info_t txn_op_info = {{0},};
+ glusterd_op_sm_state_info_t state = {0,};
+ glusterd_conf_t *priv = NULL;
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &op_req, (xdrproc_t)xdr_gd1_mgmt_stage_op_req)) {
- //failed to decode msg;
+
+ txn_id = &priv->global_txn_id;
+
+ ret = xdr_to_generic (req->msg[0], &op_req,
+ (xdrproc_t)xdr_gd1_mgmt_stage_op_req);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to decode stage "
+ "request received from peer");
req->rpc_err = GARBAGE_ARGS;
goto out;
}
if (glusterd_friend_find_by_uuid (op_req.uuid, &peerinfo)) {
- gf_log (THIS->name, GF_LOG_WARNING, "%s doesn't "
+ gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
"belong to the cluster. Ignoring request.",
uuid_utoa (op_req.uuid));
ret = -1;
@@ -599,7 +933,36 @@ glusterd_handle_stage_op (rpcsvc_request_t *req)
if (ret)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_STAGE_OP, req_ctx);
+ ret = dict_get_bin (req_ctx->dict, "transaction_id", (void **)&txn_id);
+
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id));
+
+ /* In cases where there is no volname, the receivers won't have a
+ * transaction opinfo created, as for those operations, the locking
+ * phase where the transaction opinfos are created, won't be called. */
+ ret = glusterd_get_txn_opinfo (txn_id, &txn_op_info);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "No transaction's opinfo set");
+
+ state.state = GD_OP_STATE_LOCKED;
+ glusterd_txn_opinfo_init (&txn_op_info, &state,
+ &op_req.op, req_ctx->dict, req);
+
+ ret = glusterd_set_txn_opinfo (txn_id, &txn_op_info);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set transaction's opinfo");
+ dict_unref (req_ctx->dict);
+ goto out;
+ }
+ }
+
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_STAGE_OP,
+ txn_id, req_ctx);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to inject event GD_OP_EVENT_STAGE_OP");
out:
free (op_req.buf.buf_val);//malloced by xdr
@@ -609,23 +972,42 @@ glusterd_handle_stage_op (rpcsvc_request_t *req)
}
int
-glusterd_handle_commit_op (rpcsvc_request_t *req)
+glusterd_handle_stage_op (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_stage_op);
+}
+
+
+int
+__glusterd_handle_commit_op (rpcsvc_request_t *req)
{
int32_t ret = -1;
glusterd_req_ctx_t *req_ctx = NULL;
gd1_mgmt_commit_op_req op_req = {{0},};
glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ glusterd_conf_t *priv = NULL;
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &op_req, (xdrproc_t)xdr_gd1_mgmt_commit_op_req)) {
- //failed to decode msg;
+ txn_id = &priv->global_txn_id;
+
+ ret = xdr_to_generic (req->msg[0], &op_req,
+ (xdrproc_t)xdr_gd1_mgmt_commit_op_req);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to decode commit "
+ "request received from peer");
req->rpc_err = GARBAGE_ARGS;
goto out;
}
if (glusterd_friend_find_by_uuid (op_req.uuid, &peerinfo)) {
- gf_log (THIS->name, GF_LOG_WARNING, "%s doesn't "
+ gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
"belong to the cluster. Ignoring request.",
uuid_utoa (op_req.uuid));
ret = -1;
@@ -640,11 +1022,12 @@ glusterd_handle_commit_op (rpcsvc_request_t *req)
if (ret)
goto out;
- ret = glusterd_op_init_ctx (op_req.op);
- if (ret)
- goto out;
+ ret = dict_get_bin (req_ctx->dict, "transaction_id", (void **)&txn_id);
+
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id));
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_OP, req_ctx);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_OP,
+ txn_id, req_ctx);
out:
free (op_req.buf.buf_val);//malloced by xdr
@@ -654,56 +1037,109 @@ out:
}
int
-glusterd_handle_cli_probe (rpcsvc_request_t *req)
+glusterd_handle_commit_op (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_commit_op);
+}
+
+int
+__glusterd_handle_cli_probe (rpcsvc_request_t *req)
{
int32_t ret = -1;
- gf1_cli_probe_req cli_req = {0,};
- glusterd_peerinfo_t *peerinfo = NULL;
- gf_boolean_t run_fsm = _gf_true;
+ gf_cli_req cli_req = {{0,},};
+ glusterd_peerinfo_t *peerinfo = NULL;
+ gf_boolean_t run_fsm = _gf_true;
+ xlator_t *this = NULL;
+ char *bind_name = NULL;
+ dict_t *dict = NULL;
+ char *hostname = NULL;
+ int port = 0;
+
GF_ASSERT (req);
+ this = THIS;
- if (!xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf1_cli_probe_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
//failed to decode msg;
gf_log ("", GF_LOG_ERROR, "xdr decoding error");
req->rpc_err = GARBAGE_ARGS;
goto out;
}
- gf_cmd_log ("peer probe", " on host %s:%d", cli_req.hostname,
- cli_req.port);
+ if (cli_req.dict.dict_len) {
+ dict = dict_new ();
+
+ ret = dict_unserialize (cli_req.dict.dict_val,
+ cli_req.dict.dict_len, &dict);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "unserialize req-buffer to dictionary");
+ goto out;
+ }
+ }
+
+ ret = dict_get_str (dict, "hostname", &hostname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get hostname");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "port", &port);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get port");
+ goto out;
+ }
+
+ if (glusterd_is_any_volume_in_server_quorum (this) &&
+ !does_gd_meet_server_quorum (this)) {
+ glusterd_xfer_cli_probe_resp (req, -1, GF_PROBE_QUORUM_NOT_MET,
+ NULL, hostname, port, dict);
+ gf_log (this->name, GF_LOG_ERROR, "Quorum does not meet, "
+ "rejecting operation");
+ ret = 0;
+ goto out;
+ }
+
gf_log ("glusterd", GF_LOG_INFO, "Received CLI probe req %s %d",
- cli_req.hostname, cli_req.port);
+ hostname, port);
- if (!(ret = glusterd_is_local_addr(cli_req.hostname))) {
- glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_LOCALHOST, NULL,
- cli_req.hostname, cli_req.port);
+ if (dict_get_str(this->options,"transport.socket.bind-address",
+ &bind_name) == 0) {
+ gf_log ("glusterd", GF_LOG_DEBUG,
+ "only checking probe address vs. bind address");
+ ret = gf_is_same_address (bind_name, hostname);
+ }
+ else {
+ ret = gf_is_local_addr (hostname);
+ }
+ if (ret) {
+ glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_LOCALHOST,
+ NULL, hostname, port, dict);
+ ret = 0;
goto out;
}
- if (!(ret = glusterd_friend_find_by_hostname(cli_req.hostname,
- &peerinfo))) {
- if (strcmp (peerinfo->hostname, cli_req.hostname) == 0) {
+ if (!(ret = glusterd_friend_find_by_hostname (hostname, &peerinfo))) {
+ if (strcmp (peerinfo->hostname, hostname) == 0) {
- gf_log ("glusterd", GF_LOG_DEBUG, "Probe host %s port %d"
- " already a peer", cli_req.hostname, cli_req.port);
+ gf_log ("glusterd", GF_LOG_DEBUG, "Probe host %s port "
+ "%d already a peer", hostname, port);
glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_FRIEND,
- NULL, cli_req.hostname,
- cli_req.port);
+ NULL, hostname, port,
+ dict);
goto out;
}
}
- ret = glusterd_probe_begin (req, cli_req.hostname, cli_req.port);
-
- gf_cmd_log ("peer probe","on host %s:%d %s",cli_req.hostname, cli_req.port,
- (ret) ? "FAILED" : "SUCCESS");
+ ret = glusterd_probe_begin (req, hostname, port, dict);
if (ret == GLUSTERD_CONNECTION_AWAITED) {
//fsm should be run after connection establishes
run_fsm = _gf_false;
ret = 0;
}
+
out:
- free (cli_req.hostname);//its malloced by xdr
+ free (cli_req.dict.dict_val);
if (run_fsm) {
glusterd_friend_sm ();
@@ -714,14 +1150,26 @@ out:
}
int
-glusterd_handle_cli_deprobe (rpcsvc_request_t *req)
+glusterd_handle_cli_probe (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_cli_probe);
+}
+
+int
+__glusterd_handle_cli_deprobe (rpcsvc_request_t *req)
{
int32_t ret = -1;
- gf1_cli_deprobe_req cli_req = {0,};
- uuid_t uuid = {0};
- int op_errno = 0;
- xlator_t *this = NULL;
- glusterd_conf_t *priv = NULL;
+ gf_cli_req cli_req = {{0,},};
+ uuid_t uuid = {0};
+ int op_errno = 0;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ dict_t *dict = NULL;
+ char *hostname = NULL;
+ int port = 0;
+ int flags = 0;
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_volinfo_t *tmp = NULL;
this = THIS;
GF_ASSERT (this);
@@ -729,16 +1177,46 @@ glusterd_handle_cli_deprobe (rpcsvc_request_t *req)
GF_ASSERT (priv);
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf1_cli_deprobe_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req,
+ (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
}
+ if (cli_req.dict.dict_len) {
+ dict = dict_new ();
+
+ ret = dict_unserialize (cli_req.dict.dict_val,
+ cli_req.dict.dict_len, &dict);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "unserialize req-buffer to dictionary");
+ goto out;
+ }
+ }
+
gf_log ("glusterd", GF_LOG_INFO, "Received CLI deprobe req");
- ret = glusterd_hostname_to_uuid (cli_req.hostname, uuid);
+ ret = dict_get_str (dict, "hostname", &hostname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get hostname");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "port", &port);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get port");
+ goto out;
+ }
+ ret = dict_get_int32 (dict, "flags", &flags);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get flags");
+ goto out;
+ }
+
+ ret = glusterd_hostname_to_uuid (hostname, uuid);
if (ret) {
op_errno = GF_DEPROBE_NOT_FRIEND;
goto out;
@@ -750,40 +1228,55 @@ glusterd_handle_cli_deprobe (rpcsvc_request_t *req)
goto out;
}
- if (!uuid_is_null (uuid) && !(cli_req.flags & GF_CLI_FLAG_OP_FORCE)) {
- /* Check if peers are connected, except peer being detached*/
+ if (!(flags & GF_CLI_FLAG_OP_FORCE)) {
+ /* Check if peers are connected, except peer being
+ * detached*/
if (!glusterd_chk_peers_connected_befriended (uuid)) {
ret = -1;
op_errno = GF_DEPROBE_FRIEND_DOWN;
goto out;
}
- ret = glusterd_all_volume_cond_check (
- glusterd_friend_brick_belongs,
- -1, &uuid);
- if (ret) {
+ }
+
+ /* Check for if volumes exist with some bricks on the peer being
+ * detached. It's not a problem if a volume contains none or all
+ * of its bricks on the peer being detached
+ */
+ list_for_each_entry_safe (volinfo, tmp, &priv->volumes,
+ vol_list) {
+ ret = glusterd_friend_contains_vol_bricks (volinfo,
+ uuid);
+ if (ret == 1) {
op_errno = GF_DEPROBE_BRICK_EXIST;
goto out;
}
}
+ if (!(flags & GF_CLI_FLAG_OP_FORCE)) {
+ if (glusterd_is_any_volume_in_server_quorum (this) &&
+ !does_gd_meet_server_quorum (this)) {
+ gf_log (this->name, GF_LOG_ERROR, "Quorum does not "
+ "meet, rejecting operation");
+ ret = -1;
+ op_errno = GF_DEPROBE_QUORUM_NOT_MET;
+ goto out;
+ }
+ }
+
if (!uuid_is_null (uuid)) {
- ret = glusterd_deprobe_begin (req, cli_req.hostname,
- cli_req.port, uuid);
+ ret = glusterd_deprobe_begin (req, hostname, port, uuid, dict);
} else {
- ret = glusterd_deprobe_begin (req, cli_req.hostname,
- cli_req.port, NULL);
+ ret = glusterd_deprobe_begin (req, hostname, port, NULL, dict);
}
- gf_cmd_log ("peer deprobe", "on host %s:%d %s", cli_req.hostname,
- cli_req.port, (ret) ? "FAILED" : "SUCCESS");
out:
+ free (cli_req.dict.dict_val);
+
if (ret) {
ret = glusterd_xfer_cli_deprobe_resp (req, ret, op_errno, NULL,
- cli_req.hostname);
+ hostname, dict);
}
- free (cli_req.hostname);//malloced by xdr
-
glusterd_friend_sm ();
glusterd_op_sm ();
@@ -791,7 +1284,13 @@ out:
}
int
-glusterd_handle_cli_list_friends (rpcsvc_request_t *req)
+glusterd_handle_cli_deprobe (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_cli_deprobe);
+}
+
+int
+__glusterd_handle_cli_list_friends (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf1_cli_peer_list_req cli_req = {0,};
@@ -799,7 +1298,9 @@ glusterd_handle_cli_list_friends (rpcsvc_request_t *req)
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf1_cli_peer_list_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req,
+ (xdrproc_t)xdr_gf1_cli_peer_list_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
@@ -837,7 +1338,14 @@ out:
}
int
-glusterd_handle_cli_get_volume (rpcsvc_request_t *req)
+glusterd_handle_cli_list_friends (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cli_list_friends);
+}
+
+int
+__glusterd_handle_cli_get_volume (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
@@ -846,7 +1354,8 @@ glusterd_handle_cli_get_volume (rpcsvc_request_t *req)
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
@@ -890,7 +1399,216 @@ out:
}
int
-glusterd_handle_cli_list_volume (rpcsvc_request_t *req)
+glusterd_handle_cli_get_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cli_get_volume);
+}
+
+int
+__glusterd_handle_cli_uuid_reset (rpcsvc_request_t *req)
+{
+ int ret = -1;
+ dict_t *dict = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ uuid_t uuid = {0};
+ gf_cli_rsp rsp = {0,};
+ gf_cli_req cli_req = {{0,}};
+ char msg_str[2048] = {0,};
+
+ GF_ASSERT (req);
+
+ this = THIS;
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
+ //failed to decode msg;
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ gf_log ("glusterd", GF_LOG_DEBUG, "Received uuid reset req");
+
+ if (cli_req.dict.dict_len) {
+ /* Unserialize the dictionary */
+ dict = dict_new ();
+
+ ret = dict_unserialize (cli_req.dict.dict_val,
+ cli_req.dict.dict_len,
+ &dict);
+ if (ret < 0) {
+ gf_log ("glusterd", GF_LOG_ERROR,
+ "failed to "
+ "unserialize req-buffer to dictionary");
+ snprintf (msg_str, sizeof (msg_str), "Unable to decode "
+ "the buffer");
+ goto out;
+ } else {
+ dict->extra_stdfree = cli_req.dict.dict_val;
+ }
+ }
+
+ /* In the above section if dict_unserialize is successful, ret is set
+ * to zero.
+ */
+ ret = -1;
+ // Do not allow peer reset if there are any volumes in the cluster
+ if (!list_empty (&priv->volumes)) {
+ snprintf (msg_str, sizeof (msg_str), "volumes are already "
+ "present in the cluster. Resetting uuid is not "
+ "allowed");
+ gf_log (this->name, GF_LOG_WARNING, "%s", msg_str);
+ goto out;
+ }
+
+ // Do not allow peer reset if trusted storage pool is already formed
+ if (!list_empty (&priv->peers)) {
+ snprintf (msg_str, sizeof (msg_str),"trusted storage pool "
+ "has been already formed. Please detach this peer "
+ "from the pool and reset its uuid.");
+ gf_log (this->name, GF_LOG_WARNING, "%s", msg_str);
+ goto out;
+ }
+
+ uuid_copy (uuid, priv->uuid);
+ ret = glusterd_uuid_generate_save ();
+
+ if (!uuid_compare (uuid, MY_UUID)) {
+ snprintf (msg_str, sizeof (msg_str), "old uuid and the new uuid"
+ " are same. Try gluster peer reset again");
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg_str);
+ ret = -1;
+ goto out;
+ }
+
+out:
+ if (ret) {
+ rsp.op_ret = -1;
+ if (msg_str[0] == '\0')
+ snprintf (msg_str, sizeof (msg_str), "Operation "
+ "failed");
+ rsp.op_errstr = msg_str;
+ ret = 0;
+ } else {
+ rsp.op_errstr = "";
+ }
+
+ glusterd_to_cli (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gf_cli_rsp, dict);
+
+ return ret;
+}
+
+int
+glusterd_handle_cli_uuid_reset (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cli_uuid_reset);
+}
+
+int
+__glusterd_handle_cli_uuid_get (rpcsvc_request_t *req)
+{
+ int ret = -1;
+ dict_t *dict = NULL;
+ dict_t *rsp_dict = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ gf_cli_rsp rsp = {0,};
+ gf_cli_req cli_req = {{0,}};
+ char msg_str[2048] = {0,};
+ char uuid_str[64] = {0,};
+
+ GF_ASSERT (req);
+
+ this = THIS;
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ gf_log ("glusterd", GF_LOG_DEBUG, "Received uuid get req");
+
+ if (cli_req.dict.dict_len) {
+ dict = dict_new ();
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (cli_req.dict.dict_val,
+ cli_req.dict.dict_len,
+ &dict);
+ if (ret < 0) {
+ gf_log ("glusterd", GF_LOG_ERROR,
+ "failed to "
+ "unserialize req-buffer to dictionary");
+ snprintf (msg_str, sizeof (msg_str), "Unable to decode "
+ "the buffer");
+ goto out;
+
+ } else {
+ dict->extra_stdfree = cli_req.dict.dict_val;
+
+ }
+ }
+
+ rsp_dict = dict_new ();
+ if (!rsp_dict) {
+ ret = -1;
+ goto out;
+ }
+
+ uuid_utoa_r (MY_UUID, uuid_str);
+ ret = dict_set_str (rsp_dict, "uuid", uuid_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set uuid in "
+ "dictionary.");
+ goto out;
+ }
+
+ ret = dict_allocate_and_serialize (rsp_dict, &rsp.dict.dict_val,
+ &rsp.dict.dict_len);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to serialize "
+ "dictionary.");
+ goto out;
+ }
+ ret = 0;
+out:
+ if (ret) {
+ rsp.op_ret = -1;
+ if (msg_str[0] == '\0')
+ snprintf (msg_str, sizeof (msg_str), "Operation "
+ "failed");
+ rsp.op_errstr = msg_str;
+
+ } else {
+ rsp.op_errstr = "";
+
+ }
+
+ glusterd_to_cli (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gf_cli_rsp, dict);
+
+ return 0;
+}
+int
+glusterd_handle_cli_uuid_get (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cli_uuid_get);
+}
+
+int
+__glusterd_handle_cli_list_volume (rpcsvc_request_t *req)
{
int ret = -1;
dict_t *dict = NULL;
@@ -949,29 +1667,44 @@ out:
return ret;
}
+int
+glusterd_handle_cli_list_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cli_list_volume);
+}
+
int32_t
-glusterd_op_begin (rpcsvc_request_t *req, glusterd_op_t op, void *ctx)
+glusterd_op_begin (rpcsvc_request_t *req, glusterd_op_t op, void *ctx,
+ char *err_str, size_t err_len)
{
int ret = -1;
- ret = glusterd_op_txn_begin (req, op, ctx);
+ ret = glusterd_op_txn_begin (req, op, ctx, err_str, err_len);
return ret;
}
int
-glusterd_handle_reset_volume (rpcsvc_request_t *req)
+__glusterd_handle_reset_volume (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
dict_t *dict = NULL;
glusterd_op_t cli_op = GD_OP_RESET_VOLUME;
char *volname = NULL;
+ char err_str[2048] = {0,};
+ xlator_t *this = NULL;
GF_ASSERT (req);
+ this = THIS;
+ GF_ASSERT (this);
- if (!xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req)) {
- //failed to decode msg;
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
+ snprintf (err_str, sizeof (err_str), "Failed to decode request "
+ "received from cli");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
req->rpc_err = GARBAGE_ARGS;
goto out;
}
@@ -984,8 +1717,10 @@ glusterd_handle_reset_volume (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR, "failed to "
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
"unserialize req-buffer to dictionary");
+ snprintf (err_str, sizeof (err_str), "Unable to decode "
+ "the command");
goto out;
} else {
dict->extra_stdfree = cli_req.dict.dict_val;
@@ -994,28 +1729,37 @@ glusterd_handle_reset_volume (rpcsvc_request_t *req)
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "failed to get volname");
+ snprintf (err_str, sizeof (err_str), "Failed to get volume "
+ "name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
+ gf_log (this->name, GF_LOG_DEBUG, "Received volume reset request for "
+ "volume %s", volname);
- ret = glusterd_op_begin (req, GD_OP_RESET_VOLUME, dict);
+ ret = glusterd_op_begin_synctask (req, GD_OP_RESET_VOLUME, dict);
out:
- glusterd_friend_sm ();
- glusterd_op_sm ();
if (ret) {
+ if (err_str[0] == '\0')
+ snprintf (err_str, sizeof (err_str),
+ "Operation failed");
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
- dict, "operation failed");
- if (dict)
- dict_unref (dict);
+ dict, err_str);
}
return ret;
}
+int
+glusterd_handle_reset_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_reset_volume);
+}
int
-glusterd_handle_set_volume (rpcsvc_request_t *req)
+__glusterd_handle_set_volume (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
@@ -1026,11 +1770,19 @@ glusterd_handle_set_volume (rpcsvc_request_t *req)
char *volname = NULL;
char *op_errstr = NULL;
gf_boolean_t help = _gf_false;
+ char err_str[2048] = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req)) {
- //failed to decode msg;
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
+ snprintf (err_str, sizeof (err_str), "Failed to decode "
+ "request received from cli");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
req->rpc_err = GARBAGE_ARGS;
goto out;
}
@@ -1043,9 +1795,11 @@ glusterd_handle_set_volume (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to "
"unserialize req-buffer to dictionary");
+ snprintf (err_str, sizeof (err_str), "Unable to decode "
+ "the command");
goto out;
} else {
dict->extra_stdfree = cli_req.dict.dict_val;
@@ -1054,8 +1808,9 @@ glusterd_handle_set_volume (rpcsvc_request_t *req)
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_WARNING, "Unable to get volume name, while"
- "handling volume set command");
+ snprintf (err_str, sizeof (err_str), "Failed to get volume "
+ "name while handling volume set command");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
@@ -1068,30 +1823,35 @@ glusterd_handle_set_volume (rpcsvc_request_t *req)
ret = dict_get_str (dict, "key1", &key);
if (ret) {
- gf_log ("", GF_LOG_WARNING, "Unable to get key, while "
- "handling volume set for %s",volname);
+ snprintf (err_str, sizeof (err_str), "Failed to get key while"
+ " handling volume set for %s", volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
ret = dict_get_str (dict, "value1", &value);
if (ret) {
- gf_log ("", GF_LOG_WARNING, "Unable to get value, while"
- "handling volume set for %s",volname);
+ snprintf (err_str, sizeof (err_str), "Failed to get value while"
+ " handling volume set for %s", volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
+ gf_log (this->name, GF_LOG_DEBUG, "Received volume set request for "
+ "volume %s", volname);
- ret = glusterd_op_begin (req, GD_OP_SET_VOLUME, dict);
+ ret = glusterd_op_begin_synctask (req, GD_OP_SET_VOLUME, dict);
out:
- glusterd_friend_sm ();
- glusterd_op_sm ();
-
if (help)
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req, dict,
(op_errstr)? op_errstr:"");
- else if (ret)
+ else if (ret) {
+ if (err_str[0] == '\0')
+ snprintf (err_str, sizeof (err_str),
+ "Operation failed");
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
- dict, "operation failed");
+ dict, err_str);
+ }
if (op_errstr)
GF_FREE (op_errstr);
@@ -1099,21 +1859,30 @@ out:
}
int
-glusterd_handle_sync_volume (rpcsvc_request_t *req)
+glusterd_handle_set_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_set_volume);
+}
+
+int
+__glusterd_handle_sync_volume (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
dict_t *dict = NULL;
gf_cli_rsp cli_rsp = {0.};
char msg[2048] = {0,};
- glusterd_volinfo_t *volinfo = NULL;
char *volname = NULL;
gf1_cli_sync_volume flags = 0;
char *hostname = NULL;
+ xlator_t *this = NULL;
GF_ASSERT (req);
+ this = THIS;
+ GF_ASSERT (this);
- if (!xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
@@ -1127,9 +1896,11 @@ glusterd_handle_sync_volume (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to "
"unserialize req-buffer to dictionary");
+ snprintf (msg, sizeof (msg), "Unable to decode the "
+ "command");
goto out;
} else {
dict->extra_stdfree = cli_req.dict.dict_val;
@@ -1138,7 +1909,8 @@ glusterd_handle_sync_volume (rpcsvc_request_t *req)
ret = dict_get_str (dict, "hostname", &hostname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "failed to get hostname");
+ snprintf (msg, sizeof (msg), "Failed to get hostname");
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
@@ -1146,48 +1918,25 @@ glusterd_handle_sync_volume (rpcsvc_request_t *req)
if (ret) {
ret = dict_get_int32 (dict, "flags", (int32_t*)&flags);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Unable to get volume"
- "name, or flags");
+ snprintf (msg, sizeof (msg), "Failed to get volume name"
+ " or flags");
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
}
- gf_log ("glusterd", GF_LOG_INFO, "Received volume sync req "
- "for volume %s",
- (flags & GF_CLI_SYNC_ALL) ? "all" : volname);
+ gf_log (this->name, GF_LOG_INFO, "Received volume sync req "
+ "for volume %s", (flags & GF_CLI_SYNC_ALL) ? "all" : volname);
- if (!glusterd_is_local_addr (hostname)) {
+ if (gf_is_local_addr (hostname)) {
ret = -1;
snprintf (msg, sizeof (msg), "sync from localhost"
" not allowed");
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
- if (!flags) {
- ret = glusterd_volinfo_find (volname, &volinfo);
- if (!ret) {
- snprintf (msg, sizeof (msg), "please delete the "
- "volume: %s before sync", volname);
- ret = -1;
- goto out;
- }
-
- ret = dict_set_dynmstr (dict, "volname", volname);
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "volume name set failed");
- snprintf (msg, sizeof (msg), "volume name set failed");
- goto out;
- }
- } else {
- if (glusterd_volume_count_get ()) {
- snprintf (msg, sizeof (msg), "please delete all the "
- "volumes before full sync");
- ret = -1;
- goto out;
- }
- }
-
- ret = glusterd_op_begin (req, GD_OP_SYNC_VOLUME, dict);
+ ret = glusterd_op_begin_synctask (req, GD_OP_SYNC_VOLUME, dict);
out:
if (ret) {
@@ -1197,20 +1946,20 @@ out:
snprintf (msg, sizeof (msg), "Operation failed");
glusterd_to_cli (req, &cli_rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_cli_rsp, dict);
- if (dict)
- dict_unref (dict);
-
ret = 0; //sent error to cli, prevent second reply
}
- glusterd_friend_sm ();
- glusterd_op_sm ();
-
return ret;
}
int
+glusterd_handle_sync_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_sync_volume);
+}
+
+int
glusterd_fsm_log_send_resp (rpcsvc_request_t *req, int op_ret,
char *op_errstr, dict_t *dict)
{
@@ -1237,7 +1986,7 @@ glusterd_fsm_log_send_resp (rpcsvc_request_t *req, int op_ret,
}
int
-glusterd_handle_fsm_log (rpcsvc_request_t *req)
+__glusterd_handle_fsm_log (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf1_cli_fsm_log_req cli_req = {0,};
@@ -1250,7 +1999,9 @@ glusterd_handle_fsm_log (rpcsvc_request_t *req)
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf1_cli_fsm_log_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req,
+ (xdrproc_t)xdr_gf1_cli_fsm_log_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
snprintf (msg, sizeof (msg), "Garbage request");
@@ -1292,6 +2043,12 @@ out:
}
int
+glusterd_handle_fsm_log (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_fsm_log);
+}
+
+int
glusterd_op_lock_send_resp (rpcsvc_request_t *req, int32_t status)
{
@@ -1305,8 +2062,7 @@ glusterd_op_lock_send_resp (rpcsvc_request_t *req, int32_t status)
ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gd1_mgmt_cluster_lock_rsp);
- gf_log ("glusterd", GF_LOG_INFO,
- "Responded, ret: %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Responded to lock, ret: %d", ret);
return 0;
}
@@ -1325,35 +2081,95 @@ glusterd_op_unlock_send_resp (rpcsvc_request_t *req, int32_t status)
ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gd1_mgmt_cluster_unlock_rsp);
- gf_log ("glusterd", GF_LOG_INFO,
- "Responded to unlock, ret: %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Responded to unlock, ret: %d", ret);
return ret;
}
int
-glusterd_handle_cluster_unlock (rpcsvc_request_t *req)
+glusterd_op_mgmt_v3_lock_send_resp (rpcsvc_request_t *req, uuid_t *txn_id,
+ int32_t status)
+{
+
+ gd1_mgmt_v3_lock_rsp rsp = {{0},};
+ int ret = -1;
+
+ GF_ASSERT (req);
+ GF_ASSERT (txn_id);
+ glusterd_get_uuid (&rsp.uuid);
+ rsp.op_ret = status;
+ if (rsp.op_ret)
+ rsp.op_errno = errno;
+ uuid_copy (rsp.txn_id, *txn_id);
+
+ ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp);
+
+ gf_log (THIS->name, GF_LOG_DEBUG, "Responded to mgmt_v3 lock, ret: %d",
+ ret);
+
+ return ret;
+}
+
+int
+glusterd_op_mgmt_v3_unlock_send_resp (rpcsvc_request_t *req, uuid_t *txn_id,
+ int32_t status)
+{
+
+ gd1_mgmt_v3_unlock_rsp rsp = {{0},};
+ int ret = -1;
+
+ GF_ASSERT (req);
+ GF_ASSERT (txn_id);
+ rsp.op_ret = status;
+ if (rsp.op_ret)
+ rsp.op_errno = errno;
+ glusterd_get_uuid (&rsp.uuid);
+ uuid_copy (rsp.txn_id, *txn_id);
+
+ ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp);
+
+ gf_log (THIS->name, GF_LOG_DEBUG, "Responded to mgmt_v3 unlock, ret: %d",
+ ret);
+
+ return ret;
+}
+
+int
+__glusterd_handle_cluster_unlock (rpcsvc_request_t *req)
{
gd1_mgmt_cluster_unlock_req unlock_req = {{0}, };
int32_t ret = -1;
glusterd_op_lock_ctx_t *ctx = NULL;
glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ glusterd_conf_t *priv = NULL;
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &unlock_req,
- (xdrproc_t)xdr_gd1_mgmt_cluster_unlock_req)) {
- //failed to decode msg;
+ txn_id = &priv->global_txn_id;
+
+ ret = xdr_to_generic (req->msg[0], &unlock_req,
+ (xdrproc_t)xdr_gd1_mgmt_cluster_unlock_req);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to decode unlock "
+ "request received from peer");
req->rpc_err = GARBAGE_ARGS;
goto out;
}
- gf_log ("glusterd", GF_LOG_INFO,
+ gf_log (this->name, GF_LOG_DEBUG,
"Received UNLOCK from uuid: %s", uuid_utoa (unlock_req.uuid));
if (glusterd_friend_find_by_uuid (unlock_req.uuid, &peerinfo)) {
- gf_log (THIS->name, GF_LOG_WARNING, "%s doesn't "
+ gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
"belong to the cluster. Ignoring request.",
uuid_utoa (unlock_req.uuid));
ret = -1;
@@ -1368,8 +2184,9 @@ glusterd_handle_cluster_unlock (rpcsvc_request_t *req)
}
uuid_copy (ctx->uuid, unlock_req.uuid);
ctx->req = req;
+ ctx->dict = NULL;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_UNLOCK, ctx);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_UNLOCK, txn_id, ctx);
out:
glusterd_friend_sm ();
@@ -1379,14 +2196,25 @@ out:
}
int
+glusterd_handle_cluster_unlock (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cluster_unlock);
+}
+
+int
glusterd_op_stage_send_resp (rpcsvc_request_t *req,
int32_t op, int32_t status,
char *op_errstr, dict_t *rsp_dict)
{
gd1_mgmt_stage_op_rsp rsp = {{0},};
int ret = -1;
+ xlator_t *this = NULL;
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (req);
+
rsp.op_ret = status;
glusterd_get_uuid (&rsp.uuid);
rsp.op = op;
@@ -1398,7 +2226,7 @@ glusterd_op_stage_send_resp (rpcsvc_request_t *req,
ret = dict_allocate_and_serialize (rsp_dict, &rsp.dict.dict_val,
&rsp.dict.dict_len);
if (ret < 0) {
- gf_log ("", GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to get serialized length of dict");
return ret;
}
@@ -1406,8 +2234,7 @@ glusterd_op_stage_send_resp (rpcsvc_request_t *req,
ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gd1_mgmt_stage_op_rsp);
- gf_log ("glusterd", GF_LOG_INFO,
- "Responded to stage, ret: %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Responded to stage, ret: %d", ret);
GF_FREE (rsp.dict.dict_val);
return ret;
@@ -1420,7 +2247,10 @@ glusterd_op_commit_send_resp (rpcsvc_request_t *req,
{
gd1_mgmt_commit_op_rsp rsp = {{0}, };
int ret = -1;
+ xlator_t *this = NULL;
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (req);
rsp.op_ret = status;
glusterd_get_uuid (&rsp.uuid);
@@ -1435,7 +2265,7 @@ glusterd_op_commit_send_resp (rpcsvc_request_t *req,
ret = dict_allocate_and_serialize (rsp_dict, &rsp.dict.dict_val,
&rsp.dict.dict_len);
if (ret < 0) {
- gf_log ("", GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to get serialized length of dict");
goto out;
}
@@ -1445,8 +2275,7 @@ glusterd_op_commit_send_resp (rpcsvc_request_t *req,
ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gd1_mgmt_commit_op_rsp);
- gf_log ("glusterd", GF_LOG_INFO,
- "Responded to commit, ret: %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Responded to commit, ret: %d", ret);
out:
GF_FREE (rsp.dict.dict_val);
@@ -1454,14 +2283,16 @@ out:
}
int
-glusterd_handle_incoming_friend_req (rpcsvc_request_t *req)
+__glusterd_handle_incoming_friend_req (rpcsvc_request_t *req)
{
int32_t ret = -1;
gd1_mgmt_friend_req friend_req = {{0},};
gf_boolean_t run_fsm = _gf_true;
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &friend_req, (xdrproc_t)xdr_gd1_mgmt_friend_req)) {
+ ret = xdr_to_generic (req->msg[0], &friend_req,
+ (xdrproc_t)xdr_gd1_mgmt_friend_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
@@ -1491,14 +2322,23 @@ out:
}
int
-glusterd_handle_incoming_unfriend_req (rpcsvc_request_t *req)
+glusterd_handle_incoming_friend_req (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_incoming_friend_req);
+}
+
+int
+__glusterd_handle_incoming_unfriend_req (rpcsvc_request_t *req)
{
int32_t ret = -1;
gd1_mgmt_friend_req friend_req = {{0},};
char remote_hostname[UNIX_PATH_MAX + 1] = {0,};
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &friend_req, (xdrproc_t)xdr_gd1_mgmt_friend_req)) {
+ ret = xdr_to_generic (req->msg[0], &friend_req,
+ (xdrproc_t)xdr_gd1_mgmt_friend_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
@@ -1527,6 +2367,14 @@ out:
}
int
+glusterd_handle_incoming_unfriend_req (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_incoming_unfriend_req);
+
+}
+
+int
glusterd_handle_friend_update_delete (dict_t *dict)
{
char *hostname = NULL;
@@ -1572,7 +2420,7 @@ out:
}
int
-glusterd_handle_friend_update (rpcsvc_request_t *req)
+__glusterd_handle_friend_update (rpcsvc_request_t *req)
{
int32_t ret = -1;
gd1_mgmt_friend_update friend_req = {{0},};
@@ -1598,7 +2446,9 @@ glusterd_handle_friend_update (rpcsvc_request_t *req)
priv = this->private;
GF_ASSERT (priv);
- if (!xdr_to_generic (req->msg[0], &friend_req, (xdrproc_t)xdr_gd1_mgmt_friend_update)) {
+ ret = xdr_to_generic (req->msg[0], &friend_req,
+ (xdrproc_t)xdr_gd1_mgmt_friend_update);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
@@ -1707,7 +2557,14 @@ out:
}
int
-glusterd_handle_probe_query (rpcsvc_request_t *req)
+glusterd_handle_friend_update (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_friend_update);
+}
+
+int
+__glusterd_handle_probe_query (rpcsvc_request_t *req)
{
int32_t ret = -1;
xlator_t *this = NULL;
@@ -1721,7 +2578,9 @@ glusterd_handle_probe_query (rpcsvc_request_t *req)
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &probe_req, (xdrproc_t)xdr_gd1_mgmt_probe_req)) {
+ ret = xdr_to_generic (req->msg[0], &probe_req,
+ (xdrproc_t)xdr_gd1_mgmt_probe_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
@@ -1773,13 +2632,25 @@ glusterd_handle_probe_query (rpcsvc_request_t *req)
gf_log ("", GF_LOG_ERROR, "Failed to add peer %s",
remote_hostname);
rsp.op_errno = GF_PROBE_ADD_FAILED;
+ goto respond;
}
+ gf_log (THIS->name, GF_LOG_INFO,
+ "joining, should point etcd at %s", remote_hostname);
+ /*
+ * We should have started a standalone etcd before. Now we
+ * need a new one, with a new config.
+ */
+ stop_etcd(conf->etcd_pid);
+ nuke_etcd_dir();
+ conf->etcd_pid = start_etcd (uuid_utoa(MY_UUID),
+ remote_hostname);
}
respond:
uuid_copy (rsp.uuid, MY_UUID);
rsp.hostname = probe_req.hostname;
+ rsp.op_errstr = "";
glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gd1_mgmt_probe_rsp);
@@ -1798,8 +2669,13 @@ out:
return ret;
}
+int glusterd_handle_probe_query (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_probe_query);
+}
+
int
-glusterd_handle_cli_profile_volume (rpcsvc_request_t *req)
+__glusterd_handle_cli_profile_volume (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
@@ -1807,10 +2683,15 @@ glusterd_handle_cli_profile_volume (rpcsvc_request_t *req)
glusterd_op_t cli_op = GD_OP_PROFILE_VOLUME;
char *volname = NULL;
int32_t op = 0;
+ char err_str[2048] = {0,};
+ xlator_t *this = NULL;
GF_ASSERT (req);
+ this = THIS;
+ GF_ASSERT (this);
- if (!xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
@@ -1826,38 +2707,50 @@ glusterd_handle_cli_profile_volume (rpcsvc_request_t *req)
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "failed to get volname");
+ snprintf (err_str, sizeof (err_str), "Unable to get volume "
+ "name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
- gf_log (THIS->name, GF_LOG_INFO, "Received volume profile req "
+ gf_log (this->name, GF_LOG_INFO, "Received volume profile req "
"for volume %s", volname);
ret = dict_get_int32 (dict, "op", &op);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "failed to get op");
+ snprintf (err_str, sizeof (err_str), "Unable to get operation");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
- ret = glusterd_op_begin (req, cli_op, dict);
+ ret = glusterd_op_begin (req, cli_op, dict, err_str, sizeof (err_str));
out:
glusterd_friend_sm ();
glusterd_op_sm ();
free (cli_req.dict.dict_val);
+
if (ret) {
+ if (err_str[0] == '\0')
+ snprintf (err_str, sizeof (err_str),
+ "Operation failed");
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
- dict, "operation failed");
- if (dict)
- dict_unref (dict);
+ dict, err_str);
}
- gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
int
-glusterd_handle_getwd (rpcsvc_request_t *req)
+glusterd_handle_cli_profile_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cli_profile_volume);
+}
+
+int
+__glusterd_handle_getwd (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf1_cli_getwd_rsp rsp = {0,};
@@ -1882,18 +2775,27 @@ glusterd_handle_getwd (rpcsvc_request_t *req)
return ret;
}
+int
+glusterd_handle_getwd (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_getwd);
+}
int
-glusterd_handle_mount (rpcsvc_request_t *req)
+__glusterd_handle_mount (rpcsvc_request_t *req)
{
gf1_cli_mount_req mnt_req = {0,};
gf1_cli_mount_rsp rsp = {0,};
dict_t *dict = NULL;
int ret = 0;
+ glusterd_conf_t *priv = NULL;
GF_ASSERT (req);
+ priv = THIS->private;
- if (!xdr_to_generic (req->msg[0], &mnt_req, (xdrproc_t)xdr_gf1_cli_mount_req)) {
+ ret = xdr_to_generic (req->msg[0], &mnt_req,
+ (xdrproc_t)xdr_gf1_cli_mount_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
rsp.op_ret = -1;
@@ -1922,8 +2824,10 @@ glusterd_handle_mount (rpcsvc_request_t *req)
}
}
+ synclock_unlock (&priv->big_lock);
rsp.op_ret = glusterd_do_mount (mnt_req.label, dict,
&rsp.path, &rsp.op_errno);
+ synclock_lock (&priv->big_lock);
out:
if (!rsp.path)
@@ -1945,7 +2849,13 @@ glusterd_handle_mount (rpcsvc_request_t *req)
}
int
-glusterd_handle_umount (rpcsvc_request_t *req)
+glusterd_handle_mount (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_mount);
+}
+
+int
+__glusterd_handle_umount (rpcsvc_request_t *req)
{
gf1_cli_umount_req umnt_req = {0,};
gf1_cli_umount_rsp rsp = {0,};
@@ -1958,11 +2868,15 @@ glusterd_handle_umount (rpcsvc_request_t *req)
gf_boolean_t dir_ok = _gf_false;
char *pdir = NULL;
char *t = NULL;
+ glusterd_conf_t *priv = NULL;
GF_ASSERT (req);
GF_ASSERT (this);
+ priv = this->private;
- if (!xdr_to_generic (req->msg[0], &umnt_req, (xdrproc_t)xdr_gf1_cli_umount_req)) {
+ ret = xdr_to_generic (req->msg[0], &umnt_req,
+ (xdrproc_t)xdr_gf1_cli_umount_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
rsp.op_ret = -1;
@@ -2001,7 +2915,9 @@ glusterd_handle_umount (rpcsvc_request_t *req)
runner_add_args (&runner, "umount", umnt_req.path, NULL);
if (umnt_req.lazy)
runner_add_arg (&runner, "-l");
+ synclock_unlock (&priv->big_lock);
rsp.op_ret = runner_run (&runner);
+ synclock_lock (&priv->big_lock);
if (rsp.op_ret == 0) {
if (realpath (umnt_req.path, mntp))
rmdir (mntp);
@@ -2030,6 +2946,12 @@ glusterd_handle_umount (rpcsvc_request_t *req)
}
int
+glusterd_handle_umount (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_umount);
+}
+
+int
glusterd_friend_remove (uuid_t uuid, char *hostname)
{
int ret = 0;
@@ -2080,7 +3002,7 @@ out:
}
}
- gf_log ("", GF_LOG_DEBUG, "returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "returning %d", ret);
return ret;
}
@@ -2146,6 +3068,57 @@ out:
}
int
+glusterd_friend_rpc_create (xlator_t *this, glusterd_peerinfo_t *peerinfo,
+ glusterd_peerctx_args_t *args)
+{
+ dict_t *options = NULL;
+ int ret = -1;
+ glusterd_peerctx_t *peerctx = NULL;
+ data_t *data = NULL;
+
+ peerctx = GF_CALLOC (1, sizeof (*peerctx), gf_gld_mt_peerctx_t);
+ if (!peerctx)
+ goto out;
+
+ if (args)
+ peerctx->args = *args;
+
+ peerctx->peerinfo = peerinfo;
+
+ ret = glusterd_transport_inet_options_build (&options,
+ peerinfo->hostname,
+ peerinfo->port);
+ if (ret)
+ goto out;
+
+ /*
+ * For simulated multi-node testing, we need to make sure that we
+ * create our RPC endpoint with the same address that the peer would
+ * use to reach us.
+ */
+ if (this->options) {
+ data = dict_get(this->options,"transport.socket.bind-address");
+ if (data) {
+ ret = dict_set(options,
+ "transport.socket.source-addr",data);
+ }
+ }
+
+ ret = glusterd_rpc_create (&peerinfo->rpc, options,
+ glusterd_peer_rpc_notify, peerctx);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to create rpc for"
+ " peer %s", peerinfo->hostname);
+ goto out;
+ }
+ peerctx = NULL;
+ ret = 0;
+out:
+ GF_FREE (peerctx);
+ return ret;
+}
+
+int
glusterd_friend_add (const char *hoststr, int port,
glusterd_friend_sm_state_t state,
uuid_t *uuid,
@@ -2156,66 +3129,56 @@ glusterd_friend_add (const char *hoststr, int port,
int ret = 0;
xlator_t *this = NULL;
glusterd_conf_t *conf = NULL;
- glusterd_peerctx_t *peerctx = NULL;
- dict_t *options = NULL;
- gf_boolean_t handover = _gf_false;
this = THIS;
conf = this->private;
GF_ASSERT (conf);
GF_ASSERT (hoststr);
- peerctx = GF_CALLOC (1, sizeof (*peerctx), gf_gld_mt_peerctx_t);
- if (!peerctx) {
- ret = -1;
+ ret = glusterd_peerinfo_new (friend, state, uuid, hoststr, port);
+ if (ret) {
goto out;
}
- if (args)
- peerctx->args = *args;
-
- ret = glusterd_peerinfo_new (friend, state, uuid, hoststr);
- if (ret)
- goto out;
-
- peerctx->peerinfo = *friend;
-
- ret = glusterd_transport_inet_options_build (&options, hoststr, port);
- if (ret)
- goto out;
+ /*
+ * We can't add to the list after calling glusterd_friend_rpc_create,
+ * even if it succeeds, because by then the callback to take it back
+ * off and free might have happened already (notably in the case of an
+ * invalid peer name). That would mean we're adding something that had
+ * just been free, and we're likely to crash later.
+ */
+ list_add_tail (&(*friend)->uuid_list, &conf->peers);
+ //restore needs to first create the list of peers, then create rpcs
+ //to keep track of quorum in race-free manner. In restore for each peer
+ //rpc-create calls rpc_notify when the friend-list is partially
+ //constructed, leading to wrong quorum calculations.
if (!restore) {
ret = glusterd_store_peerinfo (*friend);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to store "
- "peerinfo");
-
- goto out;
+ if (ret == 0) {
+ synclock_unlock (&conf->big_lock);
+ ret = glusterd_friend_rpc_create (this, *friend, args);
+ synclock_lock (&conf->big_lock);
+ }
+ else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to store peerinfo");
}
}
- list_add_tail (&(*friend)->uuid_list, &conf->peers);
- ret = glusterd_rpc_create (&(*friend)->rpc, options,
- glusterd_peer_rpc_notify,
- peerctx);
+
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "failed to create rpc for"
- " peer %s", (char*)hoststr);
- goto out;
+ (void) glusterd_friend_cleanup (*friend);
+ *friend = NULL;
}
- handover = _gf_true;
out:
- if (ret && !handover) {
- (void) glusterd_friend_cleanup (*friend);
- *friend = NULL;
- }
-
gf_log (this->name, GF_LOG_INFO, "connect returned %d", ret);
return ret;
}
int
-glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port)
+glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port,
+ dict_t *dict)
{
int ret = -1;
glusterd_peerinfo_t *peerinfo = NULL;
@@ -2231,6 +3194,7 @@ glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port)
" for host: %s (%d)", hoststr, port);
args.mode = GD_MODE_ON;
args.req = req;
+ args.dict = dict;
ret = glusterd_friend_add ((char *)hoststr, port,
GD_FRIEND_STATE_DEFAULT,
NULL, &peerinfo, 0, &args);
@@ -2252,11 +3216,11 @@ glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port)
ret = glusterd_friend_sm_inject_event (event);
glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_SUCCESS,
NULL, (char*)hoststr,
- port);
+ port, dict);
}
} else {
glusterd_xfer_cli_probe_resp (req, 0, GF_PROBE_FRIEND, NULL,
- (char*)hoststr, port);
+ (char*)hoststr, port, dict);
}
out:
@@ -2266,7 +3230,7 @@ out:
int
glusterd_deprobe_begin (rpcsvc_request_t *req, const char *hoststr, int port,
- uuid_t uuid)
+ uuid_t uuid, dict_t *dict)
{
int ret = -1;
glusterd_peerinfo_t *peerinfo = NULL;
@@ -2307,6 +3271,7 @@ glusterd_deprobe_begin (rpcsvc_request_t *req, const char *hoststr, int port,
ctx->hostname = gf_strdup (hoststr);
ctx->port = port;
ctx->req = req;
+ ctx->dict = dict;
event->ctx = ctx;
@@ -2354,15 +3319,16 @@ glusterd_xfer_friend_remove_resp (rpcsvc_request_t *req, char *hostname, int por
int
-glusterd_xfer_friend_add_resp (rpcsvc_request_t *req, char *hostname, int port,
- int32_t op_ret, int32_t op_errno)
+glusterd_xfer_friend_add_resp (rpcsvc_request_t *req, char *myhostname,
+ char *remote_hostname, int port, int32_t op_ret,
+ int32_t op_errno)
{
gd1_mgmt_friend_rsp rsp = {{0}, };
int32_t ret = -1;
xlator_t *this = NULL;
glusterd_conf_t *conf = NULL;
- GF_ASSERT (hostname);
+ GF_ASSERT (myhostname);
this = THIS;
GF_ASSERT (this);
@@ -2372,61 +3338,229 @@ glusterd_xfer_friend_add_resp (rpcsvc_request_t *req, char *hostname, int port,
uuid_copy (rsp.uuid, MY_UUID);
rsp.op_ret = op_ret;
rsp.op_errno = op_errno;
- rsp.hostname = gf_strdup (hostname);
+ rsp.hostname = gf_strdup (myhostname);
rsp.port = port;
ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gd1_mgmt_friend_rsp);
gf_log ("glusterd", GF_LOG_INFO,
- "Responded to %s (%d), ret: %d", hostname, port, ret);
+ "Responded to %s (%d), ret: %d", remote_hostname, port, ret);
GF_FREE (rsp.hostname);
return ret;
}
+static void
+set_probe_error_str (int op_ret, int op_errno, char *op_errstr, char *errstr,
+ size_t len, char *hostname, int port)
+{
+ if ((op_errstr) && (strcmp (op_errstr, ""))) {
+ snprintf (errstr, len, "%s", op_errstr);
+ return;
+ }
+
+ if (!op_ret) {
+ switch (op_errno) {
+ case GF_PROBE_LOCALHOST:
+ snprintf (errstr, len, "Probe on localhost not "
+ "needed");
+ break;
+
+ case GF_PROBE_FRIEND:
+ snprintf (errstr, len, "Host %s port %d already"
+ " in peer list", hostname, port);
+ break;
+
+ default:
+ if (op_errno != 0)
+ snprintf (errstr, len, "Probe returned "
+ "with unknown errno %d",
+ op_errno);
+ break;
+ }
+ } else {
+ switch (op_errno) {
+ case GF_PROBE_ANOTHER_CLUSTER:
+ snprintf (errstr, len, "%s is already part of "
+ "another cluster", hostname);
+ break;
+
+ case GF_PROBE_VOLUME_CONFLICT:
+ snprintf (errstr, len, "Atleast one volume on "
+ "%s conflicts with existing volumes "
+ "in the cluster", hostname);
+ break;
+
+ case GF_PROBE_UNKNOWN_PEER:
+ snprintf (errstr, len, "%s responded with "
+ "'unknown peer' error, this could "
+ "happen if %s doesn't have localhost "
+ "in its peer database", hostname,
+ hostname);
+ break;
+
+ case GF_PROBE_ADD_FAILED:
+ snprintf (errstr, len, "Failed to add peer "
+ "information on %s", hostname);
+ break;
+
+ case GF_PROBE_SAME_UUID:
+ snprintf (errstr, len, "Peer uuid (host %s) is "
+ "same as local uuid", hostname);
+ break;
+
+ case GF_PROBE_QUORUM_NOT_MET:
+ snprintf (errstr, len, "Cluster quorum is not "
+ "met. Changing peers is not allowed "
+ "in this state");
+ break;
+
+ case GF_PROBE_MISSED_SNAP_CONFLICT:
+ snprintf (errstr, len, "Failed to update "
+ "list of missed snapshots from "
+ "peer %s", hostname);
+ break;
+
+ case GF_PROBE_SNAP_CONFLICT:
+ snprintf (errstr, len, "Conflict in comparing "
+ "list of snapshots from "
+ "peer %s", hostname);
+ break;
+
+ default:
+ snprintf (errstr, len, "Probe returned with "
+ "unknown errno %d", op_errno);
+ break;
+ }
+ }
+}
+
int
glusterd_xfer_cli_probe_resp (rpcsvc_request_t *req, int32_t op_ret,
int32_t op_errno, char *op_errstr, char *hostname,
- int port)
+ int port, dict_t *dict)
{
- gf1_cli_probe_rsp rsp = {0, };
+ gf_cli_rsp rsp = {0,};
int32_t ret = -1;
+ char errstr[2048] = {0,};
+ char *cmd_str = NULL;
+ xlator_t *this = THIS;
GF_ASSERT (req);
+ GF_ASSERT (this);
+
+ (void) set_probe_error_str (op_ret, op_errno, op_errstr, errstr,
+ sizeof (errstr), hostname, port);
+
+ if (dict) {
+ ret = dict_get_str (dict, "cmd-str", &cmd_str);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "command string");
+ }
rsp.op_ret = op_ret;
rsp.op_errno = op_errno;
- rsp.op_errstr = op_errstr ? op_errstr : "";
- rsp.hostname = hostname;
- rsp.port = port;
+ rsp.op_errstr = (errstr[0] != '\0') ? errstr : "";
+
+ gf_cmd_log ("", "%s : %s %s %s", cmd_str,
+ (op_ret) ? "FAILED" : "SUCCESS",
+ (errstr[0] != '\0') ? ":" : " ",
+ (errstr[0] != '\0') ? errstr : " ");
ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
- (xdrproc_t)xdr_gf1_cli_probe_rsp);
+ (xdrproc_t)xdr_gf_cli_rsp);
- gf_log ("glusterd", GF_LOG_INFO, "Responded to CLI, ret: %d",ret);
+ if (dict)
+ dict_unref (dict);
+ gf_log (this->name, GF_LOG_DEBUG, "Responded to CLI, ret: %d",ret);
return ret;
}
+static void
+set_deprobe_error_str (int op_ret, int op_errno, char *op_errstr, char *errstr,
+ size_t len, char *hostname)
+{
+ if ((op_errstr) && (strcmp (op_errstr, ""))) {
+ snprintf (errstr, len, "%s", op_errstr);
+ return;
+ }
+
+ if (op_ret) {
+ switch (op_errno) {
+ case GF_DEPROBE_LOCALHOST:
+ snprintf (errstr, len, "%s is localhost",
+ hostname);
+ break;
+
+ case GF_DEPROBE_NOT_FRIEND:
+ snprintf (errstr, len, "%s is not part of "
+ "cluster", hostname);
+ break;
+
+ case GF_DEPROBE_BRICK_EXIST:
+ snprintf (errstr, len, "Brick(s) with the peer "
+ "%s exist in cluster", hostname);
+ break;
+
+ case GF_DEPROBE_FRIEND_DOWN:
+ snprintf (errstr, len, "One of the peers is "
+ "probably down. Check with "
+ "'peer status'");
+ break;
+
+ case GF_DEPROBE_QUORUM_NOT_MET:
+ snprintf (errstr, len, "Cluster quorum is not "
+ "met. Changing peers is not allowed "
+ "in this state");
+ break;
+
+ default:
+ snprintf (errstr, len, "Detach returned with "
+ "unknown errno %d", op_errno);
+ break;
+
+ }
+ }
+}
+
+
int
glusterd_xfer_cli_deprobe_resp (rpcsvc_request_t *req, int32_t op_ret,
int32_t op_errno, char *op_errstr,
- char *hostname)
+ char *hostname, dict_t *dict)
{
- gf1_cli_deprobe_rsp rsp = {0, };
+ gf_cli_rsp rsp = {0,};
int32_t ret = -1;
+ char *cmd_str = NULL;
+ char errstr[2048] = {0,};
GF_ASSERT (req);
+ (void) set_deprobe_error_str (op_ret, op_errno, op_errstr, errstr,
+ sizeof (errstr), hostname);
+
+ if (dict) {
+ ret = dict_get_str (dict, "cmd-str", &cmd_str);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to get "
+ "command string");
+ }
+
rsp.op_ret = op_ret;
rsp.op_errno = op_errno;
- rsp.op_errstr = op_errstr ? op_errstr : "";
- rsp.hostname = hostname;
+ rsp.op_errstr = (errstr[0] != '\0') ? errstr : "";
+
+ gf_cmd_log ("", "%s : %s %s %s", cmd_str,
+ (op_ret) ? "FAILED" : "SUCCESS",
+ (errstr[0] != '\0') ? ":" : " ",
+ (errstr[0] != '\0') ? errstr : " ");
ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
- (xdrproc_t)xdr_gf1_cli_deprobe_rsp);
+ (xdrproc_t)xdr_gf_cli_rsp);
- gf_log ("glusterd", GF_LOG_INFO, "Responded to CLI, ret: %d",ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Responded to CLI, ret: %d",ret);
return ret;
}
@@ -2440,37 +3574,50 @@ glusterd_list_friends (rpcsvc_request_t *req, dict_t *dict, int32_t flags)
int32_t count = 0;
dict_t *friends = NULL;
gf1_cli_peer_list_rsp rsp = {0,};
+ char my_uuid_str[64] = {0,};
+ char key[256] = {0,};
priv = THIS->private;
GF_ASSERT (priv);
- if (!list_empty (&priv->peers)) {
- friends = dict_new ();
- if (!friends) {
- gf_log ("", GF_LOG_WARNING, "Out of Memory");
- goto out;
- }
- } else {
- ret = 0;
+ friends = dict_new ();
+ if (!friends) {
+ gf_log ("", GF_LOG_WARNING, "Out of Memory");
goto out;
}
-
- if (flags == GF_CLI_LIST_ALL) {
- list_for_each_entry (entry, &priv->peers, uuid_list) {
- count++;
- ret = glusterd_add_peer_detail_to_dict (entry,
+ if (!list_empty (&priv->peers)) {
+ list_for_each_entry (entry, &priv->peers, uuid_list) {
+ count++;
+ ret = glusterd_add_peer_detail_to_dict (entry,
friends, count);
- if (ret)
- goto out;
+ if (ret)
+ goto out;
+ }
+ }
- }
+ if (flags == GF_CLI_LIST_POOL_NODES) {
+ count++;
+ snprintf (key, 256, "friend%d.uuid", count);
+ uuid_utoa_r (MY_UUID, my_uuid_str);
+ ret = dict_set_str (friends, key, my_uuid_str);
+ if (ret)
+ goto out;
- ret = dict_set_int32 (friends, "count", count);
+ snprintf (key, 256, "friend%d.hostname", count);
+ ret = dict_set_str (friends, key, "localhost");
+ if (ret)
+ goto out;
- if (ret)
- goto out;
+ snprintf (key, 256, "friend%d.connected", count);
+ ret = dict_set_int32 (friends, key, 1);
+ if (ret)
+ goto out;
}
+ ret = dict_set_int32 (friends, "count", count);
+ if (ret)
+ goto out;
+
ret = dict_allocate_and_serialize (friends, &rsp.friends.friends_val,
&rsp.friends.friends_len);
@@ -2601,7 +3748,7 @@ out:
}
int
-glusterd_handle_status_volume (rpcsvc_request_t *req)
+__glusterd_handle_status_volume (rpcsvc_request_t *req)
{
int32_t ret = -1;
uint32_t cmd = 0;
@@ -2609,11 +3756,18 @@ glusterd_handle_status_volume (rpcsvc_request_t *req)
char *volname = 0;
gf_cli_req cli_req = {{0,}};
glusterd_op_t cli_op = GD_OP_STATUS_VOLUME;
+ char err_str[2048] = {0,};
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
GF_ASSERT (req);
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
@@ -2626,8 +3780,10 @@ glusterd_handle_status_volume (rpcsvc_request_t *req)
ret = dict_unserialize (cli_req.dict.dict_val,
cli_req.dict.dict_len, &dict);
if (ret < 0) {
- gf_log (THIS->name, GF_LOG_ERROR, "failed to "
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
"unserialize buffer");
+ snprintf (err_str, sizeof (err_str), "Unable to decode "
+ "the command");
goto out;
}
@@ -2640,27 +3796,34 @@ glusterd_handle_status_volume (rpcsvc_request_t *req)
if (!(cmd & GF_CLI_STATUS_ALL)) {
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
- "failed to get volname");
+ snprintf (err_str, sizeof (err_str), "Unable to get "
+ "volume name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
- gf_log (THIS->name, GF_LOG_INFO,
- "Received status volume req "
- "for volume %s", volname);
+ gf_log (this->name, GF_LOG_INFO,
+ "Received status volume req for volume %s", volname);
}
+ if ((cmd & GF_CLI_STATUS_QUOTAD) &&
+ (conf->op_version == GD_OP_VERSION_MIN)) {
+ snprintf (err_str, sizeof (err_str), "The cluster is operating "
+ "at version 1. Getting the status of quotad is not "
+ "allowed in this state.");
+ ret = -1;
+ goto out;
+ }
- ret = glusterd_op_begin (req, GD_OP_STATUS_VOLUME, dict);
+ ret = glusterd_op_begin_synctask (req, GD_OP_STATUS_VOLUME, dict);
out:
- glusterd_friend_sm ();
- glusterd_op_sm ();
if (ret) {
+ if (err_str[0] == '\0')
+ snprintf (err_str, sizeof (err_str),
+ "Operation failed");
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
- dict, "operation failed");
- if (dict)
- dict_unref (dict);
+ dict, err_str);
}
free (cli_req.dict.dict_val);
@@ -2668,19 +3831,30 @@ out:
}
int
-glusterd_handle_cli_clearlocks_volume (rpcsvc_request_t *req)
+glusterd_handle_status_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_status_volume);
+}
+
+int
+__glusterd_handle_cli_clearlocks_volume (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
glusterd_op_t cli_op = GD_OP_CLEARLOCKS_VOLUME;
char *volname = NULL;
dict_t *dict = NULL;
+ char err_str[2048] = {0,};
+ xlator_t *this = NULL;
GF_ASSERT (req);
+ this = THIS;
+ GF_ASSERT (this);
ret = -1;
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
req->rpc_err = GARBAGE_ARGS;
goto out;
}
@@ -2692,38 +3866,40 @@ glusterd_handle_cli_clearlocks_volume (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to unserialize req-buffer to"
" dictionary");
+ snprintf (err_str, sizeof (err_str), "unable to decode "
+ "the command");
goto out;
}
} else {
ret = -1;
- gf_log (THIS->name, GF_LOG_ERROR, "Empty cli request.");
+ gf_log (this->name, GF_LOG_ERROR, "Empty cli request.");
goto out;
}
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "failed to get volname");
+ snprintf (err_str, sizeof (err_str), "Unable to get volume "
+ "name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
- gf_log (THIS->name, GF_LOG_INFO, "Received clear-locks volume req "
+ gf_log (this->name, GF_LOG_INFO, "Received clear-locks volume req "
"for volume %s", volname);
- ret = glusterd_op_begin (req, cli_op, dict);
+ ret = glusterd_op_begin_synctask (req, GD_OP_CLEARLOCKS_VOLUME, dict);
out:
- glusterd_friend_sm ();
- glusterd_op_sm ();
-
if (ret) {
+ if (err_str[0] == '\0')
+ snprintf (err_str, sizeof (err_str),
+ "Operation failed");
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
- dict, "operation failed");
- if (dict)
- dict_unref (dict);
+ dict, err_str);
}
free (cli_req.dict.dict_val);
@@ -2731,17 +3907,116 @@ out:
}
int
-glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
- rpc_clnt_event_t event,
- void *data)
+glusterd_handle_cli_clearlocks_volume (rpcsvc_request_t *req)
{
- xlator_t *this = NULL;
- glusterd_conf_t *conf = NULL;
- int ret = 0;
- glusterd_brickinfo_t *brickinfo = NULL;
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cli_clearlocks_volume);
+}
+
+static int
+get_volinfo_from_brickid (char *brickid, glusterd_volinfo_t **volinfo)
+{
+ int ret = -1;
+ char *volid_str = NULL;
+ char *brick = NULL;
+ char *brickid_dup = NULL;
+ uuid_t volid = {0};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (brickid);
+
+ brickid_dup = gf_strdup (brickid);
+ if (!brickid_dup)
+ goto out;
+
+ volid_str = brickid_dup;
+ brick = strchr (brickid_dup, ':');
+ if (!brick) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Invalid brickid");
+ goto out;
+ }
+
+ *brick = '\0';
+ brick++;
+ uuid_parse (volid_str, volid);
+ ret = glusterd_volinfo_find_by_volume_id (volid, volinfo);
+ if (ret) {
+ /* Check if it is a snapshot volume */
+ ret = glusterd_snap_volinfo_find_by_volume_id (volid, volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Failed to find volinfo");
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ GF_FREE (brickid_dup);
+ return ret;
+}
+
+static int
+get_brickinfo_from_brickid (char *brickid, glusterd_brickinfo_t **brickinfo)
+{
+ glusterd_volinfo_t *volinfo = NULL;
+ char *volid_str = NULL;
+ char *brick = NULL;
+ char *brickid_dup = NULL;
+ uuid_t volid = {0};
+ int ret = -1;
+
+ brickid_dup = gf_strdup (brickid);
+ if (!brickid_dup)
+ goto out;
+
+ volid_str = brickid_dup;
+ brick = strchr (brickid_dup, ':');
+ if (!volid_str || !brick)
+ goto out;
+
+ *brick = '\0';
+ brick++;
+ uuid_parse (volid_str, volid);
+ ret = glusterd_volinfo_find_by_volume_id (volid, &volinfo);
+ if (ret) {
+ /* Check if it a snapshot volume */
+ ret = glusterd_snap_volinfo_find_by_volume_id (volid, &volinfo);
+ if (ret)
+ goto out;
+ }
+
+ ret = glusterd_volume_brickinfo_get_by_brick (brick, volinfo,
+ brickinfo);
+ if (ret)
+ goto out;
+
+ ret = 0;
+out:
+ GF_FREE (brickid_dup);
+ return ret;
+}
+
+int
+__glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
+ rpc_clnt_event_t event, void *data)
+{
+ char *brickid = NULL;
+ int ret = 0;
+ glusterd_conf_t *conf = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ xlator_t *this = NULL;
+
+ brickid = mydata;
+ if (!brickid)
+ return 0;
- brickinfo = mydata;
- if (!brickinfo)
+ ret = get_brickinfo_from_brickid (brickid, &brickinfo);
+ if (ret)
return 0;
this = THIS;
@@ -2751,30 +4026,77 @@ glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
switch (event) {
case RPC_CLNT_CONNECT:
- gf_log (this->name, GF_LOG_DEBUG, "got RPC_CLNT_CONNECT");
+ /* If a node on coming back up, already starts a brick
+ * before the handshake, and the notification comes after
+ * the handshake is done, then we need to check if this
+ * is a restored brick with a snapshot pending. If so, we
+ * need to stop the brick
+ */
+ if (brickinfo->snap_status == -1) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Snapshot is pending on %s:%s. "
+ "Hence not starting the brick",
+ brickinfo->hostname,
+ brickinfo->path);
+ ret = get_volinfo_from_brickid (brickid, &volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get volinfo from "
+ "brickid(%s)", brickid);
+ goto out;
+ }
+
+ ret = glusterd_brick_stop (volinfo, brickinfo,
+ _gf_false);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Unable to stop %s:%s",
+ brickinfo->hostname, brickinfo->path);
+ goto out;
+ }
+
+ break;
+ }
+ gf_log (this->name, GF_LOG_DEBUG, "Connected to %s:%s",
+ brickinfo->hostname, brickinfo->path);
glusterd_set_brick_status (brickinfo, GF_BRICK_STARTED);
ret = default_notify (this, GF_EVENT_CHILD_UP, NULL);
break;
case RPC_CLNT_DISCONNECT:
- gf_log (this->name, GF_LOG_DEBUG, "got RPC_CLNT_DISCONNECT");
+ if (GF_BRICK_STARTED == brickinfo->status)
+ gf_log (this->name, GF_LOG_INFO, "Disconnected from "
+ "%s:%s", brickinfo->hostname, brickinfo->path);
+
glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
break;
+ case RPC_CLNT_DESTROY:
+ GF_FREE (mydata);
+ mydata = NULL;
+ break;
default:
gf_log (this->name, GF_LOG_TRACE,
"got some other RPC event %d", event);
break;
}
+out:
return ret;
}
int
-glusterd_nodesvc_rpc_notify (struct rpc_clnt *rpc, void *mydata,
- rpc_clnt_event_t event,
- void *data)
+glusterd_brick_rpc_notify (struct rpc_clnt *rpc, void *mydata,
+ rpc_clnt_event_t event, void *data)
+{
+ return glusterd_big_locked_notify (rpc, mydata, event, data,
+ __glusterd_brick_rpc_notify);
+}
+
+int
+__glusterd_nodesvc_rpc_notify (struct rpc_clnt *rpc, void *mydata,
+ rpc_clnt_event_t event, void *data)
{
xlator_t *this = NULL;
glusterd_conf_t *conf = NULL;
@@ -2793,14 +4115,14 @@ glusterd_nodesvc_rpc_notify (struct rpc_clnt *rpc, void *mydata,
switch (event) {
case RPC_CLNT_CONNECT:
gf_log (this->name, GF_LOG_DEBUG, "got RPC_CLNT_CONNECT");
- (void) glusterd_nodesvc_set_running (server, _gf_true);
+ (void) glusterd_nodesvc_set_online_status (server, _gf_true);
ret = default_notify (this, GF_EVENT_CHILD_UP, NULL);
break;
case RPC_CLNT_DISCONNECT:
gf_log (this->name, GF_LOG_DEBUG, "got RPC_CLNT_DISCONNECT");
- (void) glusterd_nodesvc_set_running (server, _gf_false);
+ (void) glusterd_nodesvc_set_online_status (server, _gf_false);
break;
default:
@@ -2813,6 +4135,14 @@ glusterd_nodesvc_rpc_notify (struct rpc_clnt *rpc, void *mydata,
}
int
+glusterd_nodesvc_rpc_notify (struct rpc_clnt *rpc, void *mydata,
+ rpc_clnt_event_t event, void *data)
+{
+ return glusterd_big_locked_notify (rpc, mydata, event, data,
+ __glusterd_nodesvc_rpc_notify);
+}
+
+int
glusterd_friend_remove_notify (glusterd_peerctx_t *peerctx)
{
int ret = -1;
@@ -2820,11 +4150,13 @@ glusterd_friend_remove_notify (glusterd_peerctx_t *peerctx)
glusterd_peerinfo_t *peerinfo = peerctx->peerinfo;
rpcsvc_request_t *req = peerctx->args.req;
char *errstr = peerctx->errstr;
+ dict_t *dict = NULL;
GF_ASSERT (peerctx);
peerinfo = peerctx->peerinfo;
req = peerctx->args.req;
+ dict = peerctx->args.dict;
errstr = peerctx->errstr;
ret = glusterd_friend_sm_new_event (GD_FRIEND_EVENT_REMOVE_FRIEND,
@@ -2838,11 +4170,11 @@ glusterd_friend_remove_notify (glusterd_peerctx_t *peerctx)
}
glusterd_xfer_cli_probe_resp (req, -1, ENOTCONN, errstr,
- peerinfo->hostname, peerinfo->port);
+ peerinfo->hostname,
+ peerinfo->port, dict);
new_event->peerinfo = peerinfo;
ret = glusterd_friend_sm_inject_event (new_event);
- glusterd_friend_sm ();
} else {
gf_log ("glusterd", GF_LOG_ERROR,
@@ -2855,17 +4187,17 @@ out:
}
int
-glusterd_peer_rpc_notify (struct rpc_clnt *rpc, void *mydata,
- rpc_clnt_event_t event,
- void *data)
+__glusterd_peer_rpc_notify (struct rpc_clnt *rpc, void *mydata,
+ rpc_clnt_event_t event, void *data)
{
xlator_t *this = NULL;
glusterd_conf_t *conf = NULL;
int ret = 0;
glusterd_peerinfo_t *peerinfo = NULL;
glusterd_peerctx_t *peerctx = NULL;
- uuid_t owner = {0,};
- uuid_t *peer_uuid = NULL;
+ gf_boolean_t quorum_action = _gf_false;
+ glusterd_volinfo_t *volinfo = NULL;
+ uuid_t uuid;
peerctx = mydata;
if (!peerctx)
@@ -2880,8 +4212,9 @@ glusterd_peer_rpc_notify (struct rpc_clnt *rpc, void *mydata,
{
gf_log (this->name, GF_LOG_DEBUG, "got RPC_CLNT_CONNECT");
peerinfo->connected = 1;
+ peerinfo->quorum_action = _gf_true;
- ret = glusterd_peer_handshake (this, rpc, peerctx);
+ ret = glusterd_peer_dump_version (this, rpc, peerctx);
if (ret)
gf_log ("", GF_LOG_ERROR, "glusterd handshake failed");
break;
@@ -2892,47 +4225,47 @@ glusterd_peer_rpc_notify (struct rpc_clnt *rpc, void *mydata,
gf_log (this->name, GF_LOG_DEBUG, "got RPC_CLNT_DISCONNECT %d",
peerinfo->state.state);
- /*
- local glusterd (thinks that it) is the owner of the cluster
- lock and 'fails' the operation on the first disconnect from
- a peer.
- */
-
if (peerinfo->connected) {
- glusterd_get_lock_owner (&owner);
- if (!uuid_compare (MY_UUID, owner)) {
- ret = glusterd_op_sm_inject_event
- (GD_OP_EVENT_START_UNLOCK, NULL);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR,
- "Unable to enqueue cluster "
- "unlock event");
- break;
- }
-
- peer_uuid = GF_CALLOC (1, sizeof (*peer_uuid),
- gf_common_mt_char);
- if (!peer_uuid) {
- ret = -1;
- break;
+ if (conf->op_version < GD_OP_VERSION_4) {
+ glusterd_get_lock_owner (&uuid);
+ if (!uuid_is_null (uuid) &&
+ !uuid_compare (peerinfo->uuid, uuid))
+ glusterd_unlock (peerinfo->uuid);
+ } else {
+ list_for_each_entry (volinfo, &conf->volumes,
+ vol_list) {
+ ret = glusterd_mgmt_v3_unlock
+ (volinfo->volname,
+ peerinfo->uuid,
+ "vol");
+ if (ret)
+ gf_log (this->name,
+ GF_LOG_TRACE,
+ "Lock not released "
+ "for %s",
+ volinfo->volname);
+ }
}
- uuid_copy (*peer_uuid, peerinfo->uuid);
- ret = glusterd_op_sm_inject_event
- (GD_OP_EVENT_LOCAL_UNLOCK_NO_RESP, peer_uuid);
- if (ret)
- gf_log (this->name, GF_LOG_ERROR, "Unable"
- " to enque local lock flush event.");
+ ret = 0;
+ }
- //Inject friend disconnected here
- if (peerinfo->state.state == GD_FRIEND_STATE_DEFAULT) {
- glusterd_friend_remove_notify (peerctx);
- }
+ if ((peerinfo->quorum_contrib != QUORUM_DOWN) &&
+ (peerinfo->state.state == GD_FRIEND_STATE_BEFRIENDED)) {
+ peerinfo->quorum_contrib = QUORUM_DOWN;
+ quorum_action = _gf_true;
+ peerinfo->quorum_action = _gf_false;
+ }
+ /* Remove peer if it is not a friend and connection/handshake
+ * fails, and notify cli. Happens only during probe.
+ */
+ if (peerinfo->state.state == GD_FRIEND_STATE_DEFAULT) {
+ glusterd_friend_remove_notify (peerctx);
+ goto out;
}
peerinfo->connected = 0;
- //default_notify (this, GF_EVENT_CHILD_DOWN, NULL);
break;
}
default:
@@ -2942,12 +4275,23 @@ glusterd_peer_rpc_notify (struct rpc_clnt *rpc, void *mydata,
break;
}
+out:
glusterd_friend_sm ();
glusterd_op_sm ();
+ if (quorum_action)
+ glusterd_do_quorum_action ();
return ret;
}
int
+glusterd_peer_rpc_notify (struct rpc_clnt *rpc, void *mydata,
+ rpc_clnt_event_t event, void *data)
+{
+ return glusterd_big_locked_notify (rpc, mydata, event, data,
+ __glusterd_peer_rpc_notify);
+}
+
+int
glusterd_null (rpcsvc_request_t *req)
{
@@ -2955,11 +4299,11 @@ glusterd_null (rpcsvc_request_t *req)
}
rpcsvc_actor_t gd_svc_mgmt_actors[] = {
- [GLUSTERD_MGMT_NULL] = { "NULL", GLUSTERD_MGMT_NULL, glusterd_null, NULL, 0},
- [GLUSTERD_MGMT_CLUSTER_LOCK] = { "CLUSTER_LOCK", GLUSTERD_MGMT_CLUSTER_LOCK, glusterd_handle_cluster_lock, NULL, 0},
- [GLUSTERD_MGMT_CLUSTER_UNLOCK] = { "CLUSTER_UNLOCK", GLUSTERD_MGMT_CLUSTER_UNLOCK, glusterd_handle_cluster_unlock, NULL, 0},
- [GLUSTERD_MGMT_STAGE_OP] = { "STAGE_OP", GLUSTERD_MGMT_STAGE_OP, glusterd_handle_stage_op, NULL, 0},
- [GLUSTERD_MGMT_COMMIT_OP] = { "COMMIT_OP", GLUSTERD_MGMT_COMMIT_OP, glusterd_handle_commit_op, NULL, 0},
+ [GLUSTERD_MGMT_NULL] = { "NULL", GLUSTERD_MGMT_NULL, glusterd_null, NULL, 0, DRC_NA},
+ [GLUSTERD_MGMT_CLUSTER_LOCK] = { "CLUSTER_LOCK", GLUSTERD_MGMT_CLUSTER_LOCK, glusterd_handle_cluster_lock, NULL, 0, DRC_NA},
+ [GLUSTERD_MGMT_CLUSTER_UNLOCK] = { "CLUSTER_UNLOCK", GLUSTERD_MGMT_CLUSTER_UNLOCK, glusterd_handle_cluster_unlock, NULL, 0, DRC_NA},
+ [GLUSTERD_MGMT_STAGE_OP] = { "STAGE_OP", GLUSTERD_MGMT_STAGE_OP, glusterd_handle_stage_op, NULL, 0, DRC_NA},
+ [GLUSTERD_MGMT_COMMIT_OP] = { "COMMIT_OP", GLUSTERD_MGMT_COMMIT_OP, glusterd_handle_commit_op, NULL, 0, DRC_NA},
};
struct rpcsvc_program gd_svc_mgmt_prog = {
@@ -2972,11 +4316,11 @@ struct rpcsvc_program gd_svc_mgmt_prog = {
};
rpcsvc_actor_t gd_svc_peer_actors[] = {
- [GLUSTERD_FRIEND_NULL] = { "NULL", GLUSTERD_MGMT_NULL, glusterd_null, NULL, 0},
- [GLUSTERD_PROBE_QUERY] = { "PROBE_QUERY", GLUSTERD_PROBE_QUERY, glusterd_handle_probe_query, NULL, 0},
- [GLUSTERD_FRIEND_ADD] = { "FRIEND_ADD", GLUSTERD_FRIEND_ADD, glusterd_handle_incoming_friend_req, NULL, 0},
- [GLUSTERD_FRIEND_REMOVE] = { "FRIEND_REMOVE", GLUSTERD_FRIEND_REMOVE, glusterd_handle_incoming_unfriend_req, NULL, 0},
- [GLUSTERD_FRIEND_UPDATE] = { "FRIEND_UPDATE", GLUSTERD_FRIEND_UPDATE, glusterd_handle_friend_update, NULL, 0},
+ [GLUSTERD_FRIEND_NULL] = { "NULL", GLUSTERD_MGMT_NULL, glusterd_null, NULL, 0, DRC_NA},
+ [GLUSTERD_PROBE_QUERY] = { "PROBE_QUERY", GLUSTERD_PROBE_QUERY, glusterd_handle_probe_query, NULL, 0, DRC_NA},
+ [GLUSTERD_FRIEND_ADD] = { "FRIEND_ADD", GLUSTERD_FRIEND_ADD, glusterd_handle_incoming_friend_req, NULL, 0, DRC_NA},
+ [GLUSTERD_FRIEND_REMOVE] = { "FRIEND_REMOVE", GLUSTERD_FRIEND_REMOVE, glusterd_handle_incoming_unfriend_req, NULL, 0, DRC_NA},
+ [GLUSTERD_FRIEND_UPDATE] = { "FRIEND_UPDATE", GLUSTERD_FRIEND_UPDATE, glusterd_handle_friend_update, NULL, 0, DRC_NA},
};
struct rpcsvc_program gd_svc_peer_prog = {
@@ -2991,34 +4335,39 @@ struct rpcsvc_program gd_svc_peer_prog = {
rpcsvc_actor_t gd_svc_cli_actors[] = {
- [GLUSTER_CLI_PROBE] = { "CLI_PROBE", GLUSTER_CLI_PROBE, glusterd_handle_cli_probe, NULL, 0},
- [GLUSTER_CLI_CREATE_VOLUME] = { "CLI_CREATE_VOLUME", GLUSTER_CLI_CREATE_VOLUME, glusterd_handle_create_volume, NULL, 0},
- [GLUSTER_CLI_DEFRAG_VOLUME] = { "CLI_DEFRAG_VOLUME", GLUSTER_CLI_DEFRAG_VOLUME, glusterd_handle_defrag_volume, NULL, 0},
- [GLUSTER_CLI_DEPROBE] = { "FRIEND_REMOVE", GLUSTER_CLI_DEPROBE, glusterd_handle_cli_deprobe, NULL, 0},
- [GLUSTER_CLI_LIST_FRIENDS] = { "LIST_FRIENDS", GLUSTER_CLI_LIST_FRIENDS, glusterd_handle_cli_list_friends, NULL, 0},
- [GLUSTER_CLI_START_VOLUME] = { "START_VOLUME", GLUSTER_CLI_START_VOLUME, glusterd_handle_cli_start_volume, NULL, 0},
- [GLUSTER_CLI_STOP_VOLUME] = { "STOP_VOLUME", GLUSTER_CLI_STOP_VOLUME, glusterd_handle_cli_stop_volume, NULL, 0},
- [GLUSTER_CLI_DELETE_VOLUME] = { "DELETE_VOLUME", GLUSTER_CLI_DELETE_VOLUME, glusterd_handle_cli_delete_volume, NULL, 0},
- [GLUSTER_CLI_GET_VOLUME] = { "GET_VOLUME", GLUSTER_CLI_GET_VOLUME, glusterd_handle_cli_get_volume, NULL, 0},
- [GLUSTER_CLI_ADD_BRICK] = { "ADD_BRICK", GLUSTER_CLI_ADD_BRICK, glusterd_handle_add_brick, NULL, 0},
- [GLUSTER_CLI_REPLACE_BRICK] = { "REPLACE_BRICK", GLUSTER_CLI_REPLACE_BRICK, glusterd_handle_replace_brick, NULL, 0},
- [GLUSTER_CLI_REMOVE_BRICK] = { "REMOVE_BRICK", GLUSTER_CLI_REMOVE_BRICK, glusterd_handle_remove_brick, NULL, 0},
- [GLUSTER_CLI_LOG_ROTATE] = { "LOG FILENAME", GLUSTER_CLI_LOG_ROTATE, glusterd_handle_log_rotate, NULL, 0},
- [GLUSTER_CLI_SET_VOLUME] = { "SET_VOLUME", GLUSTER_CLI_SET_VOLUME, glusterd_handle_set_volume, NULL, 0},
- [GLUSTER_CLI_SYNC_VOLUME] = { "SYNC_VOLUME", GLUSTER_CLI_SYNC_VOLUME, glusterd_handle_sync_volume, NULL, 0},
- [GLUSTER_CLI_RESET_VOLUME] = { "RESET_VOLUME", GLUSTER_CLI_RESET_VOLUME, glusterd_handle_reset_volume, NULL, 0},
- [GLUSTER_CLI_FSM_LOG] = { "FSM_LOG", GLUSTER_CLI_FSM_LOG, glusterd_handle_fsm_log, NULL, 0},
- [GLUSTER_CLI_GSYNC_SET] = { "GSYNC_SET", GLUSTER_CLI_GSYNC_SET, glusterd_handle_gsync_set, NULL, 0},
- [GLUSTER_CLI_PROFILE_VOLUME] = { "STATS_VOLUME", GLUSTER_CLI_PROFILE_VOLUME, glusterd_handle_cli_profile_volume, NULL, 0},
- [GLUSTER_CLI_QUOTA] = { "QUOTA", GLUSTER_CLI_QUOTA, glusterd_handle_quota, NULL, 0},
- [GLUSTER_CLI_GETWD] = { "GETWD", GLUSTER_CLI_GETWD, glusterd_handle_getwd, NULL, 1},
- [GLUSTER_CLI_STATUS_VOLUME] = {"STATUS_VOLUME", GLUSTER_CLI_STATUS_VOLUME, glusterd_handle_status_volume, NULL, 0},
- [GLUSTER_CLI_MOUNT] = { "MOUNT", GLUSTER_CLI_MOUNT, glusterd_handle_mount, NULL, 1},
- [GLUSTER_CLI_UMOUNT] = { "UMOUNT", GLUSTER_CLI_UMOUNT, glusterd_handle_umount, NULL, 1},
- [GLUSTER_CLI_HEAL_VOLUME] = { "HEAL_VOLUME", GLUSTER_CLI_HEAL_VOLUME, glusterd_handle_cli_heal_volume, NULL, 0},
- [GLUSTER_CLI_STATEDUMP_VOLUME] = {"STATEDUMP_VOLUME", GLUSTER_CLI_STATEDUMP_VOLUME, glusterd_handle_cli_statedump_volume, NULL, 0},
- [GLUSTER_CLI_LIST_VOLUME] = {"LIST_VOLUME", GLUSTER_CLI_LIST_VOLUME, glusterd_handle_cli_list_volume, NULL, 0},
- [GLUSTER_CLI_CLRLOCKS_VOLUME] = {"CLEARLOCKS_VOLUME", GLUSTER_CLI_CLRLOCKS_VOLUME, glusterd_handle_cli_clearlocks_volume, NULL, 0},
+ [GLUSTER_CLI_PROBE] = { "CLI_PROBE", GLUSTER_CLI_PROBE, glusterd_handle_cli_probe, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_CREATE_VOLUME] = { "CLI_CREATE_VOLUME", GLUSTER_CLI_CREATE_VOLUME, glusterd_handle_create_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_DEFRAG_VOLUME] = { "CLI_DEFRAG_VOLUME", GLUSTER_CLI_DEFRAG_VOLUME, glusterd_handle_defrag_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_DEPROBE] = { "FRIEND_REMOVE", GLUSTER_CLI_DEPROBE, glusterd_handle_cli_deprobe, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_LIST_FRIENDS] = { "LIST_FRIENDS", GLUSTER_CLI_LIST_FRIENDS, glusterd_handle_cli_list_friends, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_UUID_RESET] = { "UUID_RESET", GLUSTER_CLI_UUID_RESET, glusterd_handle_cli_uuid_reset, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_UUID_GET] = { "UUID_GET", GLUSTER_CLI_UUID_GET, glusterd_handle_cli_uuid_get, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_START_VOLUME] = { "START_VOLUME", GLUSTER_CLI_START_VOLUME, glusterd_handle_cli_start_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_STOP_VOLUME] = { "STOP_VOLUME", GLUSTER_CLI_STOP_VOLUME, glusterd_handle_cli_stop_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_DELETE_VOLUME] = { "DELETE_VOLUME", GLUSTER_CLI_DELETE_VOLUME, glusterd_handle_cli_delete_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_GET_VOLUME] = { "GET_VOLUME", GLUSTER_CLI_GET_VOLUME, glusterd_handle_cli_get_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_ADD_BRICK] = { "ADD_BRICK", GLUSTER_CLI_ADD_BRICK, glusterd_handle_add_brick, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_REPLACE_BRICK] = { "REPLACE_BRICK", GLUSTER_CLI_REPLACE_BRICK, glusterd_handle_replace_brick, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_REMOVE_BRICK] = { "REMOVE_BRICK", GLUSTER_CLI_REMOVE_BRICK, glusterd_handle_remove_brick, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_LOG_ROTATE] = { "LOG FILENAME", GLUSTER_CLI_LOG_ROTATE, glusterd_handle_log_rotate, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_SET_VOLUME] = { "SET_VOLUME", GLUSTER_CLI_SET_VOLUME, glusterd_handle_set_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_SYNC_VOLUME] = { "SYNC_VOLUME", GLUSTER_CLI_SYNC_VOLUME, glusterd_handle_sync_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_RESET_VOLUME] = { "RESET_VOLUME", GLUSTER_CLI_RESET_VOLUME, glusterd_handle_reset_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_FSM_LOG] = { "FSM_LOG", GLUSTER_CLI_FSM_LOG, glusterd_handle_fsm_log, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_GSYNC_SET] = { "GSYNC_SET", GLUSTER_CLI_GSYNC_SET, glusterd_handle_gsync_set, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_PROFILE_VOLUME] = { "STATS_VOLUME", GLUSTER_CLI_PROFILE_VOLUME, glusterd_handle_cli_profile_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_QUOTA] = { "QUOTA", GLUSTER_CLI_QUOTA, glusterd_handle_quota, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_GETWD] = { "GETWD", GLUSTER_CLI_GETWD, glusterd_handle_getwd, NULL, 1, DRC_NA},
+ [GLUSTER_CLI_STATUS_VOLUME] = {"STATUS_VOLUME", GLUSTER_CLI_STATUS_VOLUME, glusterd_handle_status_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_MOUNT] = { "MOUNT", GLUSTER_CLI_MOUNT, glusterd_handle_mount, NULL, 1, DRC_NA},
+ [GLUSTER_CLI_UMOUNT] = { "UMOUNT", GLUSTER_CLI_UMOUNT, glusterd_handle_umount, NULL, 1, DRC_NA},
+ [GLUSTER_CLI_HEAL_VOLUME] = { "HEAL_VOLUME", GLUSTER_CLI_HEAL_VOLUME, glusterd_handle_cli_heal_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_STATEDUMP_VOLUME] = {"STATEDUMP_VOLUME", GLUSTER_CLI_STATEDUMP_VOLUME, glusterd_handle_cli_statedump_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_LIST_VOLUME] = {"LIST_VOLUME", GLUSTER_CLI_LIST_VOLUME, glusterd_handle_cli_list_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_CLRLOCKS_VOLUME] = {"CLEARLOCKS_VOLUME", GLUSTER_CLI_CLRLOCKS_VOLUME, glusterd_handle_cli_clearlocks_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_COPY_FILE] = {"COPY_FILE", GLUSTER_CLI_COPY_FILE, glusterd_handle_copy_file, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_SYS_EXEC] = {"SYS_EXEC", GLUSTER_CLI_SYS_EXEC, glusterd_handle_sys_exec, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_SNAP] = {"SNAP", GLUSTER_CLI_SNAP, glusterd_handle_snapshot, NULL, 0, DRC_NA},
};
struct rpcsvc_program gd_svc_cli_prog = {
@@ -3029,3 +4378,24 @@ struct rpcsvc_program gd_svc_cli_prog = {
.actors = gd_svc_cli_actors,
.synctask = _gf_true,
};
+
+/* This is a minimal RPC prog, which contains only the readonly RPC procs from
+ * the cli rpcsvc
+ */
+rpcsvc_actor_t gd_svc_cli_actors_ro[] = {
+ [GLUSTER_CLI_LIST_FRIENDS] = { "LIST_FRIENDS", GLUSTER_CLI_LIST_FRIENDS, glusterd_handle_cli_list_friends, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_UUID_GET] = { "UUID_GET", GLUSTER_CLI_UUID_GET, glusterd_handle_cli_uuid_get, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_GET_VOLUME] = { "GET_VOLUME", GLUSTER_CLI_GET_VOLUME, glusterd_handle_cli_get_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_GETWD] = { "GETWD", GLUSTER_CLI_GETWD, glusterd_handle_getwd, NULL, 1, DRC_NA},
+ [GLUSTER_CLI_STATUS_VOLUME] = {"STATUS_VOLUME", GLUSTER_CLI_STATUS_VOLUME, glusterd_handle_status_volume, NULL, 0, DRC_NA},
+ [GLUSTER_CLI_LIST_VOLUME] = {"LIST_VOLUME", GLUSTER_CLI_LIST_VOLUME, glusterd_handle_cli_list_volume, NULL, 0, DRC_NA},
+};
+
+struct rpcsvc_program gd_svc_cli_prog_ro = {
+ .progname = "GlusterD svc cli read-only",
+ .prognum = GLUSTER_CLI_PROGRAM,
+ .progver = GLUSTER_CLI_VERSION,
+ .numactors = GLUSTER_CLI_MAXVALUE,
+ .actors = gd_svc_cli_actors_ro,
+ .synctask = _gf_true,
+};
diff --git a/xlators/mgmt/glusterd/src/glusterd-handshake.c b/xlators/mgmt/glusterd/src/glusterd-handshake.c
index ab38de17c..5078526e9 100644
--- a/xlators/mgmt/glusterd/src/glusterd-handshake.c
+++ b/xlators/mgmt/glusterd/src/glusterd-handshake.c
@@ -21,6 +21,7 @@
#include "glusterd.h"
#include "glusterd-utils.h"
#include "glusterd-op-sm.h"
+#include "glusterd-store.h"
#include "glusterfs3.h"
#include "protocol-common.h"
@@ -29,26 +30,145 @@
extern struct rpc_clnt_program gd_peer_prog;
extern struct rpc_clnt_program gd_mgmt_prog;
+extern struct rpc_clnt_program gd_mgmt_v3_prog;
#define TRUSTED_PREFIX "trusted-"
typedef ssize_t (*gfs_serialize_t) (struct iovec outmsg, void *data);
+static int
+get_snap_volname_and_volinfo (const char *volpath, char **volname,
+ glusterd_volinfo_t **volinfo)
+{
+ int ret = -1;
+ char *save_ptr = NULL;
+ char *str_token = NULL;
+ char *snapname = NULL;
+ char *volname_token = NULL;
+ char *vol = NULL;
+ glusterd_snap_t *snap = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (volpath);
+ GF_ASSERT (volinfo);
+
+ str_token = gf_strdup (volpath);
+ if (NULL == str_token) {
+ goto out;
+ }
+
+ /* Input volname will have below formats:
+ * /snaps/<snapname>/<volname>.<hostname>
+ * or
+ * /snaps/<snapname>/<parent-volname>
+ * We need to extract snapname and parent_volname */
+
+ /*split string by "/" */
+ strtok_r (str_token, "/", &save_ptr);
+ snapname = strtok_r(NULL, "/", &save_ptr);
+ if (!snapname) {
+ gf_log(this->name, GF_LOG_ERROR, "Invalid path: %s", volpath);
+ goto out;
+ }
+
+ volname_token = strtok_r(NULL, "/", &save_ptr);
+ if (!volname_token) {
+ gf_log(this->name, GF_LOG_ERROR, "Invalid path: %s", volpath);
+ goto out;
+ }
+
+ snap = glusterd_find_snap_by_name (snapname);
+ if (!snap) {
+ gf_log(this->name, GF_LOG_ERROR, "Failed to "
+ "fetch snap %s", snapname);
+ goto out;
+ }
+
+ /* Find if its a parent volume name or snap volume
+ * name. This function will succeed if volname_token
+ * is a parent volname
+ */
+ ret = glusterd_volinfo_find (volname_token, volinfo);
+ if (ret) {
+ *volname = gf_strdup (volname_token);
+ if (NULL == *volname) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_snap_volinfo_find (volname_token, snap,
+ volinfo);
+ if (ret) {
+ /* Split the volume name */
+ vol = strtok_r (volname_token, ".", &save_ptr);
+ if (!vol) {
+ gf_log(this->name, GF_LOG_ERROR, "Invalid "
+ "volname (%s)", volname_token);
+ goto out;
+ }
+
+ ret = glusterd_snap_volinfo_find (vol, snap, volinfo);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR, "Failed to "
+ "fetch snap volume from volname (%s)",
+ vol);
+ goto out;
+ }
+ }
+ } else {
+ /*volname_token is parent volname*/
+ ret = glusterd_snap_volinfo_find_from_parent_volname (
+ volname_token, snap, volinfo);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR, "Failed to "
+ "fetch snap volume from parent "
+ "volname (%s)", volname_token);
+ goto out;
+ }
+
+ /* Since volname_token is a parent volname we should
+ * get the snap volname here*/
+ *volname = gf_strdup ((*volinfo)->volname);
+ if (NULL == *volname) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+out:
+ if (ret && NULL != *volname) {
+ GF_FREE (*volname);
+ *volname = NULL;
+ }
+ return ret;
+}
+
static size_t
build_volfile_path (const char *volname, char *path,
size_t path_len, char *trusted_str)
{
- struct stat stbuf = {0,};
- int32_t ret = -1;
- glusterd_conf_t *priv = NULL;
- char *vol = NULL;
- char *dup_volname = NULL;
- char *free_ptr = NULL;
- char *tmp = NULL;
- glusterd_volinfo_t *volinfo = NULL;
- char *server = NULL;
-
- priv = THIS->private;
+ struct stat stbuf = {0,};
+ int32_t ret = -1;
+ glusterd_conf_t *priv = NULL;
+ char *vol = NULL;
+ char *dup_volname = NULL;
+ char *free_ptr = NULL;
+ char *save_ptr = NULL;
+ char *str_token = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ char *server = NULL;
+ const char *volname_ptr = NULL;
+ char path_prefix [PATH_MAX] = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (volname);
+ GF_ASSERT (path);
if (strstr (volname, "gluster/")) {
server = strchr (volname, '/') + 1;
@@ -56,6 +176,22 @@ build_volfile_path (const char *volname, char *path,
path, path_len);
ret = 1;
goto out;
+ } else if ((str_token = strstr (volname, "/snaps/"))) {
+ ret = get_snap_volname_and_volinfo (str_token, &dup_volname,
+ &volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get snap"
+ " volinfo from path (%s)", volname);
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (path_prefix, sizeof (path_prefix), "%s/snaps/%s",
+ priv->workdir, volinfo->snapshot->snapname);
+
+ free_ptr = dup_volname;
+ volname_ptr = dup_volname;
+ goto gotvolinfo;
} else if (volname[0] != '/') {
/* Normal behavior */
dup_volname = gf_strdup (volname);
@@ -66,39 +202,53 @@ build_volfile_path (const char *volname, char *path,
dup_volname = gf_strdup (&volname[1]);
}
+ if (!dup_volname) {
+ gf_log(THIS->name, GF_LOG_ERROR, "strdup failed");
+ ret = -1;
+ goto out;
+ }
free_ptr = dup_volname;
+ volname_ptr = volname;
+
+ snprintf (path_prefix, sizeof (path_prefix), "%s/vols",
+ priv->workdir);
ret = glusterd_volinfo_find (dup_volname, &volinfo);
+
if (ret) {
/* Split the volume name */
- vol = strtok_r (dup_volname, ".", &tmp);
+ vol = strtok_r (dup_volname, ".", &save_ptr);
if (!vol)
goto out;
+
ret = glusterd_volinfo_find (vol, &volinfo);
if (ret)
goto out;
}
+gotvolinfo:
if (!glusterd_auth_get_username (volinfo))
trusted_str = NULL;
- ret = snprintf (path, path_len, "%s/vols/%s/%s.vol",
- priv->workdir, volinfo->volname, volname);
+ ret = snprintf (path, path_len, "%s/%s/%s.vol", path_prefix,
+ volinfo->volname, volname_ptr);
if (ret == -1)
goto out;
ret = stat (path, &stbuf);
if ((ret == -1) && (errno == ENOENT)) {
- snprintf (path, path_len, "%s/vols/%s/%s%s-fuse.vol",
- priv->workdir, volinfo->volname,
- (trusted_str ? trusted_str : ""), dup_volname);
+ snprintf (path, path_len, "%s/%s/%s%s-fuse.vol",
+ path_prefix, volinfo->volname,
+ (trusted_str ? trusted_str : ""),
+ dup_volname);
+
ret = stat (path, &stbuf);
}
if ((ret == -1) && (errno == ENOENT)) {
- snprintf (path, path_len, "%s/vols/%s/%s-tcp.vol",
- priv->workdir, volinfo->volname, volname);
+ snprintf (path, path_len, "%s/%s/%s-tcp.vol",
+ path_prefix, volinfo->volname, volname_ptr);
}
ret = 1;
@@ -107,8 +257,97 @@ out:
return ret;
}
+/* Get and store op-versions of the clients sending the getspec request
+ * Clients of versions <= 3.3, don't send op-versions, their op-versions are
+ * defaulted to 1
+ */
+static int
+_get_client_op_versions (gf_getspec_req *args, peer_info_t *peerinfo)
+{
+ int ret = 0;
+ int client_max_op_version = 1;
+ int client_min_op_version = 1;
+ dict_t *dict = NULL;
+
+ GF_ASSERT (args);
+ GF_ASSERT (peerinfo);
+
+ if (args->xdata.xdata_len) {
+ dict = dict_new ();
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (args->xdata.xdata_val,
+ args->xdata.xdata_len, &dict);
+ if (ret) {
+ gf_log ("glusterd", GF_LOG_ERROR,
+ "Failed to unserialize request dictionary");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "min-op-version",
+ &client_min_op_version);
+ if (ret) {
+ gf_log ("glusterd", GF_LOG_ERROR,
+ "Failed to get client-min-op-version");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "max-op-version",
+ &client_max_op_version);
+ if (ret) {
+ gf_log ("glusterd", GF_LOG_ERROR,
+ "Failed to get client-max-op-version");
+ goto out;
+ }
+ }
+
+ peerinfo->max_op_version = client_max_op_version;
+ peerinfo->min_op_version = client_min_op_version;
+
+out:
+ return ret;
+}
+
+/* Checks if the client supports the volume, ie. client can understand all the
+ * options in the volfile
+ */
+static gf_boolean_t
+_client_supports_volume (peer_info_t *peerinfo, int32_t *op_errno)
+{
+ gf_boolean_t ret = _gf_true;
+ glusterd_volinfo_t *volinfo = NULL;
+
+ GF_ASSERT (peerinfo);
+ GF_ASSERT (op_errno);
+
+
+ /* Only check when the volfile being requested is a volume. Not finding
+ * a volinfo implies that the volfile requested for is not of a gluster
+ * volume. A non volume volfile is requested by the local gluster
+ * services like shd and nfs-server. These need not be checked as they
+ * will be running at the same op-version as glusterd and will be able
+ * to support all the features
+ */
+ if ((glusterd_volinfo_find (peerinfo->volname, &volinfo) == 0) &&
+ ((peerinfo->min_op_version > volinfo->client_op_version) ||
+ (peerinfo->max_op_version < volinfo->client_op_version))) {
+ ret = _gf_false;
+ *op_errno = ENOTSUP;
+ gf_log ("glusterd", GF_LOG_INFO,
+ "Client %s (%d -> %d) doesn't support required "
+ "op-version (%d). Rejecting volfile request.",
+ peerinfo->identifier, peerinfo->min_op_version,
+ peerinfo->max_op_version, volinfo->client_op_version);
+ }
+
+ return ret;
+}
+
int
-server_getspec (rpcsvc_request_t *req)
+__server_getspec (rpcsvc_request_t *req)
{
int32_t ret = -1;
int32_t op_errno = 0;
@@ -123,28 +362,53 @@ server_getspec (rpcsvc_request_t *req)
gf_getspec_req args = {0,};
gf_getspec_rsp rsp = {0,};
char addrstr[RPCSVC_PEER_STRLEN] = {0};
+ peer_info_t *peerinfo = NULL;
-
- if (!xdr_to_generic (req->msg[0], &args,
- (xdrproc_t)xdr_gf_getspec_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gf_getspec_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto fail;
}
+ peerinfo = &req->trans->peerinfo;
+
volume = args.key;
+ /* Need to strip leading '/' from volnames. This was introduced to
+ * support nfs style mount parameters for native gluster mount
+ */
+ if (volume[0] == '/')
+ strncpy (peerinfo->volname, &volume[1], strlen(&volume[1]));
+ else
+ strncpy (peerinfo->volname, volume, strlen(volume));
+
+ ret = _get_client_op_versions (&args, peerinfo);
+ if (ret)
+ goto fail;
+
+ if (!_client_supports_volume (peerinfo, &op_errno)) {
+ ret = -1;
+ goto fail;
+ }
trans = req->trans;
+ /* addrstr will be empty for cli socket connections */
ret = rpcsvc_transport_peername (trans, (char *)&addrstr,
sizeof (addrstr));
if (ret)
goto fail;
- tmp = strrchr (addrstr, ':');
- *tmp = '\0';
+ tmp = strrchr (addrstr, ':');
+ if (tmp)
+ *tmp = '\0';
- /* we trust the local admin */
- if (!glusterd_is_local_addr (addrstr)) {
+ /* The trusted volfiles are given to the glusterd owned process like NFS
+ * server, self-heal daemon etc., so that they are not inadvertently
+ * blocked by a auth.{allow,reject} setting. The trusted volfile is not
+ * meant for external users.
+ */
+ if (strlen (addrstr) && gf_is_local_addr (addrstr)) {
ret = build_volfile_path (volume, filename,
sizeof (filename),
@@ -208,8 +472,14 @@ fail:
return 0;
}
+int
+server_getspec (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __server_getspec);
+}
+
int32_t
-server_event_notify (rpcsvc_request_t *req)
+__server_event_notify (rpcsvc_request_t *req)
{
int32_t ret = -1;
int32_t op_errno = 0;
@@ -218,8 +488,9 @@ server_event_notify (rpcsvc_request_t *req)
dict_t *dict = NULL;
gf_boolean_t need_rsp = _gf_true;
- if (!xdr_to_generic (req->msg[0], &args,
- (xdrproc_t)xdr_gf_event_notify_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gf_event_notify_req);
+ if (ret < 0) {
req->rpc_err = GARBAGE_ARGS;
goto fail;
}
@@ -267,34 +538,284 @@ fail:
return 0;
}
+
+int32_t
+server_event_notify (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __server_event_notify);
+}
+
+int
+gd_validate_cluster_op_version (xlator_t *this, int cluster_op_version,
+ char *peerid)
+{
+ int ret = -1;
+ glusterd_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (cluster_op_version > GD_OP_VERSION_MAX) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "operating version %d is more than the maximum "
+ "supported (%d) on the machine (as per peer request "
+ "from %s)", cluster_op_version, GD_OP_VERSION_MAX,
+ peerid);
+ goto out;
+ }
+
+ /* The peer can only reduce its op-version when it doesn't have any
+ * volumes. Reducing op-version when it already contains volumes can
+ * lead to inconsistencies in the cluster
+ */
+ if ((cluster_op_version < conf->op_version) &&
+ !list_empty (&conf->volumes)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "cannot reduce operating version to %d from current "
+ "version %d as volumes exist (as per peer request from "
+ "%s)", cluster_op_version, conf->op_version, peerid);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+__glusterd_mgmt_hndsk_versions (rpcsvc_request_t *req)
+{
+ dict_t *dict = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
+ gf_mgmt_hndsk_req args = {{0,},};
+ gf_mgmt_hndsk_rsp rsp = {0,};
+
+ this = THIS;
+ conf = this->private;
+
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gf_mgmt_hndsk_req);
+ if (ret < 0) {
+ //failed to decode msg;
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ ret = dict_set_int32 (dict, GD_OP_VERSION_KEY, conf->op_version);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to set operating version");
+ rsp.op_ret = ret;
+ goto out;
+ }
+
+ ret = dict_set_int32 (dict, GD_MIN_OP_VERSION_KEY, GD_OP_VERSION_MIN);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to set %s", GD_MIN_OP_VERSION_KEY);
+ rsp.op_ret = ret;
+ goto out;
+ }
+
+ ret = dict_set_int32 (dict, GD_MAX_OP_VERSION_KEY, GD_OP_VERSION_MAX);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to set %s", GD_MAX_OP_VERSION_KEY);
+ rsp.op_ret = ret;
+ goto out;
+ }
+
+ ret = 0;
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, dict, (&rsp.hndsk.hndsk_val),
+ rsp.hndsk.hndsk_len, op_errno, out);
+out:
+
+ rsp.op_ret = ret;
+ rsp.op_errno = op_errno;
+
+ glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gf_mgmt_hndsk_rsp);
+
+ ret = 0;
+
+ if (dict)
+ dict_unref (dict);
+
+ if (args.hndsk.hndsk_val)
+ free (args.hndsk.hndsk_val);
+
+ if (rsp.hndsk.hndsk_val)
+ GF_FREE (rsp.hndsk.hndsk_val);
+
+ return ret;
+}
+
+int
+glusterd_mgmt_hndsk_versions (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_mgmt_hndsk_versions);
+}
+
+int
+__glusterd_mgmt_hndsk_versions_ack (rpcsvc_request_t *req)
+{
+ dict_t *clnt_dict = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
+ int peer_op_version = 0;
+ gf_mgmt_hndsk_req args = {{0,},};
+ gf_mgmt_hndsk_rsp rsp = {0,};
+
+ this = THIS;
+ conf = this->private;
+
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gf_mgmt_hndsk_req);
+ if (ret < 0) {
+ //failed to decode msg;
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, clnt_dict, args.hndsk.hndsk_val,
+ (args.hndsk.hndsk_len), ret, op_errno,
+ out);
+
+ ret = dict_get_int32 (clnt_dict, GD_OP_VERSION_KEY, &peer_op_version);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to get the op-version key peer=%s",
+ req->trans->peerinfo.identifier);
+ goto out;
+ }
+
+ ret = gd_validate_cluster_op_version (this, peer_op_version,
+ req->trans->peerinfo.identifier);
+ if (ret)
+ goto out;
+
+
+ /* As this is ACK from the Cluster for the versions supported,
+ can set the op-version of 'this' glusterd to the one
+ received. */
+ gf_log (this->name, GF_LOG_INFO, "using the op-version %d",
+ peer_op_version);
+ conf->op_version = peer_op_version;
+ ret = glusterd_store_global_info (this);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Failed to store op-version");
+
+out:
+ rsp.op_ret = ret;
+ rsp.op_errno = op_errno;
+
+ glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gf_mgmt_hndsk_rsp);
+
+ ret = 0;
+
+ if (clnt_dict)
+ dict_unref (clnt_dict);
+
+ if (args.hndsk.hndsk_val)
+ free (args.hndsk.hndsk_val);
+
+ return ret;
+}
+
+int
+glusterd_mgmt_hndsk_versions_ack (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_mgmt_hndsk_versions_ack);
+}
+
rpcsvc_actor_t gluster_handshake_actors[] = {
- [GF_HNDSK_NULL] = {"NULL", GF_HNDSK_NULL, NULL, NULL, 0},
- [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, server_getspec, NULL, 0},
- [GF_HNDSK_EVENT_NOTIFY] = {"EVENTNOTIFY", GF_HNDSK_EVENT_NOTIFY, server_event_notify,
- NULL, 0},
+ [GF_HNDSK_NULL] = {"NULL", GF_HNDSK_NULL, NULL, NULL, 0, DRC_NA},
+ [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, server_getspec, NULL, 0, DRC_NA},
+ [GF_HNDSK_EVENT_NOTIFY] = {"EVENTNOTIFY", GF_HNDSK_EVENT_NOTIFY, server_event_notify, NULL, 0, DRC_NA},
};
struct rpcsvc_program gluster_handshake_prog = {
- .progname = "GlusterFS Handshake",
+ .progname = "Gluster Handshake",
.prognum = GLUSTER_HNDSK_PROGRAM,
.progver = GLUSTER_HNDSK_VERSION,
.actors = gluster_handshake_actors,
.numactors = GF_HNDSK_MAXVALUE,
};
+/* A minimal RPC program just for the cli getspec command */
+rpcsvc_actor_t gluster_cli_getspec_actors[] = {
+ [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, server_getspec, NULL, 0, DRC_NA},
+};
+
+struct rpcsvc_program gluster_cli_getspec_prog = {
+ .progname = "Gluster Handshake (CLI Getspec)",
+ .prognum = GLUSTER_HNDSK_PROGRAM,
+ .progver = GLUSTER_HNDSK_VERSION,
+ .actors = gluster_cli_getspec_actors,
+ .numactors = GF_HNDSK_MAXVALUE,
+};
+
+
char *glusterd_dump_proc[GF_DUMP_MAXVALUE] = {
[GF_DUMP_NULL] = "NULL",
[GF_DUMP_DUMP] = "DUMP",
};
rpc_clnt_prog_t glusterd_dump_prog = {
- .progname = "GLUSTERD-DUMP",
- .prognum = GLUSTER_DUMP_PROGRAM,
- .progver = GLUSTER_DUMP_VERSION,
- .procnames = glusterd_dump_proc,
+ .progname = "GLUSTERD-DUMP",
+ .prognum = GLUSTER_DUMP_PROGRAM,
+ .progver = GLUSTER_DUMP_VERSION,
+ .procnames = glusterd_dump_proc,
+};
+
+
+rpcsvc_actor_t glusterd_mgmt_hndsk_actors[] = {
+ [GD_MGMT_HNDSK_NULL] = {"NULL", GD_MGMT_HNDSK_NULL, NULL,
+ NULL, 0},
+ [GD_MGMT_HNDSK_VERSIONS] = {"MGMT-VERS", GD_MGMT_HNDSK_VERSIONS,
+ glusterd_mgmt_hndsk_versions, NULL,
+ 0},
+ [GD_MGMT_HNDSK_VERSIONS_ACK] = {"MGMT-VERS-ACK",
+ GD_MGMT_HNDSK_VERSIONS_ACK,
+ glusterd_mgmt_hndsk_versions_ack,
+ NULL, 0},
+};
+
+struct rpcsvc_program glusterd_mgmt_hndsk_prog = {
+ .progname = "Gluster MGMT Handshake",
+ .prognum = GD_MGMT_HNDSK_PROGRAM,
+ .progver = GD_MGMT_HNDSK_VERSION,
+ .actors = glusterd_mgmt_hndsk_actors,
+ .numactors = GD_MGMT_HNDSK_MAXVALUE,
+};
+
+char *glusterd_mgmt_hndsk_proc[GD_MGMT_HNDSK_MAXVALUE] = {
+ [GD_MGMT_HNDSK_NULL] = "NULL",
+ [GD_MGMT_HNDSK_VERSIONS] = "MGMT-VERS",
+ [GD_MGMT_HNDSK_VERSIONS_ACK] = "MGMT-VERS-ACK",
+};
+
+rpc_clnt_prog_t gd_clnt_mgmt_hndsk_prog = {
+ .progname = "Gluster MGMT Handshake",
+ .prognum = GD_MGMT_HNDSK_PROGRAM,
+ .progver = GD_MGMT_HNDSK_VERSION,
+ .procnames = glusterd_mgmt_hndsk_proc,
};
+
static int
glusterd_event_connected_inject (glusterd_peerctx_t *peerctx)
{
@@ -326,6 +847,7 @@ glusterd_event_connected_inject (glusterd_peerctx_t *peerctx)
ctx->hostname = gf_strdup (peerinfo->hostname);
ctx->port = peerinfo->port;
ctx->req = peerctx->args.req;
+ ctx->dict = peerctx->args.dict;
event->peerinfo = peerinfo;
event->ctx = ctx;
@@ -343,12 +865,286 @@ out:
return ret;
}
+
+int
+gd_validate_peer_op_version (xlator_t *this, glusterd_peerinfo_t *peerinfo,
+ dict_t *dict, char **errstr)
+{
+ int ret = -1;
+ glusterd_conf_t *conf = NULL;
+ int32_t peer_op_version = 0;
+ int32_t peer_min_op_version = 0;
+ int32_t peer_max_op_version = 0;
+
+ if (!dict && !this && !peerinfo)
+ goto out;
+
+ conf = this->private;
+
+ ret = dict_get_int32 (dict, GD_OP_VERSION_KEY, &peer_op_version);
+ if (ret)
+ goto out;
+
+ ret = dict_get_int32 (dict, GD_MAX_OP_VERSION_KEY,
+ &peer_max_op_version);
+ if (ret)
+ goto out;
+
+ ret = dict_get_int32 (dict, GD_MIN_OP_VERSION_KEY,
+ &peer_min_op_version);
+ if (ret)
+ goto out;
+
+ ret = -1;
+ /* Check if peer can support our op_version */
+ if ((peer_max_op_version < conf->op_version) ||
+ (peer_min_op_version > conf->op_version)) {
+ ret = gf_asprintf (errstr, "Peer %s does not support required "
+ "op-version", peerinfo->hostname);
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ gf_log (this->name , GF_LOG_DEBUG, "Peer %s %s", peerinfo->hostname,
+ ((ret < 0) ? "rejected" : "accepted"));
+ return ret;
+}
+
+int
+__glusterd_mgmt_hndsk_version_ack_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ int op_errno = EINVAL;
+ gf_mgmt_hndsk_rsp rsp = {0,};
+ xlator_t *this = NULL;
+ call_frame_t *frame = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_peerctx_t *peerctx = NULL;
+ char msg[1024] = {0,};
+
+ this = THIS;
+ frame = myframe;
+ peerctx = frame->local;
+ peerinfo = peerctx->peerinfo;
+
+ if (-1 == req->rpc_status) {
+ snprintf (msg, sizeof (msg),
+ "Error through RPC layer, retry again later");
+ gf_log ("", GF_LOG_ERROR, "%s", msg);
+ peerctx->errstr = gf_strdup (msg);
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_mgmt_hndsk_rsp);
+ if (ret < 0) {
+ snprintf (msg, sizeof (msg), "Failed to decode XDR");
+ gf_log ("", GF_LOG_ERROR, "%s", msg);
+ peerctx->errstr = gf_strdup (msg);
+ goto out;
+ }
+
+ op_errno = rsp.op_errno;
+ if (-1 == rsp.op_ret) {
+ ret = -1;
+ snprintf (msg, sizeof (msg),
+ "Failed to get handshake ack from remote server");
+ gf_log (frame->this->name, GF_LOG_ERROR, "%s", msg);
+ peerctx->errstr = gf_strdup (msg);
+ goto out;
+ }
+
+ /* TODO: this is hardcoded as of now, but I don't forsee any problems
+ * with this as long as we are properly handshaking operating versions
+ */
+ peerinfo->mgmt = &gd_mgmt_prog;
+ peerinfo->peer = &gd_peer_prog;
+ peerinfo->mgmt_v3 = &gd_mgmt_v3_prog;
+
+ ret = default_notify (this, GF_EVENT_CHILD_UP, NULL);
+
+ if (GD_MODE_ON == peerctx->args.mode) {
+ ret = glusterd_event_connected_inject (peerctx);
+ peerctx->args.req = NULL;
+ } else if (GD_MODE_SWITCH_ON == peerctx->args.mode) {
+ peerctx->args.mode = GD_MODE_ON;
+ } else {
+ gf_log (this->name, GF_LOG_WARNING, "unknown mode %d",
+ peerctx->args.mode);
+ }
+
+ glusterd_friend_sm ();
+
+ ret = 0;
+out:
+
+ frame->local = NULL;
+ STACK_DESTROY (frame->root);
+
+ if (ret != 0)
+ rpc_transport_disconnect (peerinfo->rpc->conn.trans);
+
+ if (rsp.hndsk.hndsk_val)
+ free (rsp.hndsk.hndsk_val);
+
+ return 0;
+}
+
+int
+glusterd_mgmt_hndsk_version_ack_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ __glusterd_mgmt_hndsk_version_ack_cbk);
+}
+
+int
+__glusterd_mgmt_hndsk_version_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ int op_errno = EINVAL;
+ gf_mgmt_hndsk_rsp rsp = {0,};
+ gf_mgmt_hndsk_req arg = {{0,}};
+ xlator_t *this = NULL;
+ call_frame_t *frame = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_peerctx_t *peerctx = NULL;
+ dict_t *dict = NULL;
+ dict_t *rsp_dict = NULL;
+ glusterd_conf_t *conf = NULL;
+ char msg[1024] = {0,};
+
+ this = THIS;
+ conf = this->private;
+ frame = myframe;
+ peerctx = frame->local;
+ peerinfo = peerctx->peerinfo;
+
+ if (-1 == req->rpc_status) {
+ ret = -1;
+ snprintf (msg, sizeof (msg),
+ "Error through RPC layer, retry again later");
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+ peerctx->errstr = gf_strdup (msg);
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_mgmt_hndsk_rsp);
+ if (ret < 0) {
+ snprintf (msg, sizeof (msg), "Failed to decode management "
+ "handshake response");
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+ peerctx->errstr = gf_strdup (msg);
+ goto out;
+ }
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, dict, rsp.hndsk.hndsk_val,
+ rsp.hndsk.hndsk_len, ret, op_errno,
+ out);
+
+ op_errno = rsp.op_errno;
+ if (-1 == rsp.op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to get the 'versions' from peer (%s)",
+ req->conn->trans->peerinfo.identifier);
+ goto out;
+ }
+
+ /* Check if peer can be part of cluster */
+ ret = gd_validate_peer_op_version (this, peerinfo, dict,
+ &peerctx->errstr);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to validate the operating version of peer (%s)",
+ peerinfo->hostname);
+ goto out;
+ }
+
+ rsp_dict = dict_new ();
+ if (!rsp_dict)
+ goto out;
+
+ ret = dict_set_int32 (rsp_dict, GD_OP_VERSION_KEY, conf->op_version);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set operating version in dict");
+ goto out;
+ }
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, rsp_dict, (&arg.hndsk.hndsk_val),
+ arg.hndsk.hndsk_len, op_errno, out);
+
+ ret = glusterd_submit_request (peerctx->peerinfo->rpc, &arg, frame,
+ &gd_clnt_mgmt_hndsk_prog,
+ GD_MGMT_HNDSK_VERSIONS_ACK, NULL, this,
+ glusterd_mgmt_hndsk_version_ack_cbk,
+ (xdrproc_t)xdr_gf_mgmt_hndsk_req);
+
+out:
+ if (ret) {
+ frame->local = NULL;
+ STACK_DESTROY (frame->root);
+ rpc_transport_disconnect (peerinfo->rpc->conn.trans);
+ }
+
+ if (rsp.hndsk.hndsk_val)
+ free (rsp.hndsk.hndsk_val);
+
+ if (arg.hndsk.hndsk_val)
+ GF_FREE (arg.hndsk.hndsk_val);
+
+ if (dict)
+ dict_unref (dict);
+
+ if (rsp_dict)
+ dict_unref (rsp_dict);
+
+ return 0;
+}
+
+int
+glusterd_mgmt_hndsk_version_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ __glusterd_mgmt_hndsk_version_cbk);
+}
+
+int
+glusterd_mgmt_handshake (xlator_t *this, glusterd_peerctx_t *peerctx)
+{
+ call_frame_t *frame = NULL;
+ gf_mgmt_hndsk_req req = {{0,},};
+ int ret = -1;
+
+ frame = create_frame (this, this->ctx->pool);
+ if (!frame)
+ goto out;
+
+ frame->local = peerctx;
+
+ ret = glusterd_submit_request (peerctx->peerinfo->rpc, &req, frame,
+ &gd_clnt_mgmt_hndsk_prog,
+ GD_MGMT_HNDSK_VERSIONS, NULL, this,
+ glusterd_mgmt_hndsk_version_cbk,
+ (xdrproc_t)xdr_gf_mgmt_hndsk_req);
+ ret = 0;
+out:
+ if (ret && frame)
+ STACK_DESTROY (frame->root);
+
+ return ret;
+}
+
int
glusterd_set_clnt_mgmt_program (glusterd_peerinfo_t *peerinfo,
gf_prog_detail *prog)
{
- gf_prog_detail *trav = NULL;
- int ret = -1;
+ gf_prog_detail *trav = NULL;
+ int ret = -1;
if (!peerinfo || !prog)
goto out;
@@ -356,45 +1152,80 @@ glusterd_set_clnt_mgmt_program (glusterd_peerinfo_t *peerinfo,
trav = prog;
while (trav) {
- /* Select 'programs' */
+ ret = -1;
if ((gd_mgmt_prog.prognum == trav->prognum) &&
(gd_mgmt_prog.progver == trav->progver)) {
peerinfo->mgmt = &gd_mgmt_prog;
ret = 0;
}
+
if ((gd_peer_prog.prognum == trav->prognum) &&
(gd_peer_prog.progver == trav->progver)) {
peerinfo->peer = &gd_peer_prog;
ret = 0;
}
+
if (ret) {
gf_log ("", GF_LOG_DEBUG,
"%s (%"PRId64":%"PRId64") not supported",
trav->progname, trav->prognum,
trav->progver);
}
+
trav = trav->next;
}
if (peerinfo->mgmt) {
- gf_log ("", GF_LOG_INFO,
- "Using Program %s, Num (%d), Version (%d)",
- peerinfo->mgmt->progname, peerinfo->mgmt->prognum,
- peerinfo->mgmt->progver);
+ gf_log ("", GF_LOG_INFO,
+ "Using Program %s, Num (%d), Version (%d)",
+ peerinfo->mgmt->progname, peerinfo->mgmt->prognum,
+ peerinfo->mgmt->progver);
}
+
if (peerinfo->peer) {
- gf_log ("", GF_LOG_INFO,
- "Using Program %s, Num (%d), Version (%d)",
- peerinfo->peer->progname, peerinfo->peer->prognum,
- peerinfo->peer->progver);
+ gf_log ("", GF_LOG_INFO,
+ "Using Program %s, Num (%d), Version (%d)",
+ peerinfo->peer->progname, peerinfo->peer->prognum,
+ peerinfo->peer->progver);
}
+ if (peerinfo->mgmt_v3) {
+ gf_log ("", GF_LOG_INFO,
+ "Using Program %s, Num (%d), Version (%d)",
+ peerinfo->mgmt_v3->progname,
+ peerinfo->mgmt_v3->prognum,
+ peerinfo->mgmt_v3->progver);
+ }
+
+ ret = 0;
+out:
+ return ret;
+
+}
+
+static gf_boolean_t
+_mgmt_hndsk_prog_present (gf_prog_detail *prog) {
+ gf_boolean_t ret = _gf_false;
+ gf_prog_detail *trav = NULL;
+
+ GF_ASSERT (prog);
+
+ trav = prog;
+
+ while (trav) {
+ if ((trav->prognum == GD_MGMT_HNDSK_PROGRAM) &&
+ (trav->progver == GD_MGMT_HNDSK_VERSION)) {
+ ret = _gf_true;
+ goto out;
+ }
+ trav = trav->next;
+ }
out:
return ret;
}
int
-glusterd_peer_dump_version_cbk (struct rpc_req *req, struct iovec *iov,
+__glusterd_peer_dump_version_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
int ret = -1;
@@ -405,9 +1236,11 @@ glusterd_peer_dump_version_cbk (struct rpc_req *req, struct iovec *iov,
call_frame_t *frame = NULL;
glusterd_peerinfo_t *peerinfo = NULL;
glusterd_peerctx_t *peerctx = NULL;
+ glusterd_conf_t *conf = NULL;
char msg[1024] = {0,};
this = THIS;
+ conf = this->private;
frame = myframe;
peerctx = frame->local;
peerinfo = peerctx->peerinfo;
@@ -435,6 +1268,22 @@ glusterd_peer_dump_version_cbk (struct rpc_req *req, struct iovec *iov,
goto out;
}
+ if (_mgmt_hndsk_prog_present (rsp.prog)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Proceeding to op-version handshake with peer %s",
+ peerinfo->hostname);
+ ret = glusterd_mgmt_handshake (this, peerctx);
+ goto out;
+ } else if (conf->op_version > 1) {
+ ret = -1;
+ snprintf (msg, sizeof (msg),
+ "Peer %s does not support required op-version",
+ peerinfo->hostname);
+ peerctx->errstr = gf_strdup (msg);
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+ goto out;
+ }
+
/* Make sure we assign the proper program to peer */
ret = glusterd_set_clnt_mgmt_program (peerinfo, rsp.prog);
if (ret) {
@@ -454,10 +1303,11 @@ glusterd_peer_dump_version_cbk (struct rpc_req *req, struct iovec *iov,
peerctx->args.mode);
}
- glusterd_friend_sm ();
- glusterd_op_sm ();
+ glusterd_friend_sm();
+ glusterd_op_sm();
ret = 0;
+
out:
/* don't use GF_FREE, buffer was allocated by libc */
@@ -482,8 +1332,16 @@ out:
int
-glusterd_peer_handshake (xlator_t *this, struct rpc_clnt *rpc,
- glusterd_peerctx_t *peerctx)
+glusterd_peer_dump_version_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ __glusterd_peer_dump_version_cbk);
+}
+
+int
+glusterd_peer_dump_version (xlator_t *this, struct rpc_clnt *rpc,
+ glusterd_peerctx_t *peerctx)
{
call_frame_t *frame = NULL;
gf_dump_req req = {0,};
diff --git a/xlators/mgmt/glusterd/src/glusterd-hooks.c b/xlators/mgmt/glusterd/src/glusterd-hooks.c
index a61e1e85f..78730a564 100644
--- a/xlators/mgmt/glusterd/src/glusterd-hooks.c
+++ b/xlators/mgmt/glusterd/src/glusterd-hooks.c
@@ -49,6 +49,7 @@ char glusterd_hook_dirnames[GD_OP_MAX][256] =
[GD_OP_RESET_VOLUME] = EMPTY,
[GD_OP_SYNC_VOLUME] = EMPTY,
[GD_OP_LOG_ROTATE] = EMPTY,
+ [GD_OP_GSYNC_CREATE] = "gsync-create",
[GD_OP_GSYNC_SET] = EMPTY,
[GD_OP_PROFILE_VOLUME] = EMPTY,
[GD_OP_QUOTA] = EMPTY,
@@ -142,6 +143,24 @@ glusterd_hooks_get_hooks_cmd_subdir (glusterd_op_t op)
return glusterd_hook_dirnames[op];
}
+void
+glusterd_hooks_add_working_dir (runner_t *runner, glusterd_conf_t *priv)
+{
+ runner_argprintf (runner, "--gd-workdir=%s", priv->workdir);
+}
+
+void
+glusterd_hooks_add_op (runner_t *runner, char *op)
+{
+ runner_argprintf (runner, "--volume-op=%s", op);
+}
+
+void
+glusterd_hooks_add_hooks_version (runner_t* runner)
+{
+ runner_argprintf (runner, "--version=%d", GLUSTERD_HOOK_VER);
+}
+
int
glusterd_hooks_set_volume_args (dict_t *dict, runner_t *runner)
{
@@ -162,7 +181,7 @@ glusterd_hooks_set_volume_args (dict_t *dict, runner_t *runner)
goto out;
runner_add_arg (runner, "-o");
- for (i = 1; (ret == 0); i++) {
+ for (i = 1; ret == 0; i++) {
snprintf (query, sizeof (query), "key%d", i);
ret = dict_get_str (dict, query, &key);
if (ret)
@@ -185,6 +204,7 @@ static int
glusterd_hooks_add_op_args (runner_t *runner, glusterd_op_t op,
dict_t *op_ctx, glusterd_commit_hook_type_t type)
{
+ char *hooks_args = NULL;
int vol_count = 0;
gf_boolean_t truth = _gf_false;
glusterd_volinfo_t *voliter = NULL;
@@ -214,6 +234,11 @@ glusterd_hooks_add_op_args (runner_t *runner, glusterd_op_t op,
runner_argprintf (runner, "--first=%s",
truth? "yes":"no");
+
+ glusterd_hooks_add_hooks_version (runner);
+ glusterd_hooks_add_op (runner, "start");
+ glusterd_hooks_add_working_dir (runner, priv);
+
break;
case GD_OP_STOP_VOLUME:
@@ -236,6 +261,23 @@ glusterd_hooks_add_op_args (runner_t *runner, glusterd_op_t op,
ret = glusterd_hooks_set_volume_args (op_ctx, runner);
break;
+ case GD_OP_GSYNC_CREATE:
+ ret = dict_get_str (op_ctx, "hooks_args", &hooks_args);
+ if (ret)
+ gf_log ("", GF_LOG_DEBUG,
+ "No Hooks Arguments.");
+ else
+ gf_log ("", GF_LOG_DEBUG,
+ "Hooks Args = %s", hooks_args);
+ if (hooks_args)
+ runner_argprintf (runner, "%s", hooks_args);
+ break;
+
+ case GD_OP_ADD_BRICK:
+ glusterd_hooks_add_hooks_version (runner);
+ glusterd_hooks_add_op (runner, "add-brick");
+ glusterd_hooks_add_working_dir (runner, priv);
+
default:
break;
diff --git a/xlators/mgmt/glusterd/src/glusterd-locks.c b/xlators/mgmt/glusterd/src/glusterd-locks.c
new file mode 100644
index 000000000..28358aa55
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-locks.c
@@ -0,0 +1,656 @@
+/*
+ Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "common-utils.h"
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+#include "glusterd.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-store.h"
+#include "glusterd-utils.h"
+#include "glusterd-volgen.h"
+#include "glusterd-locks.h"
+#include "run.h"
+#include "syscall.h"
+
+#include <signal.h>
+
+#define GF_MAX_LOCKING_ENTITIES 2
+
+/* Valid entities that the mgmt_v3 lock can hold locks upon *
+ * To add newer entities to be locked, we can just add more *
+ * entries to this table along with the type and default value */
+glusterd_valid_entities valid_types[] = {
+ { "vol", _gf_true },
+ { "snap", _gf_false },
+ { NULL },
+};
+
+/* Checks if the lock request is for a valid entity */
+gf_boolean_t
+glusterd_mgmt_v3_is_type_valid (char *type)
+{
+ int32_t i = 0;
+ gf_boolean_t ret = _gf_false;
+
+ GF_ASSERT (type);
+
+ for (i = 0; valid_types[i].type; i++) {
+ if (!strcmp (type, valid_types[i].type)) {
+ ret = _gf_true;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/* Initialize the global mgmt_v3 lock list(dict) when
+ * glusterd is spawned */
+int32_t
+glusterd_mgmt_v3_lock_init ()
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ priv->mgmt_v3_lock = dict_new ();
+ if (!priv->mgmt_v3_lock)
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/* Destroy the global mgmt_v3 lock list(dict) when
+ * glusterd cleanup is performed */
+void
+glusterd_mgmt_v3_lock_fini ()
+{
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (priv->mgmt_v3_lock)
+ dict_unref (priv->mgmt_v3_lock);
+}
+
+int32_t
+glusterd_get_mgmt_v3_lock_owner (char *key, uuid_t *uuid)
+{
+ int32_t ret = -1;
+ glusterd_mgmt_v3_lock_obj *lock_obj = NULL;
+ glusterd_conf_t *priv = NULL;
+ uuid_t no_owner = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (!key || !uuid) {
+ gf_log (this->name, GF_LOG_ERROR, "key or uuid is null.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_bin (priv->mgmt_v3_lock, key, (void **) &lock_obj);
+ if (!ret)
+ uuid_copy (*uuid, lock_obj->lock_owner);
+ else
+ uuid_copy (*uuid, no_owner);
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* This function is called with the locked_count and type, to *
+ * release all the acquired locks. */
+static int32_t
+glusterd_release_multiple_locks_per_entity (dict_t *dict, uuid_t uuid,
+ int32_t locked_count,
+ char *type)
+{
+ char name_buf[PATH_MAX] = "";
+ char *name = NULL;
+ int32_t i = -1;
+ int32_t op_ret = 0;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+ GF_ASSERT (dict);
+ GF_ASSERT (type);
+
+ if (locked_count == 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "No %s locked as part of this transaction",
+ type);
+ goto out;
+ }
+
+ /* Release all the locks held */
+ for (i = 0; i < locked_count; i++) {
+ snprintf (name_buf, sizeof(name_buf),
+ "%sname%d", type, i+1);
+
+ /* Looking for volname1, volname2 or snapname1, *
+ * as key in the dict snapname2 */
+ ret = dict_get_str (dict, name_buf, &name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get %s locked_count = %d",
+ name_buf, locked_count);
+ op_ret = ret;
+ continue;
+ }
+
+ ret = glusterd_mgmt_v3_unlock (name, uuid, type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to release lock for %s.",
+ name);
+ op_ret = ret;
+ }
+ }
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", op_ret);
+ return op_ret;
+}
+
+/* Given the count and type of the entity this function acquires *
+ * locks on multiple elements of the same entity. For example: *
+ * If type is "vol" this function tries to acquire locks on multiple *
+ * volumes */
+static int32_t
+glusterd_acquire_multiple_locks_per_entity (dict_t *dict, uuid_t uuid,
+ int32_t count, char *type)
+{
+ char name_buf[PATH_MAX] = "";
+ char *name = NULL;
+ int32_t i = -1;
+ int32_t ret = -1;
+ int32_t locked_count = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+ GF_ASSERT (dict);
+ GF_ASSERT (type);
+
+ /* Locking one element after other */
+ for (i = 0; i < count; i++) {
+ snprintf (name_buf, sizeof(name_buf),
+ "%sname%d", type, i+1);
+
+ /* Looking for volname1, volname2 or snapname1, *
+ * as key in the dict snapname2 */
+ ret = dict_get_str (dict, name_buf, &name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get %s count = %d",
+ name_buf, count);
+ break;
+ }
+
+ ret = glusterd_mgmt_v3_lock (name, uuid, type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to acquire lock for %s %s "
+ "on behalf of %s. Reversing "
+ "this transaction", type, name,
+ uuid_utoa(uuid));
+ break;
+ }
+ locked_count++;
+ }
+
+ if (count == locked_count) {
+ /* If all locking ops went successfuly, return as success */
+ ret = 0;
+ goto out;
+ }
+
+ /* If we failed to lock one element, unlock others and return failure */
+ ret = glusterd_release_multiple_locks_per_entity (dict, uuid,
+ locked_count,
+ type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to release multiple %s locks",
+ type);
+ }
+ ret = -1;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Given the type of entity, this function figures out if it should unlock a *
+ * single element of multiple elements of the said entity. For example: *
+ * if the type is "vol", this function will accordingly unlock a single volume *
+ * or multiple volumes */
+static int32_t
+glusterd_mgmt_v3_unlock_entity (dict_t *dict, uuid_t uuid, char *type,
+ gf_boolean_t default_value)
+{
+ char name_buf[PATH_MAX] = "";
+ char *name = NULL;
+ int32_t count = -1;
+ int32_t ret = -1;
+ gf_boolean_t hold_locks = _gf_false;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+ GF_ASSERT (dict);
+ GF_ASSERT (type);
+
+ snprintf (name_buf, sizeof(name_buf), "hold_%s_locks", type);
+ hold_locks = dict_get_str_boolean (dict, name_buf, default_value);
+
+ if (hold_locks == _gf_false) {
+ /* Locks were not held for this particular entity *
+ * Hence nothing to release */
+ ret = 0;
+ goto out;
+ }
+
+ /* Looking for volcount or snapcount in the dict */
+ snprintf (name_buf, sizeof(name_buf), "%scount", type);
+ ret = dict_get_int32 (dict, name_buf, &count);
+ if (ret) {
+ /* count is not present. Only one *
+ * element name needs to be unlocked */
+ snprintf (name_buf, sizeof(name_buf), "%sname",
+ type);
+ ret = dict_get_str (dict, name_buf, &name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to fetch %sname", type);
+ goto out;
+ }
+
+ ret = glusterd_mgmt_v3_unlock (name, uuid, type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to release lock for %s %s "
+ "on behalf of %s.", type, name,
+ uuid_utoa(uuid));
+ goto out;
+ }
+ } else {
+ /* Unlocking one element name after another */
+ ret = glusterd_release_multiple_locks_per_entity (dict,
+ uuid,
+ count,
+ type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to release all %s locks", type);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Given the type of entity, this function figures out if it should lock a *
+ * single element or multiple elements of the said entity. For example: *
+ * if the type is "vol", this function will accordingly lock a single volume *
+ * or multiple volumes */
+static int32_t
+glusterd_mgmt_v3_lock_entity (dict_t *dict, uuid_t uuid, char *type,
+ gf_boolean_t default_value)
+{
+ char name_buf[PATH_MAX] = "";
+ char *name = NULL;
+ int32_t count = -1;
+ int32_t ret = -1;
+ gf_boolean_t hold_locks = _gf_false;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+ GF_ASSERT (dict);
+ GF_ASSERT (type);
+
+ snprintf (name_buf, sizeof(name_buf), "hold_%s_locks", type);
+ hold_locks = dict_get_str_boolean (dict, name_buf, default_value);
+
+ if (hold_locks == _gf_false) {
+ /* Not holding locks for this particular entity */
+ ret = 0;
+ goto out;
+ }
+
+ /* Looking for volcount or snapcount in the dict */
+ snprintf (name_buf, sizeof(name_buf), "%scount", type);
+ ret = dict_get_int32 (dict, name_buf, &count);
+ if (ret) {
+ /* count is not present. Only one *
+ * element name needs to be locked */
+ snprintf (name_buf, sizeof(name_buf), "%sname",
+ type);
+ ret = dict_get_str (dict, name_buf, &name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to fetch %sname", type);
+ goto out;
+ }
+
+ ret = glusterd_mgmt_v3_lock (name, uuid, type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to acquire lock for %s %s "
+ "on behalf of %s.", type, name,
+ uuid_utoa(uuid));
+ goto out;
+ }
+ } else {
+ /* Locking one element name after another */
+ ret = glusterd_acquire_multiple_locks_per_entity (dict,
+ uuid,
+ count,
+ type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to acquire all %s locks", type);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Try to release locks of multiple entities like *
+ * volume, snaps etc. */
+int32_t
+glusterd_multiple_mgmt_v3_unlock (dict_t *dict, uuid_t uuid)
+{
+ int32_t i = -1;
+ int32_t ret = -1;
+ int32_t op_ret = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+
+ if (!dict) {
+ gf_log (this->name, GF_LOG_ERROR, "dict is null.");
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; valid_types[i].type; i++) {
+ ret = glusterd_mgmt_v3_unlock_entity
+ (dict, uuid,
+ valid_types[i].type,
+ valid_types[i].default_value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to unlock all %s",
+ valid_types[i].type);
+ op_ret = ret;
+ }
+ }
+
+ ret = op_ret;
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+/* Try to acquire locks on multiple entities like *
+ * volume, snaps etc. */
+int32_t
+glusterd_multiple_mgmt_v3_lock (dict_t *dict, uuid_t uuid)
+{
+ int32_t i = -1;
+ int32_t ret = -1;
+ int32_t locked_count = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+
+ if (!dict) {
+ gf_log (this->name, GF_LOG_ERROR, "dict is null.");
+ ret = -1;
+ goto out;
+ }
+
+ /* Locking one entity after other */
+ for (i = 0; valid_types[i].type; i++) {
+ ret = glusterd_mgmt_v3_lock_entity
+ (dict, uuid,
+ valid_types[i].type,
+ valid_types[i].default_value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to lock all %s",
+ valid_types[i].type);
+ break;
+ }
+ locked_count++;
+ }
+
+ if (locked_count == GF_MAX_LOCKING_ENTITIES) {
+ /* If all locking ops went successfuly, return as success */
+ ret = 0;
+ goto out;
+ }
+
+ /* If we failed to lock one entity, unlock others and return failure */
+ for (i = 0; i < locked_count; i++) {
+ ret = glusterd_mgmt_v3_unlock_entity
+ (dict, uuid,
+ valid_types[i].type,
+ valid_types[i].default_value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to unlock all %s",
+ valid_types[i].type);
+ }
+ }
+ ret = -1;
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_mgmt_v3_lock (const char *name, uuid_t uuid, char *type)
+{
+ char key[PATH_MAX] = "";
+ int32_t ret = -1;
+ glusterd_mgmt_v3_lock_obj *lock_obj = NULL;
+ glusterd_conf_t *priv = NULL;
+ gf_boolean_t is_valid = _gf_true;
+ uuid_t owner = {0};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (!name || !type) {
+ gf_log (this->name, GF_LOG_ERROR, "name or type is null.");
+ ret = -1;
+ goto out;
+ }
+
+ is_valid = glusterd_mgmt_v3_is_type_valid (type);
+ if (is_valid != _gf_true) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Invalid entity. Cannot perform locking "
+ "operation on %s types", type);
+ ret = -1;
+ goto out;
+ }
+
+ ret = snprintf (key, sizeof(key), "%s_%s", name, type);
+ if (ret != strlen(name) + 1 + strlen(type)) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "Unable to create key");
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Trying to acquire lock of %s %s for %s as %s",
+ type, name, uuid_utoa (uuid), key);
+
+ ret = glusterd_get_mgmt_v3_lock_owner (key, &owner);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Unable to get mgmt_v3 lock owner");
+ goto out;
+ }
+
+ /* If the lock has already been held for the given volume
+ * we fail */
+ if (!uuid_is_null (owner)) {
+ gf_log (this->name, GF_LOG_WARNING, "Lock for %s held by %s",
+ name, uuid_utoa (owner));
+ ret = -1;
+ goto out;
+ }
+
+ lock_obj = GF_CALLOC (1, sizeof(glusterd_mgmt_v3_lock_obj),
+ gf_common_mt_mgmt_v3_lock_obj_t);
+ if (!lock_obj) {
+ ret = -1;
+ goto out;
+ }
+
+ uuid_copy (lock_obj->lock_owner, uuid);
+
+ ret = dict_set_bin (priv->mgmt_v3_lock, key, lock_obj,
+ sizeof(glusterd_mgmt_v3_lock_obj));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set lock owner in mgmt_v3 lock");
+ if (lock_obj)
+ GF_FREE (lock_obj);
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Lock for %s %s successfully held by %s",
+ type, name, uuid_utoa (uuid));
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_mgmt_v3_unlock (const char *name, uuid_t uuid, char *type)
+{
+ char key[PATH_MAX] = "";
+ int32_t ret = -1;
+ gf_boolean_t is_valid = _gf_true;
+ glusterd_conf_t *priv = NULL;
+ uuid_t owner = {0};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (!name || !type) {
+ gf_log (this->name, GF_LOG_ERROR, "name is null.");
+ ret = -1;
+ goto out;
+ }
+
+ is_valid = glusterd_mgmt_v3_is_type_valid (type);
+ if (is_valid != _gf_true) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Invalid entity. Cannot perform unlocking "
+ "operation on %s types", type);
+ ret = -1;
+ goto out;
+ }
+
+ ret = snprintf (key, sizeof(key), "%s_%s",
+ name, type);
+ if (ret != strlen(name) + 1 + strlen(type)) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to create key");
+ ret = -1;
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Trying to release lock of %s %s for %s as %s",
+ type, name, uuid_utoa (uuid), key);
+
+ ret = glusterd_get_mgmt_v3_lock_owner (key, &owner);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Unable to get mgmt_v3 lock owner");
+ goto out;
+ }
+
+ if (uuid_is_null (owner)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Lock for %s %s not held", type, name);
+ ret = -1;
+ goto out;
+ }
+
+ ret = uuid_compare (uuid, owner);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Lock owner mismatch. "
+ "Lock for %s %s held by %s",
+ type, name, uuid_utoa (owner));
+ goto out;
+ }
+
+ /* Removing the mgmt_v3 lock from the global list */
+ dict_del (priv->mgmt_v3_lock, key);
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Lock for %s %s successfully released",
+ type, name);
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-locks.h b/xlators/mgmt/glusterd/src/glusterd-locks.h
new file mode 100644
index 000000000..b9cc8c0d1
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-locks.h
@@ -0,0 +1,51 @@
+/*
+ Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_LOCKS_H_
+#define _GLUSTERD_LOCKS_H_
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+typedef struct glusterd_mgmt_v3_lock_object_ {
+ uuid_t lock_owner;
+} glusterd_mgmt_v3_lock_obj;
+
+typedef struct glusterd_mgmt_v3_lock_valid_entities {
+ char *type; /* Entity type like vol, snap */
+ gf_boolean_t default_value; /* The default value that *
+ * determines if the locks *
+ * should be held for that *
+ * entity */
+} glusterd_valid_entities;
+
+int32_t
+glusterd_mgmt_v3_lock_init ();
+
+void
+glusterd_mgmt_v3_lock_fini ();
+
+int32_t
+glusterd_get_mgmt_v3_lock_owner (char *volname, uuid_t *uuid);
+
+int32_t
+glusterd_mgmt_v3_lock (const char *key, uuid_t uuid, char *type);
+
+int32_t
+glusterd_mgmt_v3_unlock (const char *key, uuid_t uuid, char *type);
+
+int32_t
+glusterd_multiple_mgmt_v3_lock (dict_t *dict, uuid_t uuid);
+
+int32_t
+glusterd_multiple_mgmt_v3_unlock (dict_t *dict, uuid_t uuid);
+
+#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-log-ops.c b/xlators/mgmt/glusterd/src/glusterd-log-ops.c
index 9ab52a349..33bd95c03 100644
--- a/xlators/mgmt/glusterd/src/glusterd-log-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-log-ops.c
@@ -24,18 +24,22 @@
#include <signal.h>
int
-glusterd_handle_log_rotate (rpcsvc_request_t *req)
+__glusterd_handle_log_rotate (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
dict_t *dict = NULL;
glusterd_op_t cli_op = GD_OP_LOG_ROTATE;
char *volname = NULL;
+ char msg[2048] = {0,};
+ xlator_t *this = NULL;
GF_ASSERT (req);
+ this = THIS;
+ GF_ASSERT (this);
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
@@ -49,43 +53,50 @@ glusterd_handle_log_rotate (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to "
"unserialize req-buffer to dictionary");
+ snprintf (msg, sizeof (msg), "Unable to decode the "
+ "command");
goto out;
}
}
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "failed to get volname");
+ snprintf (msg, sizeof (msg), "Failed to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
- gf_log ("glusterd", GF_LOG_INFO, "Received log rotate req "
+ gf_log (this->name, GF_LOG_INFO, "Received log rotate req "
"for volume %s", volname);
ret = dict_set_uint64 (dict, "rotate-key", (uint64_t)time (NULL));
if (ret)
goto out;
- ret = glusterd_op_begin (req, GD_OP_LOG_ROTATE, dict);
+ ret = glusterd_op_begin_synctask (req, GD_OP_LOG_ROTATE, dict);
out:
- glusterd_friend_sm ();
- glusterd_op_sm ();
-
if (ret) {
+ if (msg[0] == '\0')
+ snprintf (msg, sizeof (msg), "Operation failed");
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
- dict, "operation failed");
- if (dict)
- dict_unref (dict);
+ dict, msg);
}
free (cli_req.dict.dict_val);
return ret;
}
+int
+glusterd_handle_log_rotate (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_log_rotate);
+}
+
/* op-sm */
int
glusterd_op_stage_log_rotate (dict_t *dict, char **op_errstr)
@@ -156,7 +167,6 @@ glusterd_op_log_rotate (dict_t *dict)
xlator_t *this = NULL;
char *volname = NULL;
char *brick = NULL;
- char path[PATH_MAX] = {0,};
char logfile[PATH_MAX] = {0,};
char pidfile[PATH_MAX] = {0,};
FILE *file = NULL;
@@ -212,10 +222,7 @@ cont:
valid_brick = 1;
- GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv);
- GLUSTERD_GET_BRICK_PIDFILE (pidfile, path, brickinfo->hostname,
- brickinfo->path);
-
+ GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, priv);
file = fopen (pidfile, "r+");
if (!file) {
gf_log ("", GF_LOG_ERROR, "Unable to open pidfile: %s",
diff --git a/xlators/mgmt/glusterd/src/glusterd-mem-types.h b/xlators/mgmt/glusterd/src/glusterd-mem-types.h
index a13236da1..e6f6a0333 100644
--- a/xlators/mgmt/glusterd/src/glusterd-mem-types.h
+++ b/xlators/mgmt/glusterd/src/glusterd-mem-types.h
@@ -65,7 +65,11 @@ typedef enum gf_gld_mem_types_ {
gf_gld_mt_charptr = gf_common_mt_end + 49,
gf_gld_mt_hooks_stub_t = gf_common_mt_end + 50,
gf_gld_mt_hooks_priv_t = gf_common_mt_end + 51,
- gf_gld_mt_end = gf_common_mt_end + 52,
+ gf_gld_mt_mop_commit_req_t = gf_common_mt_end + 52,
+ gf_gld_mt_int = gf_common_mt_end + 53,
+ gf_gld_mt_snap_t = gf_common_mt_end + 54,
+ gf_gld_mt_missed_snapinfo_t = gf_common_mt_end + 55,
+ gf_gld_mt_end = gf_common_mt_end + 56,
} gf_gld_mem_types_t;
#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c b/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c
new file mode 100644
index 000000000..81c5aa579
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c
@@ -0,0 +1,936 @@
+/*
+ Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+/* rpc related syncops */
+#include "rpc-clnt.h"
+#include "protocol-common.h"
+#include "xdr-generic.h"
+#include "glusterd1-xdr.h"
+#include "glusterd-syncop.h"
+
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-locks.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-op-sm.h"
+
+static int
+glusterd_mgmt_v3_null (rpcsvc_request_t *req)
+{
+ return 0;
+}
+
+static int
+glusterd_mgmt_v3_lock_send_resp (rpcsvc_request_t *req, int32_t status)
+{
+
+ gd1_mgmt_v3_lock_rsp rsp = {{0},};
+ int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ rsp.op_ret = status;
+ if (rsp.op_ret)
+ rsp.op_errno = errno;
+
+ glusterd_get_uuid (&rsp.uuid);
+
+ ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp);
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Responded to mgmt_v3 lock, ret: %d", ret);
+
+ return ret;
+}
+
+static int
+glusterd_synctasked_mgmt_v3_lock (rpcsvc_request_t *req,
+ gd1_mgmt_v3_lock_req *lock_req,
+ glusterd_op_lock_ctx_t *ctx)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ GF_ASSERT (ctx);
+ GF_ASSERT (ctx->dict);
+
+ /* Trying to acquire multiple mgmt_v3 locks */
+ ret = glusterd_multiple_mgmt_v3_lock (ctx->dict, ctx->uuid);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to acquire mgmt_v3 locks for %s",
+ uuid_utoa (ctx->uuid));
+
+ ret = glusterd_mgmt_v3_lock_send_resp (req, ret);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+static int
+glusterd_op_state_machine_mgmt_v3_lock (rpcsvc_request_t *req,
+ gd1_mgmt_v3_lock_req *lock_req,
+ glusterd_op_lock_ctx_t *ctx)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_op_info_t txn_op_info = {{0},};
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ glusterd_txn_opinfo_init (&txn_op_info, NULL, &lock_req->op,
+ ctx->dict, req);
+
+ ret = glusterd_set_txn_opinfo (&lock_req->txn_id, &txn_op_info);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set transaction's opinfo");
+ goto out;
+ }
+
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_LOCK,
+ &lock_req->txn_id, ctx);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to inject event GD_OP_EVENT_LOCK");
+
+out:
+ glusterd_friend_sm ();
+ glusterd_op_sm ();
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+static int
+glusterd_handle_mgmt_v3_lock_fn (rpcsvc_request_t *req)
+{
+ gd1_mgmt_v3_lock_req lock_req = {{0},};
+ int32_t ret = -1;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_op_lock_ctx_t *ctx = NULL;
+ xlator_t *this = NULL;
+ gf_boolean_t is_synctasked = _gf_false;
+ gf_boolean_t free_ctx = _gf_false;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ ret = xdr_to_generic (req->msg[0], &lock_req,
+ (xdrproc_t)xdr_gd1_mgmt_v3_lock_req);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to decode lock "
+ "request received from peer");
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG, "Received mgmt_v3 lock req "
+ "from uuid: %s", uuid_utoa (lock_req.uuid));
+
+ if (glusterd_friend_find_by_uuid (lock_req.uuid, &peerinfo)) {
+ gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
+ "belong to the cluster. Ignoring request.",
+ uuid_utoa (lock_req.uuid));
+ ret = -1;
+ goto out;
+ }
+
+ ctx = GF_CALLOC (1, sizeof (*ctx), gf_gld_mt_op_lock_ctx_t);
+ if (!ctx) {
+ ret = -1;
+ goto out;
+ }
+
+ uuid_copy (ctx->uuid, lock_req.uuid);
+ ctx->req = req;
+
+ ctx->dict = dict_new ();
+ if (!ctx->dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (lock_req.dict.dict_val,
+ lock_req.dict.dict_len, &ctx->dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to unserialize the dictionary");
+ goto out;
+ }
+
+ is_synctasked = dict_get_str_boolean (ctx->dict,
+ "is_synctasked", _gf_false);
+ if (is_synctasked) {
+ ret = glusterd_synctasked_mgmt_v3_lock (req, &lock_req, ctx);
+ /* The above function does not take ownership of ctx.
+ * Therefore we need to free the ctx explicitly. */
+ free_ctx = _gf_true;
+ }
+ else {
+ ret = glusterd_op_state_machine_mgmt_v3_lock (req, &lock_req,
+ ctx);
+ }
+
+out:
+
+ if (ret || free_ctx) {
+ if (ctx->dict)
+ dict_unref (ctx->dict);
+ if (ctx)
+ GF_FREE (ctx);
+ }
+
+ free (lock_req.dict.dict_val);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+static int
+glusterd_mgmt_v3_pre_validate_send_resp (rpcsvc_request_t *req,
+ int32_t op, int32_t status,
+ char *op_errstr, dict_t *rsp_dict)
+{
+ gd1_mgmt_v3_pre_val_rsp rsp = {{0},};
+ int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ rsp.op_ret = status;
+ glusterd_get_uuid (&rsp.uuid);
+ rsp.op = op;
+ if (op_errstr)
+ rsp.op_errstr = op_errstr;
+ else
+ rsp.op_errstr = "";
+
+ ret = dict_allocate_and_serialize (rsp_dict, &rsp.dict.dict_val,
+ &rsp.dict.dict_len);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to get serialized length of dict");
+ goto out;
+ }
+
+ ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gd1_mgmt_v3_pre_val_rsp);
+
+ GF_FREE (rsp.dict.dict_val);
+out:
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Responded to pre validation, ret: %d", ret);
+ return ret;
+}
+
+static int
+glusterd_handle_pre_validate_fn (rpcsvc_request_t *req)
+{
+ int32_t ret = -1;
+ gd1_mgmt_v3_pre_val_req op_req = {{0},};
+ glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ char *op_errstr = NULL;
+ dict_t *dict = NULL;
+ dict_t *rsp_dict = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ ret = xdr_to_generic (req->msg[0], &op_req,
+ (xdrproc_t)xdr_gd1_mgmt_v3_pre_val_req);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to decode pre validation "
+ "request received from peer");
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ if (glusterd_friend_find_by_uuid (op_req.uuid, &peerinfo)) {
+ gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
+ "belong to the cluster. Ignoring request.",
+ uuid_utoa (op_req.uuid));
+ ret = -1;
+ goto out;
+ }
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ ret = dict_unserialize (op_req.dict.dict_val,
+ op_req.dict.dict_len, &dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to unserialize the dictionary");
+ goto out;
+ }
+
+ rsp_dict = dict_new ();
+ if (!rsp_dict) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get new dictionary");
+ return -1;
+ }
+
+ ret = gd_mgmt_v3_pre_validate_fn (op_req.op, dict, &op_errstr,
+ rsp_dict);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Pre Validation failed on operation %s",
+ gd_op_list[op_req.op]);
+ }
+
+ ret = glusterd_mgmt_v3_pre_validate_send_resp (req, op_req.op,
+ ret, op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to send Pre Validation "
+ "response for operation %s",
+ gd_op_list[op_req.op]);
+ goto out;
+ }
+
+out:
+ if (op_errstr && (strcmp (op_errstr, "")))
+ GF_FREE (op_errstr);
+
+ free (op_req.dict.dict_val);
+
+ if (dict)
+ dict_unref (dict);
+
+ if (rsp_dict)
+ dict_unref (rsp_dict);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+static int
+glusterd_mgmt_v3_brick_op_send_resp (rpcsvc_request_t *req,
+ int32_t op, int32_t status,
+ char *op_errstr, dict_t *rsp_dict)
+{
+ gd1_mgmt_v3_brick_op_rsp rsp = {{0},};
+ int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ rsp.op_ret = status;
+ glusterd_get_uuid (&rsp.uuid);
+ rsp.op = op;
+ if (op_errstr)
+ rsp.op_errstr = op_errstr;
+ else
+ rsp.op_errstr = "";
+
+ ret = dict_allocate_and_serialize (rsp_dict, &rsp.dict.dict_val,
+ &rsp.dict.dict_len);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to get serialized length of dict");
+ goto out;
+ }
+
+ ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gd1_mgmt_v3_brick_op_rsp);
+
+ GF_FREE (rsp.dict.dict_val);
+out:
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Responded to brick op, ret: %d", ret);
+ return ret;
+}
+
+static int
+glusterd_handle_brick_op_fn (rpcsvc_request_t *req)
+{
+ int32_t ret = -1;
+ gd1_mgmt_v3_brick_op_req op_req = {{0},};
+ glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ char *op_errstr = NULL;
+ dict_t *dict = NULL;
+ dict_t *rsp_dict = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ ret = xdr_to_generic (req->msg[0], &op_req,
+ (xdrproc_t)xdr_gd1_mgmt_v3_brick_op_req);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to decode brick op "
+ "request received from peer");
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ if (glusterd_friend_find_by_uuid (op_req.uuid, &peerinfo)) {
+ gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
+ "belong to the cluster. Ignoring request.",
+ uuid_utoa (op_req.uuid));
+ ret = -1;
+ goto out;
+ }
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ ret = dict_unserialize (op_req.dict.dict_val,
+ op_req.dict.dict_len, &dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to unserialize the dictionary");
+ goto out;
+ }
+
+ rsp_dict = dict_new ();
+ if (!rsp_dict) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get new dictionary");
+ return -1;
+ }
+
+ ret = gd_mgmt_v3_brick_op_fn (op_req.op, dict, &op_errstr,
+ rsp_dict);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Brick Op failed on operation %s",
+ gd_op_list[op_req.op]);
+ }
+
+ ret = glusterd_mgmt_v3_brick_op_send_resp (req, op_req.op,
+ ret, op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to send brick op "
+ "response for operation %s",
+ gd_op_list[op_req.op]);
+ goto out;
+ }
+
+out:
+ if (op_errstr && (strcmp (op_errstr, "")))
+ GF_FREE (op_errstr);
+
+ free (op_req.dict.dict_val);
+
+ if (dict)
+ dict_unref (dict);
+
+ if (rsp_dict)
+ dict_unref (rsp_dict);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+static int
+glusterd_mgmt_v3_commit_send_resp (rpcsvc_request_t *req,
+ int32_t op, int32_t status,
+ char *op_errstr, dict_t *rsp_dict)
+{
+ gd1_mgmt_v3_commit_rsp rsp = {{0},};
+ int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ rsp.op_ret = status;
+ glusterd_get_uuid (&rsp.uuid);
+ rsp.op = op;
+ if (op_errstr)
+ rsp.op_errstr = op_errstr;
+ else
+ rsp.op_errstr = "";
+
+ ret = dict_allocate_and_serialize (rsp_dict, &rsp.dict.dict_val,
+ &rsp.dict.dict_len);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to get serialized length of dict");
+ goto out;
+ }
+
+ ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gd1_mgmt_v3_commit_rsp);
+
+ GF_FREE (rsp.dict.dict_val);
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "Responded to commit, ret: %d", ret);
+ return ret;
+}
+
+static int
+glusterd_handle_commit_fn (rpcsvc_request_t *req)
+{
+ int32_t ret = -1;
+ gd1_mgmt_v3_commit_req op_req = {{0},};
+ glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ char *op_errstr = NULL;
+ dict_t *dict = NULL;
+ dict_t *rsp_dict = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ ret = xdr_to_generic (req->msg[0], &op_req,
+ (xdrproc_t)xdr_gd1_mgmt_v3_commit_req);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to decode commit "
+ "request received from peer");
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ if (glusterd_friend_find_by_uuid (op_req.uuid, &peerinfo)) {
+ gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
+ "belong to the cluster. Ignoring request.",
+ uuid_utoa (op_req.uuid));
+ ret = -1;
+ goto out;
+ }
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ ret = dict_unserialize (op_req.dict.dict_val,
+ op_req.dict.dict_len, &dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to unserialize the dictionary");
+ goto out;
+ }
+
+ rsp_dict = dict_new ();
+ if (!rsp_dict) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get new dictionary");
+ return -1;
+ }
+
+ ret = gd_mgmt_v3_commit_fn (op_req.op, dict, &op_errstr,
+ rsp_dict);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "commit failed on operation %s",
+ gd_op_list[op_req.op]);
+ }
+
+ ret = glusterd_mgmt_v3_commit_send_resp (req, op_req.op,
+ ret, op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to send commit "
+ "response for operation %s",
+ gd_op_list[op_req.op]);
+ goto out;
+ }
+
+out:
+ if (op_errstr && (strcmp (op_errstr, "")))
+ GF_FREE (op_errstr);
+
+ free (op_req.dict.dict_val);
+
+ if (dict)
+ dict_unref (dict);
+
+ if (rsp_dict)
+ dict_unref (rsp_dict);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+static int
+glusterd_mgmt_v3_post_validate_send_resp (rpcsvc_request_t *req,
+ int32_t op, int32_t status,
+ char *op_errstr, dict_t *rsp_dict)
+{
+ gd1_mgmt_v3_post_val_rsp rsp = {{0},};
+ int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ rsp.op_ret = status;
+ glusterd_get_uuid (&rsp.uuid);
+ rsp.op = op;
+ if (op_errstr)
+ rsp.op_errstr = op_errstr;
+ else
+ rsp.op_errstr = "";
+
+ ret = dict_allocate_and_serialize (rsp_dict, &rsp.dict.dict_val,
+ &rsp.dict.dict_len);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to get serialized length of dict");
+ goto out;
+ }
+
+ ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gd1_mgmt_v3_post_val_rsp);
+
+ GF_FREE (rsp.dict.dict_val);
+out:
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Responded to post validation, ret: %d", ret);
+ return ret;
+}
+
+static int
+glusterd_handle_post_validate_fn (rpcsvc_request_t *req)
+{
+ int32_t ret = -1;
+ gd1_mgmt_v3_post_val_req op_req = {{0},};
+ glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ char *op_errstr = NULL;
+ dict_t *dict = NULL;
+ dict_t *rsp_dict = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ ret = xdr_to_generic (req->msg[0], &op_req,
+ (xdrproc_t)xdr_gd1_mgmt_v3_post_val_req);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to decode post validation "
+ "request received from peer");
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ if (glusterd_friend_find_by_uuid (op_req.uuid, &peerinfo)) {
+ gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
+ "belong to the cluster. Ignoring request.",
+ uuid_utoa (op_req.uuid));
+ ret = -1;
+ goto out;
+ }
+
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ ret = dict_unserialize (op_req.dict.dict_val,
+ op_req.dict.dict_len, &dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to unserialize the dictionary");
+ goto out;
+ }
+
+ rsp_dict = dict_new ();
+ if (!rsp_dict) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get new dictionary");
+ return -1;
+ }
+
+ ret = gd_mgmt_v3_post_validate_fn (op_req.op, op_req.op_ret, dict,
+ &op_errstr, rsp_dict);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Post Validation failed on operation %s",
+ gd_op_list[op_req.op]);
+ }
+
+ ret = glusterd_mgmt_v3_post_validate_send_resp (req, op_req.op,
+ ret, op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to send Post Validation "
+ "response for operation %s",
+ gd_op_list[op_req.op]);
+ goto out;
+ }
+
+out:
+ if (op_errstr && (strcmp (op_errstr, "")))
+ GF_FREE (op_errstr);
+
+ free (op_req.dict.dict_val);
+
+ if (dict)
+ dict_unref (dict);
+
+ if (rsp_dict)
+ dict_unref (rsp_dict);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+static int
+glusterd_mgmt_v3_unlock_send_resp (rpcsvc_request_t *req, int32_t status)
+{
+
+ gd1_mgmt_v3_unlock_rsp rsp = {{0},};
+ int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ rsp.op_ret = status;
+ if (rsp.op_ret)
+ rsp.op_errno = errno;
+
+ glusterd_get_uuid (&rsp.uuid);
+
+ ret = glusterd_submit_reply (req, &rsp, NULL, 0, NULL,
+ (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp);
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Responded to mgmt_v3 unlock, ret: %d", ret);
+
+ return ret;
+}
+
+static int
+glusterd_syctasked_mgmt_v3_unlock (rpcsvc_request_t *req,
+ gd1_mgmt_v3_unlock_req *unlock_req,
+ glusterd_op_lock_ctx_t *ctx)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ GF_ASSERT (ctx);
+
+ /* Trying to release multiple mgmt_v3 locks */
+ ret = glusterd_multiple_mgmt_v3_unlock (ctx->dict, ctx->uuid);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to release mgmt_v3 locks for %s",
+ uuid_utoa(ctx->uuid));
+ }
+
+ ret = glusterd_mgmt_v3_unlock_send_resp (req, ret);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+
+static int
+glusterd_op_state_machine_mgmt_v3_unlock (rpcsvc_request_t *req,
+ gd1_mgmt_v3_unlock_req *lock_req,
+ glusterd_op_lock_ctx_t *ctx)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_UNLOCK,
+ &lock_req->txn_id, ctx);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to inject event GD_OP_EVENT_UNLOCK");
+
+ glusterd_friend_sm ();
+ glusterd_op_sm ();
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+static int
+glusterd_handle_mgmt_v3_unlock_fn (rpcsvc_request_t *req)
+{
+ gd1_mgmt_v3_unlock_req lock_req = {{0},};
+ int32_t ret = -1;
+ glusterd_op_lock_ctx_t *ctx = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ gf_boolean_t is_synctasked = _gf_false;
+ gf_boolean_t free_ctx = _gf_false;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ ret = xdr_to_generic (req->msg[0], &lock_req,
+ (xdrproc_t)xdr_gd1_mgmt_v3_unlock_req);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to decode unlock "
+ "request received from peer");
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG, "Received volume unlock req "
+ "from uuid: %s", uuid_utoa (lock_req.uuid));
+
+ if (glusterd_friend_find_by_uuid (lock_req.uuid, &peerinfo)) {
+ gf_log (this->name, GF_LOG_WARNING, "%s doesn't "
+ "belong to the cluster. Ignoring request.",
+ uuid_utoa (lock_req.uuid));
+ ret = -1;
+ goto out;
+ }
+
+ ctx = GF_CALLOC (1, sizeof (*ctx), gf_gld_mt_op_lock_ctx_t);
+ if (!ctx) {
+ ret = -1;
+ goto out;
+ }
+
+ uuid_copy (ctx->uuid, lock_req.uuid);
+ ctx->req = req;
+
+ ctx->dict = dict_new ();
+ if (!ctx->dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_unserialize (lock_req.dict.dict_val,
+ lock_req.dict.dict_len, &ctx->dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to unserialize the dictionary");
+ goto out;
+ }
+
+ is_synctasked = dict_get_str_boolean (ctx->dict,
+ "is_synctasked", _gf_false);
+ if (is_synctasked) {
+ ret = glusterd_syctasked_mgmt_v3_unlock (req, &lock_req, ctx);
+ /* The above function does not take ownership of ctx.
+ * Therefore we need to free the ctx explicitly. */
+ free_ctx = _gf_true;
+ }
+ else {
+ ret = glusterd_op_state_machine_mgmt_v3_unlock (req, &lock_req,
+ ctx);
+ }
+
+out:
+
+ if (ret || free_ctx) {
+ if (ctx->dict)
+ dict_unref (ctx->dict);
+ if (ctx)
+ GF_FREE (ctx);
+ }
+
+ free (lock_req.dict.dict_val);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_handle_mgmt_v3_lock (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ glusterd_handle_mgmt_v3_lock_fn);
+}
+
+static int
+glusterd_handle_pre_validate (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ glusterd_handle_pre_validate_fn);
+}
+
+static int
+glusterd_handle_brick_op (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ glusterd_handle_brick_op_fn);
+}
+
+static int
+glusterd_handle_commit (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ glusterd_handle_commit_fn);
+}
+
+static int
+glusterd_handle_post_validate (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ glusterd_handle_post_validate_fn);
+}
+
+int
+glusterd_handle_mgmt_v3_unlock (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ glusterd_handle_mgmt_v3_unlock_fn);
+}
+
+rpcsvc_actor_t gd_svc_mgmt_v3_actors[] = {
+ [GLUSTERD_MGMT_V3_NULL] = { "NULL", GLUSTERD_MGMT_V3_NULL, glusterd_mgmt_v3_null, NULL, 0, DRC_NA},
+ [GLUSTERD_MGMT_V3_LOCK] = { "MGMT_V3_LOCK", GLUSTERD_MGMT_V3_LOCK, glusterd_handle_mgmt_v3_lock, NULL, 0, DRC_NA},
+ [GLUSTERD_MGMT_V3_PRE_VALIDATE] = { "PRE_VAL", GLUSTERD_MGMT_V3_PRE_VALIDATE, glusterd_handle_pre_validate, NULL, 0, DRC_NA},
+ [GLUSTERD_MGMT_V3_BRICK_OP] = { "BRCK_OP", GLUSTERD_MGMT_V3_BRICK_OP, glusterd_handle_brick_op, NULL, 0, DRC_NA},
+ [GLUSTERD_MGMT_V3_COMMIT] = { "COMMIT", GLUSTERD_MGMT_V3_COMMIT, glusterd_handle_commit, NULL, 0, DRC_NA},
+ [GLUSTERD_MGMT_V3_POST_VALIDATE] = { "POST_VAL", GLUSTERD_MGMT_V3_POST_VALIDATE, glusterd_handle_post_validate, NULL, 0, DRC_NA},
+ [GLUSTERD_MGMT_V3_UNLOCK] = { "MGMT_V3_UNLOCK", GLUSTERD_MGMT_V3_UNLOCK, glusterd_handle_mgmt_v3_unlock, NULL, 0, DRC_NA},
+};
+
+struct rpcsvc_program gd_svc_mgmt_v3_prog = {
+ .progname = "GlusterD svc mgmt v3",
+ .prognum = GD_MGMT_PROGRAM,
+ .progver = GD_MGMT_V3_VERSION,
+ .numactors = GLUSTERD_MGMT_V3_MAXVALUE,
+ .actors = gd_svc_mgmt_v3_actors,
+ .synctask = _gf_true,
+};
diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt.c b/xlators/mgmt/glusterd/src/glusterd-mgmt.c
new file mode 100644
index 000000000..5295f889e
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-mgmt.c
@@ -0,0 +1,1899 @@
+/*
+ Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+/* rpc related syncops */
+#include "rpc-clnt.h"
+#include "protocol-common.h"
+#include "xdr-generic.h"
+#include "glusterd1-xdr.h"
+#include "glusterd-syncop.h"
+
+#include "glusterd.h"
+#include "glusterd-utils.h"
+#include "glusterd-locks.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-op-sm.h"
+
+extern struct rpc_clnt_program gd_mgmt_v3_prog;
+
+
+static void
+gd_mgmt_v3_collate_errors (struct syncargs *args, int op_ret, int op_errno,
+ char *op_errstr, int op_code,
+ glusterd_peerinfo_t *peerinfo, u_char *uuid)
+{
+ char *peer_str = NULL;
+ char err_str[PATH_MAX] = "Please check log file for details.";
+ char op_err[PATH_MAX] = "";
+ int32_t len = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (args);
+ GF_ASSERT (uuid);
+
+ if (op_ret) {
+ args->op_ret = op_ret;
+ args->op_errno = op_errno;
+
+ if (peerinfo)
+ peer_str = peerinfo->hostname;
+ else
+ peer_str = uuid_utoa (uuid);
+
+ if (op_errstr && strcmp (op_errstr, "")) {
+ len = snprintf (err_str, sizeof(err_str) - 1,
+ "Error: %s", op_errstr);
+ err_str[len] = '\0';
+ }
+
+ switch (op_code) {
+ case GLUSTERD_MGMT_V3_LOCK:
+ {
+ len = snprintf (op_err, sizeof(op_err) - 1,
+ "Locking failed "
+ "on %s. %s", peer_str, err_str);
+ break;
+ }
+ case GLUSTERD_MGMT_V3_PRE_VALIDATE:
+ {
+ len = snprintf (op_err, sizeof(op_err) - 1,
+ "Pre Validation failed "
+ "on %s. %s", peer_str, err_str);
+ break;
+ }
+ case GLUSTERD_MGMT_V3_BRICK_OP:
+ {
+ len = snprintf (op_err, sizeof(op_err) - 1,
+ "Brick ops failed "
+ "on %s. %s", peer_str, err_str);
+ break;
+ }
+ case GLUSTERD_MGMT_V3_COMMIT:
+ {
+ len = snprintf (op_err, sizeof(op_err) - 1,
+ "Commit failed on %s. %s",
+ peer_str, err_str);
+ break;
+ }
+ case GLUSTERD_MGMT_V3_POST_VALIDATE:
+ {
+ len = snprintf (op_err, sizeof(op_err) - 1,
+ "Post Validation failed "
+ "on %s. %s", peer_str, err_str);
+ break;
+ }
+ case GLUSTERD_MGMT_V3_UNLOCK:
+ {
+ len = snprintf (op_err, sizeof(op_err) - 1,
+ "Unlocking failed "
+ "on %s. %s", peer_str, err_str);
+ break;
+ }
+ }
+ op_err[len] = '\0';
+
+ if (args->errstr) {
+ len = snprintf (err_str, sizeof(err_str) - 1,
+ "%s\n%s", args->errstr,
+ op_err);
+ GF_FREE (args->errstr);
+ args->errstr = NULL;
+ } else
+ len = snprintf (err_str, sizeof(err_str) - 1,
+ "%s", op_err);
+ err_str[len] = '\0';
+
+ gf_log (this->name, GF_LOG_ERROR, "%s", op_err);
+ args->errstr = gf_strdup (err_str);
+ }
+
+ return;
+}
+
+int32_t
+gd_mgmt_v3_pre_validate_fn (glusterd_op_t op, dict_t *dict,
+ char **op_errstr, dict_t *rsp_dict)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (rsp_dict);
+
+ switch (op) {
+ case GD_OP_SNAP:
+ ret = glusterd_snapshot_prevalidate (dict, op_errstr,
+ rsp_dict);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Snapshot Prevalidate Failed");
+ goto out;
+ }
+
+ break;
+
+ default:
+ break;
+ }
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "OP = %d. Returning %d", op, ret);
+ return ret;
+}
+
+int32_t
+gd_mgmt_v3_brick_op_fn (glusterd_op_t op, dict_t *dict,
+ char **op_errstr, dict_t *rsp_dict)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (rsp_dict);
+
+ switch (op) {
+ case GD_OP_SNAP:
+ {
+ ret = glusterd_snapshot_brickop (dict, op_errstr, rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "snapshot brickop "
+ "failed");
+ goto out;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "OP = %d. Returning %d", op, ret);
+ return ret;
+}
+
+int32_t
+gd_mgmt_v3_commit_fn (glusterd_op_t op, dict_t *dict,
+ char **op_errstr, dict_t *rsp_dict)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (rsp_dict);
+
+ switch (op) {
+ case GD_OP_SNAP:
+ {
+ ret = glusterd_snapshot (dict, op_errstr, rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Snapshot Commit Failed");
+ goto out;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "OP = %d. Returning %d", op, ret);
+ return ret;
+}
+
+int32_t
+gd_mgmt_v3_post_validate_fn (glusterd_op_t op, int32_t op_ret, dict_t *dict,
+ char **op_errstr, dict_t *rsp_dict)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (rsp_dict);
+
+ switch (op) {
+ case GD_OP_SNAP:
+ {
+ ret = glusterd_snapshot_postvalidate (dict, op_ret,
+ op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "postvalidate operation failed");
+ goto out;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ ret = 0;
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "OP = %d. Returning %d", op, ret);
+ return ret;
+}
+
+int32_t
+gd_mgmt_v3_lock_cbk_fn (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int32_t ret = -1;
+ struct syncargs *args = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ gd1_mgmt_v3_lock_rsp rsp = {{0},};
+ call_frame_t *frame = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ GF_ASSERT (myframe);
+
+ /* Even though the lock command has failed, while collating the errors
+ (gd_mgmt_v3_collate_errors), args->op_ret and args->op_errno will be
+ used. @args is obtained from frame->local. So before checking the
+ status of the request and going out if its a failure, args should be
+ set to frame->local. Otherwise, while collating args will be NULL.
+ This applies to other phases such as prevalidate, brickop, commit and
+ postvalidate also.
+ */
+ frame = myframe;
+ args = frame->local;
+ peerinfo = frame->cookie;
+ frame->local = NULL;
+ frame->cookie = NULL;
+
+ if (-1 == req->rpc_status) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ if (!iov) {
+ gf_log (this->name, GF_LOG_ERROR, "iov is NULL");
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp);
+ if (ret < 0)
+ goto out;
+
+ uuid_copy (args->uuid, rsp.uuid);
+
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+
+out:
+ gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+ GLUSTERD_MGMT_V3_LOCK,
+ peerinfo, rsp.uuid);
+ if (rsp.dict.dict_val)
+ free (rsp.dict.dict_val);
+ STACK_DESTROY (frame->root);
+ synctask_barrier_wake(args);
+ return 0;
+}
+
+int32_t
+gd_mgmt_v3_lock_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ gd_mgmt_v3_lock_cbk_fn);
+}
+
+int
+gd_mgmt_v3_lock (glusterd_op_t op, dict_t *op_ctx,
+ glusterd_peerinfo_t *peerinfo,
+ struct syncargs *args, uuid_t my_uuid,
+ uuid_t recv_uuid)
+{
+ gd1_mgmt_v3_lock_req req = {{0},};
+ glusterd_conf_t *conf = THIS->private;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (op_ctx);
+ GF_ASSERT (peerinfo);
+ GF_ASSERT (args);
+
+ ret = dict_allocate_and_serialize (op_ctx,
+ &req.dict.dict_val,
+ &req.dict.dict_len);
+ if (ret)
+ goto out;
+
+ uuid_copy (req.uuid, my_uuid);
+ req.op = op;
+ synclock_unlock (&conf->big_lock);
+ ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo,
+ &gd_mgmt_v3_prog,
+ GLUSTERD_MGMT_V3_LOCK,
+ gd_mgmt_v3_lock_cbk,
+ (xdrproc_t) xdr_gd1_mgmt_v3_lock_req);
+ synclock_lock (&conf->big_lock);
+out:
+ GF_FREE (req.dict.dict_val);
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_mgmt_v3_initiate_lockdown (glusterd_conf_t *conf, glusterd_op_t op,
+ dict_t *dict, char **op_errstr, int npeers,
+ gf_boolean_t *is_acquired)
+{
+ char *volname = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ int32_t ret = -1;
+ int32_t peer_cnt = 0;
+ struct syncargs args = {0};
+ struct list_head *peers = NULL;
+ uuid_t peer_uuid = {0};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (conf);
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (is_acquired);
+
+ peers = &conf->xaction_peers;
+
+ /* Trying to acquire multiple mgmt_v3 locks on local node */
+ ret = glusterd_multiple_mgmt_v3_lock (dict, MY_UUID);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to acquire mgmt_v3 locks on localhost");
+ goto out;
+ }
+
+ *is_acquired = _gf_true;
+
+ if (!npeers) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Sending mgmt_v3 lock req to other nodes in the cluster */
+ gd_syncargs_init (&args, NULL);
+ synctask_barrier_init((&args));
+ peer_cnt = 0;
+ list_for_each_entry (peerinfo, peers, op_peers_list) {
+ gd_mgmt_v3_lock (op, dict, peerinfo, &args,
+ MY_UUID, peer_uuid);
+ peer_cnt++;
+ }
+ gd_synctask_barrier_wait((&args), peer_cnt);
+
+ if (args.errstr)
+ *op_errstr = gf_strdup (args.errstr);
+
+ ret = args.op_ret;
+
+ gf_log (this->name, GF_LOG_DEBUG, "Sent lock op req for %s "
+ "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret);
+out:
+ if (ret) {
+ if (*op_errstr)
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ *op_errstr);
+
+ if (volname)
+ ret = gf_asprintf (op_errstr,
+ "Another transaction is in progress "
+ "for %s. Please try again after "
+ "sometime.", volname);
+ else
+ ret = gf_asprintf (op_errstr,
+ "Another transaction is in progress "
+ "Please try again after sometime.");
+
+ if (ret == -1)
+ *op_errstr = NULL;
+
+ ret = -1;
+ }
+
+ return ret;
+}
+
+int
+glusterd_pre_validate_aggr_rsp_dict (glusterd_op_t op,
+ dict_t *aggr, dict_t *rsp)
+{
+ int32_t ret = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (aggr);
+ GF_ASSERT (rsp);
+
+ switch (op) {
+ case GD_OP_SNAP:
+ ret = glusterd_snap_pre_validate_use_rsp_dict (aggr, rsp);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to aggregate prevalidate "
+ "response dictionaries.");
+ goto out;
+ }
+ break;
+ default:
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "Invalid op (%s)",
+ gd_op_list[op]);
+
+ break;
+ }
+out:
+ return ret;
+}
+
+int32_t
+gd_mgmt_v3_pre_validate_cbk_fn (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int32_t ret = -1;
+ struct syncargs *args = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ gd1_mgmt_v3_pre_val_rsp rsp = {{0},};
+ call_frame_t *frame = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = -1;
+ dict_t *rsp_dict = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ GF_ASSERT (myframe);
+
+ frame = myframe;
+ args = frame->local;
+ peerinfo = frame->cookie;
+ frame->local = NULL;
+ frame->cookie = NULL;
+
+ if (-1 == req->rpc_status) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ if (!iov) {
+ gf_log (this->name, GF_LOG_ERROR, "iov is NULL");
+ op_errno = EINVAL;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_v3_pre_val_rsp);
+ if (ret < 0)
+ goto out;
+
+ if (rsp.dict.dict_len) {
+ /* Unserialize the dictionary */
+ rsp_dict = dict_new ();
+
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &rsp_dict);
+ if (ret < 0) {
+ free (rsp.dict.dict_val);
+ goto out;
+ } else {
+ rsp_dict->extra_stdfree = rsp.dict.dict_val;
+ }
+ }
+
+ uuid_copy (args->uuid, rsp.uuid);
+ pthread_mutex_lock (&args->lock_dict);
+ {
+ ret = glusterd_pre_validate_aggr_rsp_dict (rsp.op, args->dict,
+ rsp_dict);
+ }
+ pthread_mutex_unlock (&args->lock_dict);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ "Failed to aggregate response from "
+ " node/brick");
+ if (!rsp.op_ret)
+ op_ret = ret;
+ else {
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+ }
+ } else {
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+ }
+
+out:
+ if (rsp_dict)
+ dict_unref (rsp_dict);
+
+ gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+ GLUSTERD_MGMT_V3_PRE_VALIDATE,
+ peerinfo, rsp.uuid);
+
+ if (rsp.op_errstr)
+ free (rsp.op_errstr);
+
+ STACK_DESTROY (frame->root);
+ synctask_barrier_wake(args);
+ return 0;
+}
+
+int32_t
+gd_mgmt_v3_pre_validate_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ gd_mgmt_v3_pre_validate_cbk_fn);
+}
+
+int
+gd_mgmt_v3_pre_validate_req (glusterd_op_t op, dict_t *op_ctx,
+ glusterd_peerinfo_t *peerinfo,
+ struct syncargs *args, uuid_t my_uuid,
+ uuid_t recv_uuid)
+{
+ int32_t ret = -1;
+ gd1_mgmt_v3_pre_val_req req = {{0},};
+ glusterd_conf_t *conf = THIS->private;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (op_ctx);
+ GF_ASSERT (peerinfo);
+ GF_ASSERT (args);
+
+ ret = dict_allocate_and_serialize (op_ctx,
+ &req.dict.dict_val,
+ &req.dict.dict_len);
+ if (ret)
+ goto out;
+
+ uuid_copy (req.uuid, my_uuid);
+ req.op = op;
+ synclock_unlock (&conf->big_lock);
+ ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo,
+ &gd_mgmt_v3_prog,
+ GLUSTERD_MGMT_V3_PRE_VALIDATE,
+ gd_mgmt_v3_pre_validate_cbk,
+ (xdrproc_t) xdr_gd1_mgmt_v3_pre_val_req);
+ synclock_lock (&conf->big_lock);
+out:
+ GF_FREE (req.dict.dict_val);
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_mgmt_v3_pre_validate (glusterd_conf_t *conf, glusterd_op_t op,
+ dict_t *req_dict, char **op_errstr, int npeers)
+{
+ int32_t ret = -1;
+ int32_t peer_cnt = 0;
+ dict_t *rsp_dict = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ struct syncargs args = {0};
+ struct list_head *peers = NULL;
+ uuid_t peer_uuid = {0};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (conf);
+ GF_ASSERT (req_dict);
+ GF_ASSERT (op_errstr);
+
+ peers = &conf->xaction_peers;
+
+ rsp_dict = dict_new ();
+ if (!rsp_dict) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to create response dictionary");
+ goto out;
+ }
+
+ /* Pre Validation on local node */
+ ret = gd_mgmt_v3_pre_validate_fn (op, req_dict, op_errstr,
+ rsp_dict);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Pre Validation failed for "
+ "operation %s on local node",
+ gd_op_list[op]);
+
+ if (*op_errstr == NULL) {
+ ret = gf_asprintf (op_errstr,
+ "Pre-validation failed "
+ "on localhost. Please "
+ "check log file for details");
+ if (ret == -1)
+ *op_errstr = NULL;
+
+ ret = -1;
+ }
+ goto out;
+ }
+
+ ret = glusterd_pre_validate_aggr_rsp_dict (op, req_dict,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ "Failed to aggregate response from "
+ " node/brick");
+ goto out;
+ }
+
+ dict_unref (rsp_dict);
+ rsp_dict = NULL;
+
+ if (!npeers) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Sending Pre Validation req to other nodes in the cluster */
+ gd_syncargs_init (&args, req_dict);
+ synctask_barrier_init((&args));
+ peer_cnt = 0;
+ list_for_each_entry (peerinfo, peers, op_peers_list) {
+ gd_mgmt_v3_pre_validate_req (op, req_dict, peerinfo, &args,
+ MY_UUID, peer_uuid);
+ peer_cnt++;
+ }
+ gd_synctask_barrier_wait((&args), peer_cnt);
+
+ if (args.op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Pre Validation failed on peers");
+
+ if (args.errstr)
+ *op_errstr = gf_strdup (args.errstr);
+ }
+
+ ret = args.op_ret;
+
+ gf_log (this->name, GF_LOG_DEBUG, "Sent pre valaidation req for %s "
+ "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret);
+out:
+ return ret;
+}
+
+int
+glusterd_mgmt_v3_build_payload (dict_t **req, char **op_errstr, dict_t *dict,
+ glusterd_op_t op)
+{
+ int32_t ret = -1;
+ dict_t *req_dict = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (dict);
+
+ req_dict = dict_new ();
+ if (!req_dict)
+ goto out;
+
+ switch (op) {
+ case GD_OP_SNAP:
+ dict_copy (dict, req_dict);
+ break;
+ default:
+ break;
+ }
+
+ *req = req_dict;
+ ret = 0;
+out:
+ return ret;
+}
+
+int32_t
+gd_mgmt_v3_brick_op_cbk_fn (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int32_t ret = -1;
+ struct syncargs *args = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ gd1_mgmt_v3_brick_op_rsp rsp = {{0},};
+ call_frame_t *frame = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ GF_ASSERT (myframe);
+
+ frame = myframe;
+ args = frame->local;
+ peerinfo = frame->cookie;
+ frame->local = NULL;
+ frame->cookie = NULL;
+
+ /* If the operation failed, then iov can be NULL. So better check the
+ status of the operation and then worry about iov (if the status of
+ the command is success)
+ */
+ if (-1 == req->rpc_status) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ if (!iov) {
+ gf_log (this->name, GF_LOG_ERROR, "iov is NULL");
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_v3_brick_op_rsp);
+ if (ret < 0)
+ goto out;
+
+ uuid_copy (args->uuid, rsp.uuid);
+
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+
+out:
+ gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+ GLUSTERD_MGMT_V3_BRICK_OP,
+ peerinfo, rsp.uuid);
+
+ if (rsp.op_errstr)
+ free (rsp.op_errstr);
+
+ if (rsp.dict.dict_val)
+ free (rsp.dict.dict_val);
+
+ STACK_DESTROY (frame->root);
+ synctask_barrier_wake(args);
+ return 0;
+}
+
+int32_t
+gd_mgmt_v3_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ gd_mgmt_v3_brick_op_cbk_fn);
+}
+
+int
+gd_mgmt_v3_brick_op_req (glusterd_op_t op, dict_t *op_ctx,
+ glusterd_peerinfo_t *peerinfo,
+ struct syncargs *args, uuid_t my_uuid,
+ uuid_t recv_uuid)
+{
+ int32_t ret = -1;
+ gd1_mgmt_v3_brick_op_req req = {{0},};
+ glusterd_conf_t *conf = THIS->private;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (op_ctx);
+ GF_ASSERT (peerinfo);
+ GF_ASSERT (args);
+
+ ret = dict_allocate_and_serialize (op_ctx,
+ &req.dict.dict_val,
+ &req.dict.dict_len);
+ if (ret)
+ goto out;
+
+ uuid_copy (req.uuid, my_uuid);
+ req.op = op;
+ synclock_unlock (&conf->big_lock);
+ ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo,
+ &gd_mgmt_v3_prog,
+ GLUSTERD_MGMT_V3_BRICK_OP,
+ gd_mgmt_v3_brick_op_cbk,
+ (xdrproc_t) xdr_gd1_mgmt_v3_brick_op_req);
+ synclock_lock (&conf->big_lock);
+out:
+ GF_FREE (req.dict.dict_val);
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_mgmt_v3_brick_op (glusterd_conf_t *conf, glusterd_op_t op,
+ dict_t *req_dict, char **op_errstr, int npeers)
+{
+ int32_t ret = -1;
+ int32_t peer_cnt = 0;
+ dict_t *rsp_dict = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ struct syncargs args = {0};
+ struct list_head *peers = NULL;
+ uuid_t peer_uuid = {0};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (conf);
+ GF_ASSERT (req_dict);
+ GF_ASSERT (op_errstr);
+
+ peers = &conf->xaction_peers;
+
+ rsp_dict = dict_new ();
+ if (!rsp_dict) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to create response dictionary");
+ goto out;
+ }
+
+ /* Perform brick op on local node */
+ ret = gd_mgmt_v3_brick_op_fn (op, req_dict, op_errstr,
+ rsp_dict);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Brick ops failed for "
+ "operation %s on local node",
+ gd_op_list[op]);
+
+ if (*op_errstr == NULL) {
+ ret = gf_asprintf (op_errstr,
+ "Brick ops failed "
+ "on localhost. Please "
+ "check log file for details");
+ if (ret == -1)
+ *op_errstr = NULL;
+
+ ret = -1;
+ }
+ goto out;
+ }
+
+ dict_unref (rsp_dict);
+ rsp_dict = NULL;
+
+ if (!npeers) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Sending brick op req to other nodes in the cluster */
+ gd_syncargs_init (&args, NULL);
+ synctask_barrier_init((&args));
+ peer_cnt = 0;
+ list_for_each_entry (peerinfo, peers, op_peers_list) {
+ gd_mgmt_v3_brick_op_req (op, req_dict, peerinfo, &args,
+ MY_UUID, peer_uuid);
+ peer_cnt++;
+ }
+ gd_synctask_barrier_wait((&args), peer_cnt);
+
+ if (args.op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Brick ops failed on peers");
+
+ if (args.errstr)
+ *op_errstr = gf_strdup (args.errstr);
+ }
+
+ ret = args.op_ret;
+
+ gf_log (this->name, GF_LOG_DEBUG, "Sent brick op req for %s "
+ "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret);
+out:
+ return ret;
+}
+
+int32_t
+gd_mgmt_v3_commit_cbk_fn (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int32_t ret = -1;
+ struct syncargs *args = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ gd1_mgmt_v3_commit_rsp rsp = {{0},};
+ call_frame_t *frame = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = -1;
+ dict_t *rsp_dict = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ GF_ASSERT (myframe);
+
+ frame = myframe;
+ args = frame->local;
+ peerinfo = frame->cookie;
+ frame->local = NULL;
+ frame->cookie = NULL;
+
+ if (-1 == req->rpc_status) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ if (!iov) {
+ gf_log (this->name, GF_LOG_ERROR, "iov is NULL");
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_v3_commit_rsp);
+ if (ret < 0)
+ goto out;
+
+ if (rsp.dict.dict_len) {
+ /* Unserialize the dictionary */
+ rsp_dict = dict_new ();
+
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &rsp_dict);
+ if (ret < 0) {
+ free (rsp.dict.dict_val);
+ goto out;
+ } else {
+ rsp_dict->extra_stdfree = rsp.dict.dict_val;
+ }
+ }
+
+ uuid_copy (args->uuid, rsp.uuid);
+ pthread_mutex_lock (&args->lock_dict);
+ {
+ ret = glusterd_syncop_aggr_rsp_dict (rsp.op, args->dict,
+ rsp_dict);
+ }
+ pthread_mutex_unlock (&args->lock_dict);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ "Failed to aggregate response from "
+ " node/brick");
+ if (!rsp.op_ret)
+ op_ret = ret;
+ else {
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+ }
+ } else {
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+ }
+
+out:
+ if (rsp_dict)
+ dict_unref (rsp_dict);
+
+ gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+ GLUSTERD_MGMT_V3_COMMIT,
+ peerinfo, rsp.uuid);
+
+ STACK_DESTROY (frame->root);
+ synctask_barrier_wake(args);
+ return 0;
+}
+
+int32_t
+gd_mgmt_v3_commit_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ gd_mgmt_v3_commit_cbk_fn);
+}
+
+int
+gd_mgmt_v3_commit_req (glusterd_op_t op, dict_t *op_ctx,
+ glusterd_peerinfo_t *peerinfo,
+ struct syncargs *args, uuid_t my_uuid,
+ uuid_t recv_uuid)
+{
+ int32_t ret = -1;
+ gd1_mgmt_v3_commit_req req = {{0},};
+ glusterd_conf_t *conf = THIS->private;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (op_ctx);
+ GF_ASSERT (peerinfo);
+ GF_ASSERT (args);
+
+ ret = dict_allocate_and_serialize (op_ctx,
+ &req.dict.dict_val,
+ &req.dict.dict_len);
+ if (ret)
+ goto out;
+
+ uuid_copy (req.uuid, my_uuid);
+ req.op = op;
+ synclock_unlock (&conf->big_lock);
+ ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo,
+ &gd_mgmt_v3_prog,
+ GLUSTERD_MGMT_V3_COMMIT,
+ gd_mgmt_v3_commit_cbk,
+ (xdrproc_t) xdr_gd1_mgmt_v3_commit_req);
+ synclock_lock (&conf->big_lock);
+out:
+ GF_FREE (req.dict.dict_val);
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_mgmt_v3_commit (glusterd_conf_t *conf, glusterd_op_t op,
+ dict_t *op_ctx, dict_t *req_dict,
+ char **op_errstr, int npeers)
+{
+ int32_t ret = -1;
+ int32_t peer_cnt = 0;
+ dict_t *rsp_dict = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ struct syncargs args = {0};
+ struct list_head *peers = NULL;
+ uuid_t peer_uuid = {0};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (conf);
+ GF_ASSERT (op_ctx);
+ GF_ASSERT (req_dict);
+ GF_ASSERT (op_errstr);
+
+ peers = &conf->xaction_peers;
+
+ rsp_dict = dict_new ();
+ if (!rsp_dict) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to create response dictionary");
+ goto out;
+ }
+
+ /* Commit on local node */
+ ret = gd_mgmt_v3_commit_fn (op, req_dict, op_errstr,
+ rsp_dict);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Commit failed for "
+ "operation %s on local node",
+ gd_op_list[op]);
+
+ if (*op_errstr == NULL) {
+ ret = gf_asprintf (op_errstr,
+ "Commit failed "
+ "on localhost. Please "
+ "check log file for details.");
+ if (ret == -1)
+ *op_errstr = NULL;
+
+ ret = -1;
+ }
+ goto out;
+ }
+
+ ret = glusterd_syncop_aggr_rsp_dict (op, op_ctx,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ "Failed to aggregate response from "
+ " node/brick");
+ goto out;
+ }
+
+ dict_unref (rsp_dict);
+ rsp_dict = NULL;
+
+ if (!npeers) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Sending commit req to other nodes in the cluster */
+ gd_syncargs_init (&args, op_ctx);
+ synctask_barrier_init((&args));
+ peer_cnt = 0;
+ list_for_each_entry (peerinfo, peers, op_peers_list) {
+ gd_mgmt_v3_commit_req (op, req_dict, peerinfo, &args,
+ MY_UUID, peer_uuid);
+ peer_cnt++;
+ }
+ gd_synctask_barrier_wait((&args), peer_cnt);
+
+ if (args.op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Commit failed on peers");
+
+ if (args.errstr)
+ *op_errstr = gf_strdup (args.errstr);
+ }
+
+ ret = args.op_ret;
+
+ gf_log (this->name, GF_LOG_DEBUG, "Sent commit req for %s to %d "
+ "peers. Returning %d", gd_op_list[op], peer_cnt, ret);
+out:
+ return ret;
+}
+
+int32_t
+gd_mgmt_v3_post_validate_cbk_fn (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int32_t ret = -1;
+ struct syncargs *args = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ gd1_mgmt_v3_post_val_rsp rsp = {{0},};
+ call_frame_t *frame = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ GF_ASSERT (myframe);
+
+ frame = myframe;
+ args = frame->local;
+ peerinfo = frame->cookie;
+ frame->local = NULL;
+ frame->cookie = NULL;
+
+ if (-1 == req->rpc_status) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ if (!iov) {
+ gf_log (this->name, GF_LOG_ERROR, "iov is NULL");
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_v3_post_val_rsp);
+ if (ret < 0)
+ goto out;
+
+ uuid_copy (args->uuid, rsp.uuid);
+
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+
+out:
+ gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+ GLUSTERD_MGMT_V3_POST_VALIDATE,
+ peerinfo, rsp.uuid);
+ if (rsp.op_errstr)
+ free (rsp.op_errstr);
+
+ if (rsp.dict.dict_val)
+ free (rsp.dict.dict_val);
+ STACK_DESTROY (frame->root);
+ synctask_barrier_wake(args);
+ return 0;
+}
+
+int32_t
+gd_mgmt_v3_post_validate_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ gd_mgmt_v3_post_validate_cbk_fn);
+}
+
+int
+gd_mgmt_v3_post_validate_req (glusterd_op_t op, int32_t op_ret, dict_t *op_ctx,
+ glusterd_peerinfo_t *peerinfo,
+ struct syncargs *args, uuid_t my_uuid,
+ uuid_t recv_uuid)
+{
+ int32_t ret = -1;
+ gd1_mgmt_v3_post_val_req req = {{0},};
+ glusterd_conf_t *conf = THIS->private;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (op_ctx);
+ GF_ASSERT (peerinfo);
+ GF_ASSERT (args);
+
+ ret = dict_allocate_and_serialize (op_ctx,
+ &req.dict.dict_val,
+ &req.dict.dict_len);
+ if (ret)
+ goto out;
+
+ uuid_copy (req.uuid, my_uuid);
+ req.op = op;
+ req.op_ret = op_ret;
+ synclock_unlock (&conf->big_lock);
+ ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo,
+ &gd_mgmt_v3_prog,
+ GLUSTERD_MGMT_V3_POST_VALIDATE,
+ gd_mgmt_v3_post_validate_cbk,
+ (xdrproc_t) xdr_gd1_mgmt_v3_post_val_req);
+ synclock_lock (&conf->big_lock);
+out:
+ GF_FREE (req.dict.dict_val);
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_mgmt_v3_post_validate (glusterd_conf_t *conf, glusterd_op_t op,
+ int32_t op_ret, dict_t *dict, dict_t *req_dict,
+ char **op_errstr, int npeers)
+{
+ int32_t ret = -1;
+ int32_t peer_cnt = 0;
+ dict_t *rsp_dict = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ struct syncargs args = {0};
+ struct list_head *peers = NULL;
+ uuid_t peer_uuid = {0};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (conf);
+ GF_ASSERT (dict);
+ GF_ASSERT (req_dict);
+ GF_ASSERT (op_errstr);
+
+ peers = &conf->xaction_peers;
+ GF_ASSERT (peers);
+
+ rsp_dict = dict_new ();
+ if (!rsp_dict) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to create response dictionary");
+ goto out;
+ }
+
+ /* Copy the contents of dict like missed snaps info to req_dict */
+ dict_copy (dict, req_dict);
+
+ /* Post Validation on local node */
+ ret = gd_mgmt_v3_post_validate_fn (op, op_ret, req_dict, op_errstr,
+ rsp_dict);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Post Validation failed for "
+ "operation %s on local node",
+ gd_op_list[op]);
+
+ if (*op_errstr == NULL) {
+ ret = gf_asprintf (op_errstr,
+ "Post-validation failed "
+ "on localhost. Please check "
+ "log file for details");
+ if (ret == -1)
+ *op_errstr = NULL;
+
+ ret = -1;
+ }
+ goto out;
+ }
+
+ dict_unref (rsp_dict);
+ rsp_dict = NULL;
+
+ if (!npeers) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Sending Post Validation req to other nodes in the cluster */
+ gd_syncargs_init (&args, req_dict);
+ synctask_barrier_init((&args));
+ peer_cnt = 0;
+ list_for_each_entry (peerinfo, peers, op_peers_list) {
+ gd_mgmt_v3_post_validate_req (op, op_ret, req_dict, peerinfo,
+ &args, MY_UUID, peer_uuid);
+ peer_cnt++;
+ }
+ gd_synctask_barrier_wait((&args), peer_cnt);
+
+ if (args.op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Post Validation failed on peers");
+
+ if (args.errstr)
+ *op_errstr = gf_strdup (args.errstr);
+ }
+
+ ret = args.op_ret;
+
+ gf_log (this->name, GF_LOG_DEBUG, "Sent post valaidation req for %s "
+ "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret);
+out:
+ return ret;
+}
+
+int32_t
+gd_mgmt_v3_unlock_cbk_fn (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int32_t ret = -1;
+ struct syncargs *args = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ gd1_mgmt_v3_unlock_rsp rsp = {{0},};
+ call_frame_t *frame = NULL;
+ int32_t op_ret = -1;
+ int32_t op_errno = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ GF_ASSERT (myframe);
+
+ frame = myframe;
+ args = frame->local;
+ peerinfo = frame->cookie;
+ frame->local = NULL;
+ frame->cookie = NULL;
+
+ if (-1 == req->rpc_status) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ if (!iov) {
+ gf_log (this->name, GF_LOG_ERROR, "iov is NULL");
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp);
+ if (ret < 0)
+ goto out;
+
+ uuid_copy (args->uuid, rsp.uuid);
+
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+
+out:
+ gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+ GLUSTERD_MGMT_V3_UNLOCK,
+ peerinfo, rsp.uuid);
+ if (rsp.dict.dict_val)
+ free (rsp.dict.dict_val);
+ STACK_DESTROY (frame->root);
+ synctask_barrier_wake(args);
+ return 0;
+}
+
+int32_t
+gd_mgmt_v3_unlock_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ gd_mgmt_v3_unlock_cbk_fn);
+}
+
+int
+gd_mgmt_v3_unlock (glusterd_op_t op, dict_t *op_ctx,
+ glusterd_peerinfo_t *peerinfo,
+ struct syncargs *args, uuid_t my_uuid,
+ uuid_t recv_uuid)
+{
+ int32_t ret = -1;
+ gd1_mgmt_v3_unlock_req req = {{0},};
+ glusterd_conf_t *conf = THIS->private;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (op_ctx);
+ GF_ASSERT (peerinfo);
+ GF_ASSERT (args);
+
+ ret = dict_allocate_and_serialize (op_ctx,
+ &req.dict.dict_val,
+ &req.dict.dict_len);
+ if (ret)
+ goto out;
+
+ uuid_copy (req.uuid, my_uuid);
+ req.op = op;
+ synclock_unlock (&conf->big_lock);
+ ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo,
+ &gd_mgmt_v3_prog,
+ GLUSTERD_MGMT_V3_UNLOCK,
+ gd_mgmt_v3_unlock_cbk,
+ (xdrproc_t) xdr_gd1_mgmt_v3_unlock_req);
+ synclock_lock (&conf->big_lock);
+out:
+ GF_FREE (req.dict.dict_val);
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_mgmt_v3_release_peer_locks (glusterd_conf_t *conf, glusterd_op_t op,
+ dict_t *dict, int32_t op_ret,
+ char **op_errstr, int npeers,
+ gf_boolean_t is_acquired)
+{
+ int32_t ret = -1;
+ int32_t peer_cnt = 0;
+ uuid_t peer_uuid = {0};
+ xlator_t *this = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ struct syncargs args = {0};
+ struct list_head *peers = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (conf);
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+
+ peers = &conf->xaction_peers;
+
+ /* If the lock has not been held during this
+ * transaction, do not send unlock requests */
+ if (!is_acquired)
+ goto out;
+
+ if (!npeers) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Sending mgmt_v3 unlock req to other nodes in the cluster */
+ gd_syncargs_init (&args, NULL);
+ synctask_barrier_init((&args));
+ peer_cnt = 0;
+ list_for_each_entry (peerinfo, peers, op_peers_list) {
+ gd_mgmt_v3_unlock (op, dict, peerinfo, &args,
+ MY_UUID, peer_uuid);
+ peer_cnt++;
+ }
+ gd_synctask_barrier_wait((&args), peer_cnt);
+
+ if (args.op_ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unlock failed on peers");
+
+ if (!op_ret && args.errstr)
+ *op_errstr = gf_strdup (args.errstr);
+ }
+
+ ret = args.op_ret;
+
+ gf_log (this->name, GF_LOG_DEBUG, "Sent unlock op req for %s "
+ "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret);
+
+out:
+ return ret;
+}
+
+int32_t
+glusterd_mgmt_v3_initiate_all_phases (rpcsvc_request_t *req, glusterd_op_t op,
+ dict_t *dict)
+{
+ int32_t ret = -1;
+ int32_t op_ret = -1;
+ int32_t npeers = 0;
+ dict_t *req_dict = NULL;
+ dict_t *tmp_dict = NULL;
+ glusterd_conf_t *conf = NULL;
+ char *op_errstr = NULL;
+ xlator_t *this = NULL;
+ gf_boolean_t is_acquired = _gf_false;
+ uuid_t *originator_uuid = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ GF_ASSERT (dict);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ /* Save the MY_UUID as the originator_uuid. This originator_uuid
+ * will be used by is_origin_glusterd() to determine if a node
+ * is the originator node for a command. */
+ originator_uuid = GF_CALLOC (1, sizeof(uuid_t),
+ gf_common_mt_uuid_t);
+ if (!originator_uuid) {
+ ret = -1;
+ goto out;
+ }
+
+ uuid_copy (*originator_uuid, MY_UUID);
+ ret = dict_set_bin (dict, "originator_uuid",
+ originator_uuid, sizeof (uuid_t));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set originator_uuid.");
+ goto out;
+ }
+
+ /* Marking the operation as complete synctasked */
+ ret = dict_set_int32 (dict, "is_synctasked", _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set synctasked flag.");
+ goto out;
+ }
+
+ /* Use a copy at local unlock as cli response will be sent before
+ * the unlock and the volname in the dict might be removed */
+ tmp_dict = dict_new();
+ if (!tmp_dict) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to create dict");
+ goto out;
+ }
+ dict_copy (dict, tmp_dict);
+
+ /* BUILD PEERS LIST */
+ INIT_LIST_HEAD (&conf->xaction_peers);
+ npeers = gd_build_peers_list (&conf->peers, &conf->xaction_peers, op);
+
+ /* LOCKDOWN PHASE - Acquire mgmt_v3 locks */
+ ret = glusterd_mgmt_v3_initiate_lockdown (conf, op, dict, &op_errstr,
+ npeers, &is_acquired);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "mgmt_v3 lockdown failed.");
+ goto out;
+ }
+
+ /* BUILD PAYLOAD */
+ ret = glusterd_mgmt_v3_build_payload (&req_dict, &op_errstr, dict, op);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, LOGSTR_BUILD_PAYLOAD,
+ gd_op_list[op]);
+ if (op_errstr == NULL)
+ gf_asprintf (&op_errstr, OPERRSTR_BUILD_PAYLOAD);
+ goto out;
+ }
+
+ /* PRE-COMMIT VALIDATE PHASE */
+ ret = glusterd_mgmt_v3_pre_validate (conf, op, req_dict,
+ &op_errstr, npeers);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Pre Validation Failed");
+ goto out;
+ }
+
+ /* COMMIT OP PHASE */
+ ret = glusterd_mgmt_v3_commit (conf, op, dict, req_dict,
+ &op_errstr, npeers);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Commit Op Failed");
+ goto out;
+ }
+
+ /* POST-COMMIT VALIDATE PHASE */
+ /* As of now, post_validate is not handling any other
+ commands other than snapshot. So as of now, I am
+ sending 0 (op_ret as 0).
+ */
+ ret = glusterd_mgmt_v3_post_validate (conf, op, 0, dict, req_dict,
+ &op_errstr, npeers);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Post Validation Failed");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ op_ret = ret;
+ /* UNLOCK PHASE FOR PEERS*/
+ (void) glusterd_mgmt_v3_release_peer_locks (conf, op, dict,
+ op_ret, &op_errstr,
+ npeers, is_acquired);
+
+ /* LOCAL VOLUME(S) UNLOCK */
+ if (is_acquired) {
+ /* Trying to release multiple mgmt_v3 locks */
+ ret = glusterd_multiple_mgmt_v3_unlock (tmp_dict, MY_UUID);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to release mgmt_v3 locks on localhost");
+ op_ret = ret;
+ }
+ }
+
+ /* SEND CLI RESPONSE */
+ glusterd_op_send_cli_response (op, op_ret, 0, req, dict, op_errstr);
+
+ if (req_dict)
+ dict_unref (req_dict);
+
+ if (tmp_dict)
+ dict_unref (tmp_dict);
+
+ if (op_errstr) {
+ GF_FREE (op_errstr);
+ op_errstr = NULL;
+ }
+
+ return 0;
+}
+
+int32_t
+glusterd_mgmt_v3_initiate_snap_phases (rpcsvc_request_t *req, glusterd_op_t op,
+ dict_t *dict)
+{
+ int32_t ret = -1;
+ int32_t op_ret = -1;
+ int32_t npeers = 0;
+ dict_t *req_dict = NULL;
+ dict_t *tmp_dict = NULL;
+ glusterd_conf_t *conf = NULL;
+ char *op_errstr = NULL;
+ xlator_t *this = NULL;
+ gf_boolean_t is_acquired = _gf_false;
+ uuid_t *originator_uuid = NULL;
+ gf_boolean_t success = _gf_false;
+ char *tmp_errstr = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ GF_ASSERT (dict);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ /* Save the MY_UUID as the originator_uuid. This originator_uuid
+ * will be used by is_origin_glusterd() to determine if a node
+ * is the originator node for a command. */
+ originator_uuid = GF_CALLOC (1, sizeof(uuid_t),
+ gf_common_mt_uuid_t);
+ if (!originator_uuid) {
+ ret = -1;
+ goto out;
+ }
+
+ uuid_copy (*originator_uuid, MY_UUID);
+ ret = dict_set_bin (dict, "originator_uuid",
+ originator_uuid, sizeof (uuid_t));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set originator_uuid.");
+ goto out;
+ }
+
+ /* Marking the operation as complete synctasked */
+ ret = dict_set_int32 (dict, "is_synctasked", _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set synctasked flag.");
+ goto out;
+ }
+
+ /* Use a copy at local unlock as cli response will be sent before
+ * the unlock and the volname in the dict might be removed */
+ tmp_dict = dict_new();
+ if (!tmp_dict) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to create dict");
+ goto out;
+ }
+ dict_copy (dict, tmp_dict);
+
+ /* BUILD PEERS LIST */
+ INIT_LIST_HEAD (&conf->xaction_peers);
+ npeers = gd_build_peers_list (&conf->peers, &conf->xaction_peers, op);
+
+ /* LOCKDOWN PHASE - Acquire mgmt_v3 locks */
+ ret = glusterd_mgmt_v3_initiate_lockdown (conf, op, dict, &op_errstr,
+ npeers, &is_acquired);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "mgmt_v3 lockdown failed.");
+ goto out;
+ }
+
+ /* BUILD PAYLOAD */
+ ret = glusterd_mgmt_v3_build_payload (&req_dict, &op_errstr, dict, op);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, LOGSTR_BUILD_PAYLOAD,
+ gd_op_list[op]);
+ if (op_errstr == NULL)
+ gf_asprintf (&op_errstr, OPERRSTR_BUILD_PAYLOAD);
+ goto out;
+ }
+
+ /* PRE-COMMIT VALIDATE PHASE */
+ ret = glusterd_mgmt_v3_pre_validate (conf, op, req_dict,
+ &op_errstr, npeers);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Pre Validation Failed");
+ goto out;
+ }
+
+ /* BRICK OP PHASE for initiating barrier*/
+ ret = dict_set_int32 (req_dict, "barrier", 1);
+ if (ret)
+ goto out;
+ ret = glusterd_mgmt_v3_brick_op (conf, op, req_dict,
+ &op_errstr, npeers);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Brick Ops Failed");
+ goto unbarrier;
+ }
+
+ /* COMMIT OP PHASE */
+ /* TODO: As of now, the plan is to do quorum check before sending the
+ commit fop and if the quorum succeeds, then commit is sent to all
+ the other glusterds.
+ snap create functionality now creates the in memory and on disk
+ objects for the snapshot (marking them as incomplete), takes the lvm
+ snapshot and then updates the status of the in memory and on disk
+ snap objects as complete. Suppose one of the glusterds goes down
+ after taking the lvm snapshot, but before updating the snap object,
+ then treat it as a snapshot create failure and trigger cleanup.
+ i.e the number of commit responses received by the originator
+ glusterd shold be the same as the number of peers it has sent the
+ request to (i.e npeers variable). If not, then originator glusterd
+ will initiate cleanup in post-validate fop.
+ Question: What if one of the other glusterds goes down as explained
+ above and along with it the originator glusterd also goes down?
+ Who will initiate the cleanup?
+ */
+ ret = glusterd_mgmt_v3_commit (conf, op, dict, req_dict,
+ &op_errstr, npeers);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Commit Op Failed");
+ /* If the main op fails, we should save the error string.
+ Because, op_errstr will be used for unbarrier and
+ unlock ops also. We might lose the actual error that
+ caused the failure.
+ */
+ tmp_errstr = op_errstr;
+ op_errstr = NULL;
+ goto unbarrier;
+ }
+
+ success = _gf_true;
+unbarrier:
+ /* BRICK OP PHASE for removing the barrier*/
+ ret = dict_set_int32 (req_dict, "barrier", 0);
+ if (ret)
+ goto out;
+ ret = glusterd_mgmt_v3_brick_op (conf, op, req_dict,
+ &op_errstr, npeers);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Brick Ops Failed");
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+ op_ret = ret;
+
+ if (success == _gf_false)
+ op_ret = -1;
+
+ /* POST-COMMIT VALIDATE PHASE */
+ ret = glusterd_mgmt_v3_post_validate (conf, op, op_ret, dict, req_dict,
+ &op_errstr, npeers);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Post Validation Failed");
+ op_ret = -1;
+ }
+
+ /* UNLOCK PHASE FOR PEERS*/
+ (void) glusterd_mgmt_v3_release_peer_locks (conf, op, dict,
+ op_ret, &op_errstr,
+ npeers, is_acquired);
+
+ /* If the commit op (snapshot taking) failed, then the error is stored
+ in tmp_errstr and unbarrier is called. Suppose, if unbarrier also
+ fails, then the error happened in unbarrier is logged and freed.
+ The error happened in commit op, which is stored in tmp_errstr
+ is sent to cli.
+ */
+ if (tmp_errstr) {
+ if (op_errstr) {
+ gf_log (this->name, GF_LOG_ERROR, "unbarrier brick op"
+ "failed with the error %s", op_errstr);
+ GF_FREE (op_errstr);
+ op_errstr = NULL;
+ }
+ op_errstr = tmp_errstr;
+ }
+
+ /* LOCAL VOLUME(S) UNLOCK */
+ if (is_acquired) {
+ /* Trying to release multiple mgmt_v3 locks */
+ ret = glusterd_multiple_mgmt_v3_unlock (tmp_dict, MY_UUID);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to release mgmt_v3 locks on localhost");
+ op_ret = ret;
+ }
+ }
+
+ /* SEND CLI RESPONSE */
+ glusterd_op_send_cli_response (op, op_ret, 0, req, dict, op_errstr);
+
+ if (req_dict)
+ dict_unref (req_dict);
+
+ if (tmp_dict)
+ dict_unref (tmp_dict);
+
+ if (op_errstr) {
+ GF_FREE (op_errstr);
+ op_errstr = NULL;
+ }
+
+ return 0;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-mgmt.h b/xlators/mgmt/glusterd/src/glusterd-mgmt.h
new file mode 100644
index 000000000..b185a9bec
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-mgmt.h
@@ -0,0 +1,45 @@
+/*
+ Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _GLUSTERD_MGMT_H_
+#define _GLUSTERD_MGMT_H_
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+int32_t
+gd_mgmt_v3_pre_validate_fn (glusterd_op_t op, dict_t *dict,
+ char **op_errstr, dict_t *rsp_dict);
+
+int32_t
+gd_mgmt_v3_brick_op_fn (glusterd_op_t op, dict_t *dict,
+ char **op_errstr, dict_t *rsp_dict);
+
+int32_t
+gd_mgmt_v3_commit_fn (glusterd_op_t op, dict_t *dict,
+ char **op_errstr, dict_t *rsp_dict);
+
+int32_t
+gd_mgmt_v3_post_validate_fn (glusterd_op_t op, int32_t op_ret, dict_t *dict,
+ char **op_errstr, dict_t *rsp_dict);
+
+int32_t
+glusterd_mgmt_v3_initiate_all_phases (rpcsvc_request_t *req, glusterd_op_t op,
+ dict_t *dict);
+
+int32_t
+glusterd_mgmt_v3_initiate_snap_phases (rpcsvc_request_t *req, glusterd_op_t op,
+ dict_t *dict);
+
+int
+glusterd_snap_pre_validate_use_rsp_dict (dict_t *dst, dict_t *src);
+
+#endif /* _GLUSTERD_MGMT_H_ */
diff --git a/xlators/mgmt/glusterd/src/glusterd-mountbroker.c b/xlators/mgmt/glusterd/src/glusterd-mountbroker.c
index 0d67d1303..4ce441da8 100644
--- a/xlators/mgmt/glusterd/src/glusterd-mountbroker.c
+++ b/xlators/mgmt/glusterd/src/glusterd-mountbroker.c
@@ -231,7 +231,6 @@ parse_mount_pattern_desc (gf_mount_spec_t *mspec, char *pdesc)
const char *georep_mnt_desc_template =
"SUP("
- "xlator-option=\\*-dht.assert-no-child-down=true "
"volfile-server=localhost "
"client-pid=%d "
"user-map-root=%s "
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
index d117b05af..9b130b4c6 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c
@@ -37,6 +37,7 @@
#include "glusterd-store.h"
#include "glusterd-hooks.h"
#include "glusterd-volgen.h"
+#include "glusterd-locks.h"
#include "syscall.h"
#include "cli1-xdr.h"
#include "common-utils.h"
@@ -46,9 +47,274 @@
#include <signal.h>
#include <sys/wait.h>
+#define ALL_VOLUME_OPTION_CHECK(volname, key, ret, op_errstr, label) \
+ do { \
+ gf_boolean_t _all = !strcmp ("all", volname); \
+ gf_boolean_t _ratio = !strcmp (key, \
+ GLUSTERD_QUORUM_RATIO_KEY); \
+ if (_all && !_ratio) { \
+ ret = -1; \
+ *op_errstr = gf_strdup ("Not a valid option for all " \
+ "volumes"); \
+ goto label; \
+ } else if (!_all && _ratio) { \
+ ret = -1; \
+ *op_errstr = gf_strdup ("Not a valid option for " \
+ "single volume"); \
+ goto label; \
+ } \
+ } while (0)
+
static struct list_head gd_op_sm_queue;
pthread_mutex_t gd_op_sm_lock;
glusterd_op_info_t opinfo = {{0},};
+
+int32_t
+glusterd_txn_opinfo_dict_init ()
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ priv->glusterd_txn_opinfo = dict_new ();
+ if (!priv->glusterd_txn_opinfo) {
+ ret = -1;
+ goto out;
+ }
+
+ memset (priv->global_txn_id, '\0', sizeof(uuid_t));
+
+ ret = 0;
+out:
+ return ret;
+}
+
+void
+glusterd_txn_opinfo_dict_fini ()
+{
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (priv->glusterd_txn_opinfo)
+ dict_unref (priv->glusterd_txn_opinfo);
+}
+
+void
+glusterd_txn_opinfo_init (glusterd_op_info_t *opinfo,
+ glusterd_op_sm_state_info_t *state,
+ glusterd_op_t *op,
+ dict_t *op_ctx,
+ rpcsvc_request_t *req)
+{
+ GF_ASSERT (opinfo);
+
+ if (state)
+ opinfo->state = *state;
+
+ if (op)
+ opinfo->op = *op;
+
+ if (op_ctx)
+ opinfo->op_ctx = dict_ref(op_ctx);
+ else
+ opinfo->op_ctx = NULL;
+
+ if (req)
+ opinfo->req = req;
+
+ return;
+}
+
+int32_t
+glusterd_generate_txn_id (dict_t *dict, uuid_t **txn_id)
+{
+ int32_t ret = -1;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (dict);
+
+ *txn_id = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t);
+ if (!*txn_id)
+ goto out;
+
+ if (priv->op_version < GD_OP_VERSION_4)
+ uuid_copy (**txn_id, priv->global_txn_id);
+ else
+ uuid_generate (**txn_id);
+
+ ret = dict_set_bin (dict, "transaction_id",
+ *txn_id, sizeof (**txn_id));
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Failed to set transaction id.");
+ goto out;
+ }
+
+ gf_log ("", GF_LOG_DEBUG,
+ "Transaction_id = %s", uuid_utoa (**txn_id));
+out:
+ if (ret && *txn_id) {
+ GF_FREE (*txn_id);
+ *txn_id = NULL;
+ }
+
+ return ret;
+}
+
+int32_t
+glusterd_get_txn_opinfo (uuid_t *txn_id, glusterd_op_info_t *opinfo)
+{
+ int32_t ret = -1;
+ glusterd_txn_opinfo_obj *opinfo_obj = NULL;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (!txn_id || !opinfo) {
+ gf_log ("", GF_LOG_ERROR,
+ "Empty transaction id or opinfo received.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_bin(priv->glusterd_txn_opinfo,
+ uuid_utoa (*txn_id),
+ (void **) &opinfo_obj);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to get transaction opinfo "
+ "for transaction ID : %s",
+ uuid_utoa (*txn_id));
+ goto out;
+ }
+
+ (*opinfo) = opinfo_obj->opinfo;
+
+ gf_log ("", GF_LOG_DEBUG,
+ "Successfully got opinfo for transaction ID : %s",
+ uuid_utoa (*txn_id));
+
+ ret = 0;
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_set_txn_opinfo (uuid_t *txn_id, glusterd_op_info_t *opinfo)
+{
+ int32_t ret = -1;
+ glusterd_txn_opinfo_obj *opinfo_obj = NULL;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (!txn_id) {
+ gf_log ("", GF_LOG_ERROR, "Empty transaction id received.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_bin(priv->glusterd_txn_opinfo,
+ uuid_utoa (*txn_id),
+ (void **) &opinfo_obj);
+ if (ret) {
+ opinfo_obj = GF_CALLOC (1, sizeof(glusterd_txn_opinfo_obj),
+ gf_common_mt_txn_opinfo_obj_t);
+ if (!opinfo_obj) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_bin(priv->glusterd_txn_opinfo,
+ uuid_utoa (*txn_id), opinfo_obj,
+ sizeof(glusterd_txn_opinfo_obj));
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to set opinfo for transaction ID : %s",
+ uuid_utoa (*txn_id));
+ goto out;
+ }
+ }
+
+ opinfo_obj->opinfo = (*opinfo);
+
+ gf_log ("", GF_LOG_DEBUG,
+ "Successfully set opinfo for transaction ID : %s",
+ uuid_utoa (*txn_id));
+ ret = 0;
+out:
+ if (ret)
+ if (opinfo_obj)
+ GF_FREE (opinfo_obj);
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_clear_txn_opinfo (uuid_t *txn_id)
+{
+ int32_t ret = -1;
+ glusterd_op_info_t txn_op_info = {{0},};
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (!txn_id) {
+ gf_log ("", GF_LOG_ERROR, "Empty transaction id received.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_get_txn_opinfo (txn_id, &txn_op_info);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Transaction opinfo not found");
+ goto out;
+ }
+
+ if (txn_op_info.op_ctx)
+ dict_unref (txn_op_info.op_ctx);
+
+ dict_del(priv->glusterd_txn_opinfo, uuid_utoa (*txn_id));
+
+ gf_log ("", GF_LOG_DEBUG,
+ "Successfully cleared opinfo for transaction ID : %s",
+ uuid_utoa (*txn_id));
+
+ ret = 0;
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
static int glusterfs_port = GLUSTERD_DEFAULT_PORT;
static char *glusterd_op_sm_state_names[] = {
"Default",
@@ -87,6 +353,8 @@ static char *glusterd_op_sm_event_names[] = {
"GD_OP_EVENT_INVALID"
};
+extern struct volopt_map_entry glusterd_volopt_map[];
+
char*
glusterd_op_sm_state_name_get (int state)
{
@@ -127,14 +395,48 @@ glusterd_is_volume_started (glusterd_volinfo_t *volinfo)
}
static int
-glusterd_op_sm_inject_all_acc ()
+glusterd_op_sm_inject_all_acc (uuid_t *txn_id)
{
int32_t ret = -1;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC, txn_id, NULL);
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
+static int
+glusterd_check_quota_cmd (char *key, char *value, char *errstr, size_t size)
+{
+ int ret = -1;
+ gf_boolean_t b = _gf_false;
+
+ if ((strcmp (key, "quota") == 0) ||
+ (strcmp (key, "features.quota") == 0)) {
+ ret = gf_string2boolean (value, &b);
+ if (ret)
+ goto out;
+ if (b) {
+ snprintf (errstr, size," 'gluster "
+ "volume set <VOLNAME> %s %s' is "
+ "deprecated. Use 'gluster volume "
+ "quota <VOLNAME> enable' instead.",
+ key, value);
+ ret = -1;
+ goto out;
+ } else {
+ snprintf (errstr, size, " 'gluster "
+ "volume set <VOLNAME> %s %s' is "
+ "deprecated. Use 'gluster volume "
+ "quota <VOLNAME> disable' instead.",
+ key, value);
+ ret = -1;
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
int
glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickinfo,
gd1_mgmt_brick_op_req **req, dict_t *dict)
@@ -144,6 +446,10 @@ glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickin
char *volname = NULL;
char name[1024] = {0,};
gf_xl_afr_op_t heal_op = GF_AFR_OP_INVALID;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (op < GD_OP_MAX);
GF_ASSERT (op > GD_OP_NONE);
@@ -155,10 +461,8 @@ glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickin
case GD_OP_STOP_VOLUME:
brick_req = GF_CALLOC (1, sizeof (*brick_req),
gf_gld_mt_mop_brick_req_t);
- if (!brick_req) {
- gf_log ("", GF_LOG_ERROR, "Out of Memory");
+ if (!brick_req)
goto out;
- }
brick_req->op = GLUSTERD_BRICK_TERMINATE;
brick_req->name = "";
break;
@@ -166,10 +470,8 @@ glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickin
brick_req = GF_CALLOC (1, sizeof (*brick_req),
gf_gld_mt_mop_brick_req_t);
- if (!brick_req) {
- gf_log ("", GF_LOG_ERROR, "Out of Memory");
+ if (!brick_req)
goto out;
- }
brick_req->op = GLUSTERD_BRICK_XLATOR_INFO;
brick_req->name = brickinfo->path;
@@ -194,10 +496,8 @@ glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickin
{
brick_req = GF_CALLOC (1, sizeof (*brick_req),
gf_gld_mt_mop_brick_req_t);
- if (!brick_req) {
- gf_log (THIS->name, GF_LOG_ERROR, "Out of memory");
+ if (!brick_req)
goto out;
- }
brick_req->op = GLUSTERD_BRICK_STATUS;
brick_req->name = "";
}
@@ -217,6 +517,20 @@ glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickin
brick_req->name = gf_strdup (name);
break;
+ case GD_OP_SNAP:
+ brick_req = GF_CALLOC (1, sizeof (*brick_req),
+ gf_gld_mt_mop_brick_req_t);
+ if (!brick_req)
+ goto out;
+
+ brick_req->op = GLUSTERD_VOLUME_BARRIER_OP;
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret)
+ goto out;
+ snprintf (name, 1024, "%s-server",volname);
+ brick_req->name = gf_strdup (name);
+
+ break;
default:
goto out;
break;
@@ -232,7 +546,7 @@ glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickin
out:
if (ret && brick_req)
GF_FREE (brick_req);
- gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -291,24 +605,92 @@ out:
}
static int
+glusterd_validate_quorum_options (xlator_t *this, char *fullkey, char *value,
+ char **op_errstr)
+{
+ int ret = 0;
+ char *key = NULL;
+ volume_option_t *opt = NULL;
+
+ if (!glusterd_is_quorum_option (fullkey))
+ goto out;
+ key = strchr (fullkey, '.');
+ if (key == NULL) {
+ ret = -1;
+ goto out;
+ }
+ key++;
+ opt = xlator_volume_option_get (this, key);
+ ret = xlator_option_validate (this, key, value, opt, op_errstr);
+out:
+ return ret;
+}
+
+static int
+glusterd_check_client_op_version_support (char *volname, uint32_t op_version,
+ char **op_errstr)
+{
+ int ret = 0;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ rpc_transport_t *xprt = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+ priv = this->private;
+ GF_ASSERT(priv);
+
+ pthread_mutex_lock (&priv->xprt_lock);
+ list_for_each_entry (xprt, &priv->xprt_list, list) {
+ if ((!strcmp(volname, xprt->peerinfo.volname)) &&
+ ((op_version > xprt->peerinfo.max_op_version) ||
+ (op_version < xprt->peerinfo.min_op_version))) {
+ ret = -1;
+ break;
+ }
+ }
+ pthread_mutex_unlock (&priv->xprt_lock);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "One or more clients "
+ "don't support the required op-version");
+ ret = gf_asprintf (op_errstr, "One or more connected clients "
+ "cannot support the feature being set. "
+ "These clients need to be upgraded or "
+ "disconnected before running this command"
+ " again");
+ return -1;
+ }
+ return 0;
+}
+
+static int
glusterd_op_stage_set_volume (dict_t *dict, char **op_errstr)
{
- int ret = 0;
- char *volname = NULL;
- int exists = 0;
- char *key = NULL;
- char *key_fixed = NULL;
- char *value = NULL;
- char str[100] = {0, };
- int count = 0;
- int dict_count = 0;
- char errstr[2048] = {0, };
- glusterd_volinfo_t *volinfo = NULL;
- dict_t *val_dict = NULL;
- gf_boolean_t global_opt = _gf_false;
- glusterd_volinfo_t *voliter = NULL;
- glusterd_conf_t *priv = NULL;
- xlator_t *this = NULL;
+ int ret = -1;
+ char *volname = NULL;
+ int exists = 0;
+ char *key = NULL;
+ char *key_fixed = NULL;
+ char *value = NULL;
+ char str[100] = {0, };
+ int count = 0;
+ int dict_count = 0;
+ char errstr[2048] = {0, };
+ glusterd_volinfo_t *volinfo = NULL;
+ dict_t *val_dict = NULL;
+ gf_boolean_t global_opt = _gf_false;
+ glusterd_volinfo_t *voliter = NULL;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+ uint32_t new_op_version = 0;
+ uint32_t local_new_op_version = 0;
+ uint32_t key_op_version = 0;
+ uint32_t local_key_op_version = 0;
+ gf_boolean_t origin_glusterd = _gf_true;
+ gf_boolean_t check_op_version = _gf_true;
+ gf_boolean_t all_vol = _gf_false;
+ struct volopt_map_entry *vme = NULL;
GF_ASSERT (dict);
this = THIS;
@@ -320,8 +702,40 @@ glusterd_op_stage_set_volume (dict_t *dict, char **op_errstr)
if (!val_dict)
goto out;
- ret = dict_get_int32 (dict, "count", &dict_count);
+ /* Check if we can support the required op-version
+ * This check is not done on the originator glusterd. The originator
+ * glusterd sets this value.
+ */
+ origin_glusterd = is_origin_glusterd (dict);
+
+ if (!origin_glusterd) {
+ /* Check for v3.3.x origin glusterd */
+ check_op_version = dict_get_str_boolean (dict,
+ "check-op-version",
+ _gf_false);
+
+ if (check_op_version) {
+ ret = dict_get_uint32 (dict, "new-op-version",
+ &new_op_version);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get new_op_version");
+ goto out;
+ }
+
+ if ((new_op_version > GD_OP_VERSION_MAX) ||
+ (new_op_version < GD_OP_VERSION_MIN)) {
+ ret = -1;
+ snprintf (errstr, sizeof (errstr),
+ "Required op_version (%d) is not "
+ "supported", new_op_version);
+ gf_log (this->name, GF_LOG_ERROR, "%s", errstr);
+ goto out;
+ }
+ }
+ }
+ ret = dict_get_int32 (dict, "count", &dict_count);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"Count(dict),not set in Volume-Set");
@@ -360,26 +774,31 @@ glusterd_op_stage_set_volume (dict_t *dict, char **op_errstr)
goto out;
}
- exists = glusterd_check_volume_exists (volname);
- if (!exists) {
- snprintf (errstr, sizeof (errstr), "Volume %s does not exist",
- volname);
- gf_log (this->name, GF_LOG_ERROR, "%s", errstr);
- *op_errstr = gf_strdup (errstr);
- ret = -1;
- goto out;
- }
+ if (strcasecmp (volname, "all") != 0) {
+ exists = glusterd_check_volume_exists (volname);
+ if (!exists) {
+ snprintf (errstr, sizeof (errstr),
+ FMTSTR_CHECK_VOL_EXISTS, volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", errstr);
+ ret = -1;
+ goto out;
+ }
- ret = glusterd_volinfo_find (volname, &volinfo);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "Unable to allocate memory");
- goto out;
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ FMTSTR_CHECK_VOL_EXISTS, volname);
+ goto out;
+ }
+
+ ret = glusterd_validate_volume_id (dict, volinfo);
+ if (ret)
+ goto out;
+ } else {
+ all_vol = _gf_true;
}
- ret = glusterd_validate_volume_id (dict, volinfo);
- if (ret)
- goto out;
+ local_new_op_version = priv->op_version;
for ( count = 1; ret != 1 ; count++ ) {
global_opt = _gf_false;
@@ -398,7 +817,7 @@ glusterd_op_stage_set_volume (dict_t *dict, char **op_errstr)
}
if (strcmp (key, "config.memory-accounting") == 0) {
- gf_log (this->name, GF_LOG_INFO,
+ gf_log (this->name, GF_LOG_DEBUG,
"enabling memory accounting for volume %s",
volname);
ret = 0;
@@ -414,41 +833,95 @@ glusterd_op_stage_set_volume (dict_t *dict, char **op_errstr)
(strcasecmp (value, "tcp") == 0) ||
(strcasecmp (value, "tcp,rdma") == 0) ||
(strcasecmp (value, "rdma,tcp") == 0))) {
- ret = snprintf (errstr, 2048,
+ ret = snprintf (errstr, sizeof (errstr),
"transport-type %s does "
- "not exists", key);
- *op_errstr = gf_strdup (errstr);
+ "not exist", value);
/* lets not bother about above return value,
its a failure anyways */
ret = -1;
+ goto out;
}
}
+ ret = glusterd_check_quota_cmd (key, value, errstr, sizeof (errstr));
+ if (ret)
+ goto out;
+
if (is_key_glusterd_hooks_friendly (key))
continue;
+ for (vme = &glusterd_volopt_map[0]; vme->key; vme++) {
+ if ((vme->validate_fn) &&
+ ((!strcmp (key, vme->key)) ||
+ (!strcmp (key, strchr (vme->key, '.') + 1)))) {
+ ret = vme->validate_fn (dict, key, value,
+ op_errstr);
+ if (ret)
+ goto out;
+ break;
+ }
+ }
+
exists = glusterd_check_option_exists (key, &key_fixed);
if (exists == -1) {
ret = -1;
goto out;
}
+
if (!exists) {
gf_log (this->name, GF_LOG_ERROR,
- "Option with name: %s "
- "does not exist", key);
- ret = snprintf (errstr, 2048,
+ "Option with name: %s does not exist", key);
+ ret = snprintf (errstr, sizeof (errstr),
"option : %s does not exist",
key);
if (key_fixed)
- snprintf (errstr + ret, 2048 - ret,
+ snprintf (errstr + ret, sizeof (errstr) - ret,
"\nDid you mean %s?", key_fixed);
- *op_errstr = gf_strdup (errstr);
ret = -1;
goto out;
}
if (key_fixed)
key = key_fixed;
+ ALL_VOLUME_OPTION_CHECK (volname, key, ret, op_errstr, out);
+ ret = glusterd_validate_quorum_options (this, key, value,
+ op_errstr);
+ if (ret)
+ goto out;
+
+ local_key_op_version = glusterd_get_op_version_for_key (key);
+ if (local_key_op_version > local_new_op_version)
+ local_new_op_version = local_key_op_version;
+
+ sprintf (str, "op-version%d", count);
+ if (origin_glusterd) {
+ ret = dict_set_uint32 (dict, str, local_key_op_version);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set key-op-version in dict");
+ goto out;
+ }
+ } else if (check_op_version) {
+ ret = dict_get_uint32 (dict, str, &key_op_version);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get key-op-version from"
+ " dict");
+ goto out;
+ }
+ if (local_key_op_version != key_op_version) {
+ ret = -1;
+ snprintf (errstr, sizeof (errstr),
+ "option: %s op-version mismatch",
+ key);
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s, required op-version = %"PRIu32", "
+ "available op-version = %"PRIu32,
+ errstr, key_op_version,
+ local_key_op_version);
+ goto out;
+ }
+ }
if (glusterd_check_globaloption (key))
global_opt = _gf_true;
@@ -463,9 +936,9 @@ glusterd_op_stage_set_volume (dict_t *dict, char **op_errstr)
}
*op_errstr = NULL;
- if (!global_opt)
+ if (!global_opt && !all_vol)
ret = glusterd_validate_reconfopts (volinfo, val_dict, op_errstr);
- else {
+ else if (!all_vol) {
voliter = NULL;
list_for_each_entry (voliter, &priv->volumes, vol_list) {
ret = glusterd_validate_globalopts (voliter, val_dict, op_errstr);
@@ -475,8 +948,9 @@ glusterd_op_stage_set_volume (dict_t *dict, char **op_errstr)
}
if (ret) {
- gf_log (this->name, GF_LOG_DEBUG, "Could not create temp "
- "volfile, some option failed: %s", *op_errstr);
+ gf_log (this->name, GF_LOG_ERROR, "Could not create "
+ "temp volfile, some option failed: %s",
+ *op_errstr);
goto out;
}
dict_del (val_dict, key);
@@ -487,6 +961,35 @@ glusterd_op_stage_set_volume (dict_t *dict, char **op_errstr)
}
}
+ // Check if all the connected clients support the new op-version
+ ret = glusterd_check_client_op_version_support (volname,
+ local_new_op_version,
+ op_errstr);
+ if (ret)
+ goto out;
+
+ if (origin_glusterd) {
+ ret = dict_set_uint32 (dict, "new-op-version",
+ local_new_op_version);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set new-op-version in dict");
+ goto out;
+ }
+ /* Set this value in dict so other peers know to check for
+ * op-version. This is a hack for 3.3.x compatibility
+ *
+ * TODO: Remove this and the other places this is referred once
+ * 3.3.x compatibility is not required
+ */
+ ret = dict_set_uint32 (dict, "check-op-version",
+ _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set check-op-version in dict");
+ goto out;
+ }
+ }
ret = 0;
@@ -495,6 +998,8 @@ out:
dict_unref (val_dict);
GF_FREE (key_fixed);
+ if (errstr[0] != '\0')
+ *op_errstr = gf_strdup (errstr);
if (ret) {
if (!(*op_errstr)) {
@@ -515,40 +1020,46 @@ glusterd_op_stage_reset_volume (dict_t *dict, char **op_errstr)
{
int ret = 0;
char *volname = NULL;
- gf_boolean_t exists = _gf_false;
+ int exists = 0;
char msg[2048] = {0};
char *key = NULL;
char *key_fixed = NULL;
glusterd_volinfo_t *volinfo = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
- exists = glusterd_check_volume_exists (volname);
+ if (strcasecmp (volname, "all") != 0) {
+ exists = glusterd_check_volume_exists (volname);
+ if (!exists) {
+ snprintf (msg, sizeof (msg), FMTSTR_CHECK_VOL_EXISTS,
+ volname);
+ ret = -1;
+ goto out;
+ }
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ snprintf (msg, sizeof (msg), FMTSTR_CHECK_VOL_EXISTS,
+ volname);
+ goto out;
+ }
- if (!exists) {
- snprintf (msg, sizeof (msg), "Volume %s does not "
- "exist", volname);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
- ret = -1;
- goto out;
+ ret = glusterd_validate_volume_id (dict, volinfo);
+ if (ret)
+ goto out;
}
- ret = glusterd_volinfo_find (volname, &volinfo);
- if (ret)
- goto out;
-
- ret = glusterd_validate_volume_id (dict, volinfo);
- if (ret)
- goto out;
ret = dict_get_str (dict, "key", &key);
if (ret) {
- gf_log ("glusterd", GF_LOG_ERROR, "Unable to get option key");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get option key");
goto out;
}
if (strcmp(key, "all")) {
@@ -557,24 +1068,32 @@ glusterd_op_stage_reset_volume (dict_t *dict, char **op_errstr)
ret = -1;
goto out;
}
+
if (!exists) {
- gf_log ("glusterd", GF_LOG_ERROR,
- "Option %s does not exist", key);
- ret = snprintf (msg, 2048,
+ ret = snprintf (msg, sizeof (msg),
"Option %s does not exist", key);
if (key_fixed)
- snprintf (msg + ret, 2048 - ret,
+ snprintf (msg + ret, sizeof (msg) - ret,
"\nDid you mean %s?", key_fixed);
- *op_errstr = gf_strdup (msg);
ret = -1;
goto out;
+ } else if (exists > 0) {
+ if (key_fixed)
+ key = key_fixed;
+ ALL_VOLUME_OPTION_CHECK (volname, key, ret,
+ op_errstr, out);
}
}
out:
GF_FREE (key_fixed);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ if (msg[0] != '\0') {
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+ *op_errstr = gf_strdup (msg);
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -600,25 +1119,7 @@ glusterd_op_stage_sync_volume (dict_t *dict, char **op_errstr)
goto out;
}
- ret = glusterd_is_local_addr (hostname);
- if (ret) {
- ret = glusterd_friend_find (NULL, hostname, &peerinfo);
- if (ret) {
- snprintf (msg, sizeof (msg), "%s, is not a friend",
- hostname);
- *op_errstr = gf_strdup (msg);
- goto out;
- }
-
- if (!peerinfo->connected) {
- snprintf (msg, sizeof (msg), "%s, is not connected at "
- "the moment", hostname);
- *op_errstr = gf_strdup (msg);
- ret = -1;
- goto out;
- }
- } else {
-
+ if (gf_is_local_addr (hostname)) {
//volname is not present in case of sync all
ret = dict_get_str (dict, "volname", &volname);
if (!ret) {
@@ -634,12 +1135,26 @@ glusterd_op_stage_sync_volume (dict_t *dict, char **op_errstr)
if (ret)
goto out;
- ret = glusterd_validate_volume_id (dict, volinfo);
- if (ret)
- goto out;
} else {
ret = 0;
}
+ } else {
+ ret = glusterd_friend_find (NULL, hostname, &peerinfo);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "%s, is not a friend",
+ hostname);
+ *op_errstr = gf_strdup (msg);
+ goto out;
+ }
+
+ if (!peerinfo->connected) {
+ snprintf (msg, sizeof (msg), "%s, is not connected at "
+ "the moment", hostname);
+ *op_errstr = gf_strdup (msg);
+ ret = -1;
+ goto out;
+ }
+
}
out:
@@ -677,18 +1192,24 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr)
if (cmd & GF_CLI_STATUS_ALL)
goto out;
+ if ((cmd & GF_CLI_STATUS_QUOTAD) &&
+ (priv->op_version == GD_OP_VERSION_MIN)) {
+ snprintf (msg, sizeof (msg), "The cluster is operating at "
+ "version 1. Getting the status of quotad is not "
+ "allowed in this state.");
+ ret = -1;
+ goto out;
+ }
+
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
- "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- snprintf (msg, sizeof(msg), "Volume %s does not exist",
- volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
+ snprintf (msg, sizeof(msg), FMTSTR_CHECK_VOL_EXISTS, volname);
ret = -1;
goto out;
}
@@ -701,7 +1222,6 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr)
if (!ret) {
snprintf (msg, sizeof (msg), "Volume %s is not started",
volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
ret = -1;
goto out;
}
@@ -716,7 +1236,6 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr)
snprintf (msg, sizeof (msg),
"NFS server is disabled for volume %s",
volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
} else if ((cmd & GF_CLI_STATUS_SHD) != 0) {
@@ -725,7 +1244,6 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr)
snprintf (msg, sizeof (msg),
"Volume %s is not of type replicate",
volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
@@ -737,10 +1255,15 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr)
snprintf (msg, sizeof (msg),
"Self-heal Daemon is disabled for volume %s",
volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
-
+ } else if ((cmd & GF_CLI_STATUS_QUOTAD) != 0) {
+ if (!glusterd_is_volume_quota_enabled (volinfo)) {
+ ret = -1;
+ snprintf (msg, sizeof (msg), "Volume %s does not have "
+ "quota enabled", volname);
+ goto out;
+ }
} else if ((cmd & GF_CLI_STATUS_BRICK) != 0) {
ret = dict_get_str (dict, "brick", &brick);
if (ret)
@@ -751,8 +1274,6 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr)
if (ret) {
snprintf (msg, sizeof(msg), "No brick %s in"
" volume %s", brick, volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
-
ret = -1;
goto out;
}
@@ -768,7 +1289,7 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr)
*op_errstr = gf_strdup ("Validation Failed for Status");
}
- gf_log (THIS->name, GF_LOG_DEBUG, "Returning: %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning: %d", ret);
return ret;
}
@@ -877,14 +1398,17 @@ _delete_reconfig_opt (dict_t *this, char *key, data_t *value, void *data)
GF_ASSERT (data);
is_force = (int32_t*)data;
- if (*is_force != 1 &&
- (_gf_true == glusterd_check_voloption_flags (key,
- OPT_FLAG_FORCE))) {
+ if (*is_force != 1) {
+ if (_gf_true == glusterd_check_voloption_flags (key,
+ OPT_FLAG_FORCE)) {
/* indicate to caller that we don't set the option
* due to being protected
*/
- *is_force = -1;
- goto out;
+ *is_force = *is_force | GD_OP_PROTECTED;
+ goto out;
+ } else {
+ *is_force = *is_force | GD_OP_UNPROTECTED;
+ }
}
gf_log ("", GF_LOG_DEBUG, "deleting dict with key=%s,value=%s",
@@ -895,44 +1419,53 @@ out:
}
static int
+_delete_reconfig_global_opt (dict_t *this, char *key, data_t *value, void *data)
+{
+ int32_t *is_force = 0;
+
+ GF_ASSERT (data);
+ is_force = (int32_t*)data;
+
+ if (strcmp (GLUSTERD_GLOBAL_OPT_VERSION, key) == 0)
+ goto out;
+
+ _delete_reconfig_opt (this, key, value, data);
+out:
+ return 0;
+}
+
+static int
glusterd_options_reset (glusterd_volinfo_t *volinfo, char *key,
int32_t *is_force)
{
int ret = 0;
data_t *value = NULL;
char *key_fixed = NULL;
+ xlator_t *this = NULL;
- gf_log ("", GF_LOG_DEBUG, "Received volume set reset command");
-
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (volinfo->dict);
GF_ASSERT (key);
if (!strncmp(key, "all", 3))
dict_foreach (volinfo->dict, _delete_reconfig_opt, is_force);
else {
- if (glusterd_check_option_exists (key, &key_fixed) != 1) {
- gf_log ("glusterd", GF_LOG_ERROR,
- "volinfo dict inconsistency: option %s not found",
- key);
- ret = -1;
- goto out;
- }
- if (key_fixed)
- key = key_fixed;
value = dict_get (volinfo->dict, key);
if (!value) {
- gf_log ("glusterd", GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"no value set for option %s", key);
goto out;
}
_delete_reconfig_opt (volinfo->dict, key, value, is_force);
}
- ret = glusterd_create_volfiles_and_notify_services (volinfo);
+ gd_update_volume_op_versions (volinfo);
+ ret = glusterd_create_volfiles_and_notify_services (volinfo);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to create volfile for"
- " 'volume set'");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to create volfile for"
+ " 'volume reset'");
ret = -1;
goto out;
}
@@ -951,23 +1484,119 @@ glusterd_options_reset (glusterd_volinfo_t *volinfo, char *key,
out:
GF_FREE (key_fixed);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
+static int
+glusterd_op_reset_all_volume_options (xlator_t *this, dict_t *dict)
+{
+ char *key = NULL;
+ char *key_fixed = NULL;
+ int ret = -1;
+ int32_t is_force = 0;
+ glusterd_conf_t *conf = NULL;
+ dict_t *dup_opt = NULL;
+ gf_boolean_t all = _gf_false;
+ char *next_version = NULL;
+ gf_boolean_t quorum_action = _gf_false;
+
+ conf = this->private;
+ ret = dict_get_str (dict, "key", &key);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get key");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "force", &is_force);
+ if (ret)
+ is_force = 0;
+
+ if (strcmp (key, "all")) {
+ ret = glusterd_check_option_exists (key, &key_fixed);
+ if (ret <= 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Option %s does not "
+ "exist", key);
+ ret = -1;
+ goto out;
+ }
+ } else {
+ all = _gf_true;
+ }
+
+ if (key_fixed)
+ key = key_fixed;
+
+ ret = -1;
+ dup_opt = dict_new ();
+ if (!dup_opt)
+ goto out;
+ if (!all) {
+ dict_copy (conf->opts, dup_opt);
+ dict_del (dup_opt, key);
+ }
+ ret = glusterd_get_next_global_opt_version_str (conf->opts,
+ &next_version);
+ if (ret)
+ goto out;
+
+ ret = dict_set_str (dup_opt, GLUSTERD_GLOBAL_OPT_VERSION, next_version);
+ if (ret)
+ goto out;
+
+ ret = glusterd_store_options (this, dup_opt);
+ if (ret)
+ goto out;
+
+ if (glusterd_is_quorum_changed (conf->opts, key, NULL))
+ quorum_action = _gf_true;
+
+ ret = dict_set_dynstr (conf->opts, GLUSTERD_GLOBAL_OPT_VERSION,
+ next_version);
+ if (ret)
+ goto out;
+ else
+ next_version = NULL;
+
+ if (!all) {
+ dict_del (conf->opts, key);
+ } else {
+ dict_foreach (conf->opts, _delete_reconfig_global_opt,
+ &is_force);
+ }
+out:
+ GF_FREE (key_fixed);
+ if (dup_opt)
+ dict_unref (dup_opt);
+
+ gf_log (this->name, GF_LOG_DEBUG, "returning %d", ret);
+ if (quorum_action)
+ glusterd_do_quorum_action ();
+ GF_FREE (next_version);
+ return ret;
+}
static int
-glusterd_op_reset_volume (dict_t *dict, char **op_errstr)
+glusterd_op_reset_volume (dict_t *dict, char **op_rspstr)
{
glusterd_volinfo_t *volinfo = NULL;
int ret = -1;
char *volname = NULL;
char *key = NULL;
+ char *key_fixed = NULL;
int32_t is_force = 0;
+ gf_boolean_t quorum_action = _gf_false;
+ xlator_t *this = NULL;
+ this = THIS;
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name " );
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name" );
+ goto out;
+ }
+
+ if (strcasecmp (volname, "all") == 0) {
+ ret = glusterd_op_reset_all_volume_options (this, dict);
goto out;
}
@@ -977,30 +1606,55 @@ glusterd_op_reset_volume (dict_t *dict, char **op_errstr)
ret = dict_get_str (dict, "key", &key);
if (ret) {
- gf_log ("glusterd", GF_LOG_ERROR, "Unable to get option key");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get option key");
goto out;
}
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to allocate memory");
+ gf_log (this->name, GF_LOG_ERROR, FMTSTR_CHECK_VOL_EXISTS,
+ volname);
goto out;
}
- ret = glusterd_options_reset (volinfo, key, &is_force);
- if (is_force == -1) {
+ if (strcmp (key, "all") &&
+ glusterd_check_option_exists (key, &key_fixed) != 1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "volinfo dict inconsistency: option %s not found",
+ key);
ret = -1;
- gf_asprintf(op_errstr, "'%s' is protected. To reset use 'force'.",
- key);
+ goto out;
+ }
+ if (key_fixed)
+ key = key_fixed;
+
+ if (glusterd_is_quorum_changed (volinfo->dict, key, NULL))
+ quorum_action = _gf_true;
+
+ ret = glusterd_options_reset (volinfo, key, &is_force);
+ if (ret == -1) {
+ gf_asprintf(op_rspstr, "Volume reset : failed");
+ } else if (is_force & GD_OP_PROTECTED) {
+ if (is_force & GD_OP_UNPROTECTED) {
+ gf_asprintf (op_rspstr, "All unprotected fields were"
+ " reset. To reset the protected fields,"
+ " use 'force'.");
+ } else {
+ ret = -1;
+ gf_asprintf (op_rspstr, "'%s' is protected. To reset"
+ " use 'force'.", key);
+ }
}
out:
- gf_log ("", GF_LOG_DEBUG, "'volume reset' returning %d", ret);
- return ret;
+ GF_FREE (key_fixed);
+ if (quorum_action)
+ glusterd_do_quorum_action ();
+ gf_log (this->name, GF_LOG_DEBUG, "'volume reset' returning %d", ret);
+ return ret;
}
-
int
glusterd_stop_bricks (glusterd_volinfo_t *volinfo)
{
@@ -1019,14 +1673,114 @@ glusterd_stop_bricks (glusterd_volinfo_t *volinfo)
int
glusterd_start_bricks (glusterd_volinfo_t *volinfo)
{
- glusterd_brickinfo_t *brickinfo = NULL;
+ int ret = -1;
+ glusterd_brickinfo_t *brickinfo = NULL;
+
+ GF_ASSERT (volinfo);
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
- if (glusterd_brick_start (volinfo, brickinfo, _gf_false))
- return -1;
+ ret = glusterd_brick_start (volinfo, brickinfo, _gf_false);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Failed to start %s:%s for %s",
+ brickinfo->hostname, brickinfo->path,
+ volinfo->volname);
+ goto out;
+ }
}
- return 0;
+ ret = 0;
+out:
+ return ret;
+}
+
+static int
+glusterd_op_set_all_volume_options (xlator_t *this, dict_t *dict)
+{
+ char *key = NULL;
+ char *key_fixed = NULL;
+ char *value = NULL;
+ char *dup_value = NULL;
+ int ret = -1;
+ glusterd_conf_t *conf = NULL;
+ dict_t *dup_opt = NULL;
+ char *next_version = NULL;
+ gf_boolean_t quorum_action = _gf_false;
+
+ conf = this->private;
+ ret = dict_get_str (dict, "key1", &key);
+ if (ret)
+ goto out;
+
+ ret = dict_get_str (dict, "value1", &value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "invalid key,value pair in 'volume set'");
+ goto out;
+ }
+ ret = glusterd_check_option_exists (key, &key_fixed);
+ if (ret <= 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Invalid key %s", key);
+ ret = -1;
+ goto out;
+ }
+
+ if (key_fixed)
+ key = key_fixed;
+
+ ret = -1;
+ dup_opt = dict_new ();
+ if (!dup_opt)
+ goto out;
+ dict_copy (conf->opts, dup_opt);
+ ret = dict_set_str (dup_opt, key, value);
+ if (ret)
+ goto out;
+
+ ret = glusterd_get_next_global_opt_version_str (conf->opts,
+ &next_version);
+ if (ret)
+ goto out;
+
+ ret = dict_set_str (dup_opt, GLUSTERD_GLOBAL_OPT_VERSION, next_version);
+ if (ret)
+ goto out;
+
+ ret = glusterd_store_options (this, dup_opt);
+ if (ret)
+ goto out;
+
+ if (glusterd_is_quorum_changed (conf->opts, key, value))
+ quorum_action = _gf_true;
+
+ ret = dict_set_dynstr (conf->opts, GLUSTERD_GLOBAL_OPT_VERSION,
+ next_version);
+ if (ret)
+ goto out;
+ else
+ next_version = NULL;
+
+ dup_value = gf_strdup (value);
+ if (!dup_value)
+ goto out;
+
+ ret = dict_set_dynstr (conf->opts, key, dup_value);
+ if (ret)
+ goto out;
+ else
+ dup_value = NULL; /* Protect the allocation from GF_FREE */
+
+out:
+ GF_FREE (dup_value);
+ GF_FREE (key_fixed);
+ if (dup_opt)
+ dict_unref (dup_opt);
+
+ gf_log (this->name, GF_LOG_DEBUG, "returning %d", ret);
+ if (quorum_action)
+ glusterd_do_quorum_action ();
+ GF_FREE (next_version);
+ return ret;
}
static int
@@ -1044,8 +1798,12 @@ glusterd_op_set_volume (dict_t *dict)
char str[50] = {0, };
char *op_errstr = NULL;
gf_boolean_t global_opt = _gf_false;
+ gf_boolean_t global_opts_set = _gf_false;
glusterd_volinfo_t *voliter = NULL;
int32_t dict_count = 0;
+ gf_boolean_t check_op_version = _gf_false;
+ uint32_t new_op_version = 0;
+ gf_boolean_t quorum_action = _gf_false;
this = THIS;
GF_ASSERT (this);
@@ -1062,10 +1820,12 @@ glusterd_op_set_volume (dict_t *dict)
if (dict_count == 0) {
ret = glusterd_volset_help (NULL, &op_errstr);
if (ret) {
- op_errstr = (op_errstr)? op_errstr:
- "Volume set help internal error";
- gf_log (this->name, GF_LOG_ERROR, op_errstr);
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ (op_errstr)? op_errstr:
+ "Volume set help internal error");
}
+
+ GF_FREE(op_errstr);
goto out;
}
@@ -1075,15 +1835,33 @@ glusterd_op_set_volume (dict_t *dict)
goto out;
}
+ if (strcasecmp (volname, "all") == 0) {
+ ret = glusterd_op_set_all_volume_options (this, dict);
+ goto out;
+ }
+
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Unable to allocate memory");
+ gf_log (this->name, GF_LOG_ERROR, FMTSTR_CHECK_VOL_EXISTS,
+ volname);
goto out;
}
+ // TODO: Remove this once v3.3 compatability is not required
+ check_op_version = dict_get_str_boolean (dict, "check-op-version",
+ _gf_false);
+
+ if (check_op_version) {
+ ret = dict_get_uint32 (dict, "new-op-version", &new_op_version);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get new op-version from dict");
+ goto out;
+ }
+ }
+
for (count = 1; ret != -1 ; count++) {
- global_opt = _gf_false;
sprintf (str, "key%d", count);
ret = dict_get_str (dict, str, &key);
if (ret)
@@ -1129,11 +1907,13 @@ glusterd_op_set_volume (dict_t *dict)
key_fixed = NULL;
goto out;
}
-
}
- if (glusterd_check_globaloption (key))
+ global_opt = _gf_false;
+ if (glusterd_check_globaloption (key)) {
global_opt = _gf_true;
+ global_opts_set = _gf_true;
+ }
if (!global_opt)
value = gf_strdup (value);
@@ -1148,6 +1928,9 @@ glusterd_op_set_volume (dict_t *dict)
if (key_fixed)
key = key_fixed;
+ if (glusterd_is_quorum_changed (volinfo->dict, key, value))
+ quorum_action = _gf_true;
+
if (global_opt) {
list_for_each_entry (voliter, &priv->volumes, vol_list) {
value = gf_strdup (value);
@@ -1173,7 +1956,21 @@ glusterd_op_set_volume (dict_t *dict)
goto out;
}
- if (!global_opt) {
+ /* Update the cluster op-version before regenerating volfiles so that
+ * correct volfiles are generated
+ */
+ if (new_op_version > priv->op_version) {
+ priv->op_version = new_op_version;
+ ret = glusterd_store_global_info (this);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to store op-version");
+ goto out;
+ }
+ }
+
+ if (!global_opts_set) {
+ gd_update_volume_op_versions (volinfo);
ret = glusterd_create_volfiles_and_notify_services (volinfo);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -1199,6 +1996,7 @@ glusterd_op_set_volume (dict_t *dict)
} else {
list_for_each_entry (voliter, &priv->volumes, vol_list) {
volinfo = voliter;
+ gd_update_volume_op_versions (volinfo);
ret = glusterd_create_volfiles_and_notify_services (volinfo);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -1224,10 +2022,11 @@ glusterd_op_set_volume (dict_t *dict)
}
}
- ret = 0;
out:
GF_FREE (key_fixed);
gf_log (this->name, GF_LOG_DEBUG, "returning %d", ret);
+ if (quorum_action)
+ glusterd_do_quorum_action ();
return ret;
}
@@ -1259,7 +2058,7 @@ glusterd_op_sync_volume (dict_t *dict, char **op_errstr,
goto out;
}
- if (glusterd_is_local_addr (hostname)) {
+ if (!gf_is_local_addr (hostname)) {
ret = 0;
goto out;
}
@@ -1283,12 +2082,12 @@ glusterd_op_sync_volume (dict_t *dict, char **op_errstr,
if (volname) {
ret = glusterd_add_volume_to_dict (volinfo, rsp_dict,
- 1);
+ 1, "volume");
vol_count = 1;
} else {
list_for_each_entry (volinfo, &priv->volumes, vol_list) {
- ret = glusterd_add_volume_to_dict (volinfo,
- rsp_dict, count);
+ ret = glusterd_add_volume_to_dict (volinfo, rsp_dict,
+ count, "volume");
if (ret)
goto out;
@@ -1430,6 +2229,241 @@ out:
}
static int
+_add_brick_name_to_dict (dict_t *dict, char *key, glusterd_brickinfo_t *brick)
+{
+ int ret = -1;
+ char tmp[1024] = {0,};
+ char *brickname = NULL;
+ xlator_t *this = NULL;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (key);
+ GF_ASSERT (brick);
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ snprintf (tmp, sizeof (tmp), "%s:%s", brick->hostname, brick->path);
+ brickname = gf_strdup (tmp);
+ if (!brickname) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to dup brick name");
+ goto out;
+ }
+
+ ret = dict_set_dynstr (dict, key, brickname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add brick name to dict");
+ goto out;
+ }
+ brickname = NULL;
+out:
+ if (brickname)
+ GF_FREE (brickname);
+ return ret;
+}
+
+static int
+_add_remove_bricks_to_dict (dict_t *dict, glusterd_volinfo_t *volinfo,
+ char *prefix)
+{
+ int ret = -1;
+ int count = 0;
+ int i = 0;
+ char brick_key[1024] = {0,};
+ char dict_key[1024] ={0,};
+ char *brick = NULL;
+ xlator_t *this = NULL;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (volinfo);
+ GF_ASSERT (prefix);
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ ret = dict_get_int32 (volinfo->rebal.dict, "count", &count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get brick count");
+ goto out;
+ }
+
+ snprintf (dict_key, sizeof (dict_key), "%s.count", prefix);
+ ret = dict_set_int32 (dict, dict_key, count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set brick count in dict");
+ goto out;
+ }
+
+ for (i = 1; i <= count; i++) {
+ memset (brick_key, 0, sizeof (brick_key));
+ snprintf (brick_key, sizeof (brick_key), "brick%d", i);
+
+ ret = dict_get_str (volinfo->rebal.dict, brick_key, &brick);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get %s", brick_key);
+ goto out;
+ }
+
+ memset (dict_key, 0, sizeof (dict_key));
+ snprintf (dict_key, sizeof (dict_key), "%s.%s", prefix,
+ brick_key);
+ ret = dict_set_str (dict, dict_key, brick);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add brick to dict");
+ goto out;
+ }
+ brick = NULL;
+ }
+
+out:
+ return ret;
+}
+
+/* This adds the respective task-id and all available parameters of a task into
+ * a dictionary
+ */
+static int
+_add_task_to_dict (dict_t *dict, glusterd_volinfo_t *volinfo, int op, int index)
+{
+
+ int ret = -1;
+ char key[128] = {0,};
+ char *uuid_str = NULL;
+ int status = 0;
+ xlator_t *this = NULL;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (volinfo);
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ switch (op) {
+ case GD_OP_REMOVE_BRICK:
+ snprintf (key, sizeof (key), "task%d", index);
+ ret = _add_remove_bricks_to_dict (dict, volinfo, key);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add remove bricks to dict");
+ goto out;
+ }
+ case GD_OP_REBALANCE:
+ uuid_str = gf_strdup (uuid_utoa (volinfo->rebal.rebalance_id));
+ status = volinfo->rebal.defrag_status;
+ break;
+
+ case GD_OP_REPLACE_BRICK:
+ snprintf (key, sizeof (key), "task%d.src-brick", index);
+ ret = _add_brick_name_to_dict (dict, key,
+ volinfo->rep_brick.src_brick);
+ if (ret)
+ goto out;
+ memset (key, 0, sizeof (key));
+
+ snprintf (key, sizeof (key), "task%d.dst-brick", index);
+ ret = _add_brick_name_to_dict (dict, key,
+ volinfo->rep_brick.dst_brick);
+ if (ret)
+ goto out;
+ memset (key, 0, sizeof (key));
+
+ uuid_str = gf_strdup (uuid_utoa (volinfo->rep_brick.rb_id));
+ status = volinfo->rep_brick.rb_status;
+ break;
+
+ default:
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "%s operation doesn't have a"
+ " task_id", gd_op_list[op]);
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "task%d.type", index);
+ ret = dict_set_str (dict, key, (char *)gd_op_list[op]);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Error setting task type in dict");
+ goto out;
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.id", index);
+
+ if (!uuid_str)
+ goto out;
+ ret = dict_set_dynstr (dict, key, uuid_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Error setting task id in dict");
+ goto out;
+ }
+ uuid_str = NULL;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.status", index);
+ ret = dict_set_int32 (dict, key, status);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Error setting task status in dict");
+ goto out;
+ }
+
+out:
+ if (uuid_str)
+ GF_FREE (uuid_str);
+ return ret;
+}
+
+static int
+glusterd_aggregate_task_status (dict_t *rsp_dict, glusterd_volinfo_t *volinfo)
+{
+ int ret = -1;
+ int tasks = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ if (!uuid_is_null (volinfo->rebal.rebalance_id)) {
+ ret = _add_task_to_dict (rsp_dict, volinfo, volinfo->rebal.op,
+ tasks);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add task details to dict");
+ goto out;
+ }
+ tasks++;
+ }
+
+ if (!uuid_is_null (volinfo->rep_brick.rb_id)) {
+ ret = _add_task_to_dict (rsp_dict, volinfo, GD_OP_REPLACE_BRICK,
+ tasks);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add task details to dict");
+ goto out;
+ }
+ tasks++;
+ }
+
+ ret = dict_set_int32 (rsp_dict, "tasks", tasks);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Error setting tasks count in dict");
+ goto out;
+ }
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int
glusterd_op_status_volume (dict_t *dict, char **op_errstr,
dict_t *rsp_dict)
{
@@ -1448,6 +2482,7 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr,
dict_t *vol_opts = NULL;
gf_boolean_t nfs_disabled = _gf_false;
gf_boolean_t shd_enabled = _gf_true;
+ gf_boolean_t origin_glusterd = _gf_false;
this = THIS;
GF_ASSERT (this);
@@ -1457,23 +2492,21 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr,
GF_ASSERT (dict);
+ origin_glusterd = is_origin_glusterd (dict);
+
ret = dict_get_uint32 (dict, "cmd", &cmd);
if (ret)
goto out;
- if (!rsp_dict) {
- //this should happen only on source
+ if (origin_glusterd) {
ret = 0;
- rsp_dict = glusterd_op_get_ctx ();
-
if ((cmd & GF_CLI_STATUS_ALL)) {
ret = glusterd_get_all_volnames (rsp_dict);
if (ret)
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to get all volume "
"names for status");
}
-
}
ret = dict_set_uint32 (rsp_dict, "cmd", cmd);
@@ -1489,7 +2522,7 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr,
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Volume with name: %s "
+ gf_log (this->name, GF_LOG_ERROR, "Volume with name: %s "
"does not exist", volname);
goto out;
}
@@ -1510,6 +2543,14 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr,
other_count++;
node_count++;
+ } else if ((cmd & GF_CLI_STATUS_QUOTAD) != 0) {
+ ret = glusterd_add_node_to_dict ("quotad", rsp_dict, 0,
+ vol_opts);
+ if (ret)
+ goto out;
+ other_count++;
+ node_count++;
+
} else if ((cmd & GF_CLI_STATUS_BRICK) != 0) {
ret = dict_get_str (dict, "brick", &brick);
if (ret)
@@ -1532,6 +2573,10 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr,
brick_index);
node_count++;
+ } else if ((cmd & GF_CLI_STATUS_TASKS) != 0) {
+ ret = glusterd_aggregate_task_status (rsp_dict, volinfo);
+ goto out;
+
} else {
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
brick_index++;
@@ -1581,29 +2626,54 @@ glusterd_op_status_volume (dict_t *dict, char **op_errstr,
goto out;
other_count++;
node_count++;
+ other_index++;
+ }
+ if (glusterd_is_volume_quota_enabled (volinfo)) {
+ ret = glusterd_add_node_to_dict ("quotad",
+ rsp_dict,
+ other_index,
+ vol_opts);
+ if (ret)
+ goto out;
+ other_count++;
+ node_count++;
}
}
}
ret = dict_set_int32 (rsp_dict, "brick-index-max", brick_index);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"Error setting brick-index-max to dict");
goto out;
}
ret = dict_set_int32 (rsp_dict, "other-count", other_count);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"Error setting other-count to dict");
goto out;
}
ret = dict_set_int32 (rsp_dict, "count", node_count);
- if (ret)
- gf_log (THIS->name, GF_LOG_ERROR,
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
"Error setting node count to dict");
+ goto out;
+ }
+
+ /* Active tasks */
+ /* Tasks are added only for normal volume status request for either a
+ * single volume or all volumes
+ */
+ if (!glusterd_status_has_tasks (cmd))
+ goto out;
+
+ ret = glusterd_aggregate_task_status (rsp_dict, volinfo);
+ if (ret)
+ goto out;
+ ret = 0;
out:
- gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -1613,7 +2683,7 @@ glusterd_op_ac_none (glusterd_op_sm_event_t *event, void *ctx)
{
int ret = 0;
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
}
@@ -1627,6 +2697,7 @@ glusterd_op_ac_send_lock (glusterd_op_sm_event_t *event, void *ctx)
xlator_t *this = NULL;
glusterd_peerinfo_t *peerinfo = NULL;
uint32_t pending_count = 0;
+ dict_t *dict = NULL;
this = THIS;
priv = this->private;
@@ -1641,21 +2712,61 @@ glusterd_op_ac_send_lock (glusterd_op_sm_event_t *event, void *ctx)
(glusterd_op_get_op() != GD_OP_SYNC_VOLUME))
continue;
- proc = &peerinfo->mgmt->proctable[GLUSTERD_MGMT_CLUSTER_LOCK];
- if (proc->fn) {
- ret = proc->fn (NULL, this, peerinfo);
- if (ret)
- continue;
- pending_count++;
+ /* Based on the op_version, acquire a cluster or mgmt_v3 lock */
+ if (priv->op_version < GD_OP_VERSION_4) {
+ proc = &peerinfo->mgmt->proctable
+ [GLUSTERD_MGMT_CLUSTER_LOCK];
+ if (proc->fn) {
+ ret = proc->fn (NULL, this, peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Failed to send lock request "
+ "for operation 'Volume %s' to "
+ "peer %s",
+ gd_op_list[opinfo.op],
+ peerinfo->hostname);
+ continue;
+ }
+ pending_count++;
+ }
+ } else {
+ dict = glusterd_op_get_ctx ();
+ dict_ref (dict);
+
+ proc = &peerinfo->mgmt_v3->proctable
+ [GLUSTERD_MGMT_V3_LOCK];
+ if (proc->fn) {
+ ret = dict_set_static_ptr (dict, "peerinfo",
+ peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set peerinfo");
+ dict_unref (dict);
+ goto out;
+ }
+
+ ret = proc->fn (NULL, this, dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Failed to send mgmt_v3 lock "
+ "request for operation "
+ "'Volume %s' to peer %s",
+ gd_op_list[opinfo.op],
+ peerinfo->hostname);
+ dict_unref (dict);
+ continue;
+ }
+ pending_count++;
+ }
}
}
opinfo.pending_count = pending_count;
if (!opinfo.pending_count)
- ret = glusterd_op_sm_inject_all_acc ();
-
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ ret = glusterd_op_sm_inject_all_acc (&event->txn_id);
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
}
@@ -1668,17 +2779,12 @@ glusterd_op_ac_send_unlock (glusterd_op_sm_event_t *event, void *ctx)
xlator_t *this = NULL;
glusterd_peerinfo_t *peerinfo = NULL;
uint32_t pending_count = 0;
+ dict_t *dict = NULL;
this = THIS;
priv = this->private;
GF_ASSERT (priv);
- /*ret = glusterd_unlock (MY_UUID);
-
- if (ret)
- goto out;
- */
-
list_for_each_entry (peerinfo, &priv->peers, uuid_list) {
GF_ASSERT (peerinfo);
@@ -1688,23 +2794,63 @@ glusterd_op_ac_send_unlock (glusterd_op_sm_event_t *event, void *ctx)
(glusterd_op_get_op() != GD_OP_SYNC_VOLUME))
continue;
- proc = &peerinfo->mgmt->proctable[GLUSTERD_MGMT_CLUSTER_UNLOCK];
- if (proc->fn) {
- ret = proc->fn (NULL, this, peerinfo);
- if (ret)
- continue;
- pending_count++;
+ /* Based on the op_version,
+ * release the cluster or mgmt_v3 lock */
+ if (priv->op_version < GD_OP_VERSION_4) {
+ proc = &peerinfo->mgmt->proctable
+ [GLUSTERD_MGMT_CLUSTER_UNLOCK];
+ if (proc->fn) {
+ ret = proc->fn (NULL, this, peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Failed to send unlock request "
+ "for operation 'Volume %s' to "
+ "peer %s",
+ gd_op_list[opinfo.op],
+ peerinfo->hostname);
+ continue;
+ }
+ pending_count++;
+ }
+ } else {
+ dict = glusterd_op_get_ctx ();
+ dict_ref (dict);
+
+ proc = &peerinfo->mgmt_v3->proctable
+ [GLUSTERD_MGMT_V3_UNLOCK];
+ if (proc->fn) {
+ ret = dict_set_static_ptr (dict, "peerinfo",
+ peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to set peerinfo");
+ dict_unref (dict);
+ goto out;
+ }
+
+ ret = proc->fn (NULL, this, dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Failed to send volume unlock "
+ "request for operation "
+ "'Volume %s' to peer %s",
+ gd_op_list[opinfo.op],
+ peerinfo->hostname);
+ dict_unref (dict);
+ continue;
+ }
+ pending_count++;
+ }
}
}
opinfo.pending_count = pending_count;
if (!opinfo.pending_count)
- ret = glusterd_op_sm_inject_all_acc ();
-
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ ret = glusterd_op_sm_inject_all_acc (&event->txn_id);
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
-
}
static int
@@ -1716,9 +2862,10 @@ glusterd_op_ac_ack_drain (glusterd_op_sm_event_t *event, void *ctx)
opinfo.pending_count--;
if (!opinfo.pending_count)
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK,
+ &event->txn_id, NULL);
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
}
@@ -1732,41 +2879,98 @@ glusterd_op_ac_send_unlock_drain (glusterd_op_sm_event_t *event, void *ctx)
static int
glusterd_op_ac_lock (glusterd_op_sm_event_t *event, void *ctx)
{
- glusterd_op_lock_ctx_t *lock_ctx = NULL;
- int32_t ret = 0;
-
+ int32_t ret = 0;
+ char *volname = NULL;
+ glusterd_op_lock_ctx_t *lock_ctx = NULL;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
GF_ASSERT (event);
GF_ASSERT (ctx);
+ this = THIS;
+ priv = this->private;
+
lock_ctx = (glusterd_op_lock_ctx_t *)ctx;
- ret = glusterd_lock (lock_ctx->uuid);
+ /* If the req came from a node running on older op_version
+ * the dict won't be present. Based on it acquiring a cluster
+ * or mgmt_v3 lock */
+ if (lock_ctx->dict == NULL) {
+ ret = glusterd_lock (lock_ctx->uuid);
+ glusterd_op_lock_send_resp (lock_ctx->req, ret);
+ } else {
+ ret = dict_get_str (lock_ctx->dict, "volname", &volname);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to acquire volname");
+ else {
+ ret = glusterd_mgmt_v3_lock (volname, lock_ctx->uuid,
+ "vol");
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to acquire lock for %s",
+ volname);
+ }
- gf_log ("", GF_LOG_DEBUG, "Lock Returned %d", ret);
+ glusterd_op_mgmt_v3_lock_send_resp (lock_ctx->req,
+ &event->txn_id, ret);
- glusterd_op_lock_send_resp (lock_ctx->req, ret);
+ dict_unref (lock_ctx->dict);
+ }
+ gf_log (THIS->name, GF_LOG_DEBUG, "Lock Returned %d", ret);
return ret;
}
static int
glusterd_op_ac_unlock (glusterd_op_sm_event_t *event, void *ctx)
{
- int ret = 0;
- glusterd_op_lock_ctx_t *lock_ctx = NULL;
+ int32_t ret = 0;
+ char *volname = NULL;
+ glusterd_op_lock_ctx_t *lock_ctx = NULL;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
GF_ASSERT (event);
GF_ASSERT (ctx);
+ this = THIS;
+ priv = this->private;
+
lock_ctx = (glusterd_op_lock_ctx_t *)ctx;
- ret = glusterd_unlock (lock_ctx->uuid);
+ /* If the req came from a node running on older op_version
+ * the dict won't be present. Based on it releasing the cluster
+ * or mgmt_v3 lock */
+ if (lock_ctx->dict == NULL) {
+ ret = glusterd_unlock (lock_ctx->uuid);
+ glusterd_op_unlock_send_resp (lock_ctx->req, ret);
+ } else {
+ ret = dict_get_str (lock_ctx->dict, "volname", &volname);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to acquire volname");
+ else {
+ ret = glusterd_mgmt_v3_unlock (volname, lock_ctx->uuid,
+ "vol");
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to release lock for %s",
+ volname);
+ }
+
+ glusterd_op_mgmt_v3_unlock_send_resp (lock_ctx->req,
+ &event->txn_id, ret);
- gf_log ("", GF_LOG_DEBUG, "Unlock Returned %d", ret);
+ dict_unref (lock_ctx->dict);
+ }
- glusterd_op_unlock_send_resp (lock_ctx->req, ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Unlock Returned %d", ret);
+ if (priv->pending_quorum_action)
+ glusterd_do_quorum_action ();
return ret;
}
@@ -1783,7 +2987,7 @@ glusterd_op_ac_local_unlock (glusterd_op_sm_event_t *event, void *ctx)
ret = glusterd_unlock (*originator);
- gf_log ("", GF_LOG_DEBUG, "Unlock Returned %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Unlock Returned %d", ret);
return ret;
}
@@ -1801,9 +3005,10 @@ glusterd_op_ac_rcvd_lock_acc (glusterd_op_sm_event_t *event, void *ctx)
if (opinfo.pending_count > 0)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC,
+ &event->txn_id, NULL);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
out:
return ret;
@@ -1816,16 +3021,17 @@ glusterd_dict_set_volid (dict_t *dict, char *volname, char **op_errstr)
glusterd_volinfo_t *volinfo = NULL;
char *volid = NULL;
char msg[1024] = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
if (!dict || !volname)
goto out;
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- snprintf (msg, sizeof (msg), "Volume %s does not exist",
- volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
+ snprintf (msg, sizeof (msg), FMTSTR_CHECK_VOL_EXISTS, volname);
goto out;
}
volid = gf_strdup (uuid_utoa (volinfo->volume_id));
@@ -1835,11 +3041,15 @@ glusterd_dict_set_volid (dict_t *dict, char *volname, char **op_errstr)
}
ret = dict_set_dynstr (dict, "vol-id", volid);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
- "Failed to set volume id in dictionary");
+ snprintf (msg, sizeof (msg), "Failed to set volume id of volume"
+ " %s", volname);
goto out;
}
out:
+ if (msg[0] != '\0') {
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+ *op_errstr = gf_strdup (msg);
+ }
return ret;
}
@@ -1848,14 +3058,19 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
{
int ret = -1;
void *ctx = NULL;
+ dict_t *dict = NULL;
dict_t *req_dict = NULL;
glusterd_op_t op = GD_OP_NONE;
char *volname = NULL;
uint32_t status_cmd = GF_CLI_STATUS_NONE;
char *errstr = NULL;
+ xlator_t *this = NULL;
GF_ASSERT (req);
+ this = THIS;
+ GF_ASSERT (this);
+
req_dict = dict_new ();
if (!req_dict)
goto out;
@@ -1864,7 +3079,7 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
op = glusterd_op_get_op ();
ctx = (void*)glusterd_op_get_ctx ();
if (!ctx) {
- gf_log ("", GF_LOG_ERROR, "Null Context for "
+ gf_log (this->name, GF_LOG_ERROR, "Null Context for "
"op %d", op);
ret = -1;
goto out;
@@ -1873,32 +3088,39 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
} else {
#define GD_SYNC_OPCODE_KEY "sync-mgmt-operation"
ret = dict_get_int32 (op_ctx, GD_SYNC_OPCODE_KEY, (int32_t*)&op);
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get volume"
+ " operation");
goto out;
+ }
ctx = op_ctx;
- }
#undef GD_SYNC_OPCODE_KEY
+ }
+ dict = ctx;
switch (op) {
case GD_OP_CREATE_VOLUME:
{
- dict_t *dict = ctx;
++glusterfs_port;
ret = dict_set_int32 (dict, "port",
glusterfs_port);
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set port in "
+ "dictionary");
goto out;
+ }
dict_copy (dict, req_dict);
}
break;
+ case GD_OP_GSYNC_CREATE:
case GD_OP_GSYNC_SET:
{
- dict_t *dict = ctx;
ret = glusterd_op_gsync_args_get (dict,
&errstr,
&volname,
- NULL);
+ NULL, NULL);
if (ret == 0) {
ret = glusterd_dict_set_volid
(dict, volname, op_errstr);
@@ -1911,32 +3133,59 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
case GD_OP_SET_VOLUME:
{
- dict_t *dict = ctx;
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_CRITICAL,
+ gf_log (this->name, GF_LOG_CRITICAL,
"volname is not present in "
"operation ctx");
goto out;
}
if (strcmp (volname, "help") &&
- strcmp (volname, "help-xml")) {
+ strcmp (volname, "help-xml") &&
+ strcasecmp (volname, "all")) {
ret = glusterd_dict_set_volid
(dict, volname, op_errstr);
if (ret)
goto out;
}
+ dict_destroy (req_dict);
+ req_dict = dict_ref (dict);
+ }
+ break;
+
+ case GD_OP_SYNC_VOLUME:
+ {
dict_copy (dict, req_dict);
+ break;
+ }
+
+ case GD_OP_REMOVE_BRICK:
+ {
+ dict_t *dict = ctx;
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "volname is not present in "
+ "operation ctx");
+ goto out;
+ }
+
+ ret = glusterd_dict_set_volid (dict, volname,
+ op_errstr);
+ if (ret)
+ goto out;
+
+ dict_destroy (req_dict);
+ req_dict = dict_ref (dict);
}
break;
case GD_OP_STATUS_VOLUME:
{
- dict_t *dict = ctx;
ret = dict_get_uint32 (dict, "cmd",
&status_cmd);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"Status command not present "
"in op ctx");
goto out;
@@ -1946,16 +3195,14 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
break;
}
}
-
+ /*fall-through*/
case GD_OP_DELETE_VOLUME:
case GD_OP_START_VOLUME:
case GD_OP_STOP_VOLUME:
case GD_OP_ADD_BRICK:
case GD_OP_REPLACE_BRICK:
case GD_OP_RESET_VOLUME:
- case GD_OP_REMOVE_BRICK:
case GD_OP_LOG_ROTATE:
- case GD_OP_SYNC_VOLUME:
case GD_OP_QUOTA:
case GD_OP_PROFILE_VOLUME:
case GD_OP_REBALANCE:
@@ -1964,23 +3211,37 @@ glusterd_op_build_payload (dict_t **req, char **op_errstr, dict_t *op_ctx)
case GD_OP_CLEARLOCKS_VOLUME:
case GD_OP_DEFRAG_BRICK_VOLUME:
{
- dict_t *dict = ctx;
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_CRITICAL,
+ gf_log (this->name, GF_LOG_CRITICAL,
"volname is not present in "
"operation ctx");
goto out;
}
- ret = glusterd_dict_set_volid (dict, volname,
- op_errstr);
- if (ret)
- goto out;
+ if (strcasecmp (volname, "all")) {
+ ret = glusterd_dict_set_volid (dict,
+ volname,
+ op_errstr);
+ if (ret)
+ goto out;
+ }
dict_copy (dict, req_dict);
}
break;
+ case GD_OP_COPY_FILE:
+ {
+ dict_copy (dict, req_dict);
+ break;
+ }
+
+ case GD_OP_SYS_EXEC:
+ {
+ dict_copy (dict, req_dict);
+ break;
+ }
+
default:
break;
}
@@ -1992,6 +3253,105 @@ out:
return ret;
}
+gf_boolean_t
+glusterd_is_get_op (xlator_t *this, glusterd_op_t op, dict_t *dict)
+{
+ char *key = NULL;
+ char *volname = NULL;
+ int ret = 0;
+
+ if (op == GD_OP_STATUS_VOLUME)
+ return _gf_true;
+
+ if (op == GD_OP_SET_VOLUME) {
+ //check for set volume help
+ ret = dict_get_str (dict, "volname", &volname);
+ if (volname &&
+ ((strcmp (volname, "help") == 0) ||
+ (strcmp (volname, "help-xml") == 0))) {
+ ret = dict_get_str (dict, "key1", &key);
+ if (ret < 0)
+ return _gf_true;
+ }
+ }
+
+ return _gf_false;
+}
+
+gf_boolean_t
+glusterd_is_op_quorum_validation_required (xlator_t *this, glusterd_op_t op,
+ dict_t *dict)
+{
+ gf_boolean_t required = _gf_true;
+ char *key = NULL;
+ char *key_fixed = NULL;
+ int ret = -1;
+
+ if (glusterd_is_get_op (this, op, dict)) {
+ required = _gf_false;
+ goto out;
+ }
+ if ((op != GD_OP_SET_VOLUME) && (op != GD_OP_RESET_VOLUME))
+ goto out;
+ if (op == GD_OP_SET_VOLUME)
+ ret = dict_get_str (dict, "key1", &key);
+ else if (op == GD_OP_RESET_VOLUME)
+ ret = dict_get_str (dict, "key", &key);
+ if (ret)
+ goto out;
+ ret = glusterd_check_option_exists (key, &key_fixed);
+ if (ret <= 0)
+ goto out;
+ if (key_fixed)
+ key = key_fixed;
+ if (glusterd_is_quorum_option (key))
+ required = _gf_false;
+out:
+ GF_FREE (key_fixed);
+ return required;
+}
+
+static int
+glusterd_op_validate_quorum (xlator_t *this, glusterd_op_t op,
+ dict_t *dict, char **op_errstr)
+{
+ int ret = 0;
+ char *volname = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ char *errstr = NULL;
+
+
+ errstr = "Quorum not met. Volume operation not allowed.";
+ if (!glusterd_is_op_quorum_validation_required (this, op, dict))
+ goto out;
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ if (does_gd_meet_server_quorum (this)) {
+ ret = 0;
+ goto out;
+ }
+
+ if (glusterd_is_volume_in_server_quorum (volinfo)) {
+ ret = -1;
+ *op_errstr = gf_strdup (errstr);
+ goto out;
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
static int
glusterd_op_ac_send_stage_op (glusterd_op_sm_event_t *event, void *ctx)
{
@@ -2014,7 +3374,17 @@ glusterd_op_ac_send_stage_op (glusterd_op_sm_event_t *event, void *ctx)
ret = glusterd_op_build_payload (&dict, &op_errstr, NULL);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Building payload failed");
+ gf_log (this->name, GF_LOG_ERROR, LOGSTR_BUILD_PAYLOAD,
+ gd_op_list[op]);
+ if (op_errstr == NULL)
+ gf_asprintf (&op_errstr, OPERRSTR_BUILD_PAYLOAD);
+ opinfo.op_errstr = op_errstr;
+ goto out;
+ }
+
+ ret = glusterd_op_validate_quorum (this, op, dict, &op_errstr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s", op_errstr);
opinfo.op_errstr = op_errstr;
goto out;
}
@@ -2022,7 +3392,12 @@ glusterd_op_ac_send_stage_op (glusterd_op_sm_event_t *event, void *ctx)
/* rsp_dict NULL from source */
ret = glusterd_op_stage_validate (op, dict, &op_errstr, NULL);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Staging failed");
+ gf_log (this->name, GF_LOG_ERROR, LOGSTR_STAGE_FAIL,
+ gd_op_list[op], "localhost",
+ (op_errstr) ? ":" : " ", (op_errstr) ? op_errstr : " ");
+ if (op_errstr == NULL)
+ gf_asprintf (&op_errstr, OPERRSTR_STAGE_FAIL,
+ "localhost");
opinfo.op_errstr = op_errstr;
goto out;
}
@@ -2041,13 +3416,19 @@ glusterd_op_ac_send_stage_op (glusterd_op_sm_event_t *event, void *ctx)
if (proc->fn) {
ret = dict_set_static_ptr (dict, "peerinfo", peerinfo);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "failed to set peerinfo");
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
+ "set peerinfo");
goto out;
}
ret = proc->fn (NULL, this, dict);
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Failed to "
+ "send stage request for operation "
+ "'Volume %s' to peer %s",
+ gd_op_list[op], peerinfo->hostname);
continue;
+ }
pending_count++;
}
}
@@ -2057,27 +3438,29 @@ out:
if (dict)
dict_unref (dict);
if (ret) {
- glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, NULL);
+ glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT,
+ &event->txn_id, NULL);
opinfo.op_ret = ret;
}
- gf_log ("glusterd", GF_LOG_INFO, "Sent op req to %d peers",
+ gf_log (this->name, GF_LOG_DEBUG, "Sent stage op request for "
+ "'Volume %s' to %d peers", gd_op_list[op],
opinfo.pending_count);
if (!opinfo.pending_count)
- ret = glusterd_op_sm_inject_all_acc ();
+ ret = glusterd_op_sm_inject_all_acc (&event->txn_id);
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
}
static int32_t
-glusterd_op_start_rb_timer (dict_t *dict)
+glusterd_op_start_rb_timer (dict_t *dict, uuid_t *txn_id)
{
int32_t op = 0;
- struct timeval timeout = {0, };
+ struct timespec timeout = {0, };
glusterd_conf_t *priv = NULL;
int32_t ret = -1;
dict_t *rb_ctx = NULL;
@@ -2093,12 +3476,12 @@ glusterd_op_start_rb_timer (dict_t *dict)
}
if (op != GF_REPLACE_OP_START) {
- ret = glusterd_op_sm_inject_all_acc ();
+ ret = glusterd_op_sm_inject_all_acc (txn_id);
goto out;
}
timeout.tv_sec = 5;
- timeout.tv_usec = 0;
+ timeout.tv_nsec = 0;
rb_ctx = dict_copy (dict, rb_ctx);
@@ -2108,6 +3491,17 @@ glusterd_op_start_rb_timer (dict_t *dict)
ret = -1;
goto out;
}
+
+ ret = dict_set_bin (rb_ctx, "transaction_id",
+ txn_id, sizeof (*txn_id));
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Failed to set transaction id.");
+ goto out;
+ } else
+ gf_log ("", GF_LOG_DEBUG,
+ "transaction_id = %s", uuid_utoa (*txn_id));
+
priv->timer = gf_timer_call_after (THIS->ctx, timeout,
glusterd_do_replace_brick,
(void *) rb_ctx);
@@ -2131,6 +3525,10 @@ glusterd_op_volume_dict_uuid_to_hostname (dict_t *dict, const char *key_fmt,
char *uuid_str = NULL;
uuid_t uuid = {0,};
char *hostname = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (dict);
GF_ASSERT (key_fmt);
@@ -2142,7 +3540,7 @@ glusterd_op_volume_dict_uuid_to_hostname (dict_t *dict, const char *key_fmt,
if (ret)
continue;
- gf_log (THIS->name, GF_LOG_DEBUG, "Got uuid %s",
+ gf_log (this->name, GF_LOG_DEBUG, "Got uuid %s",
uuid_str);
ret = uuid_parse (uuid_str, uuid);
@@ -2154,12 +3552,13 @@ glusterd_op_volume_dict_uuid_to_hostname (dict_t *dict, const char *key_fmt,
hostname = glusterd_uuid_to_hostname (uuid);
if (hostname) {
- gf_log (THIS->name, GF_LOG_DEBUG, "%s -> %s",
+ gf_log (this->name, GF_LOG_DEBUG, "%s -> %s",
uuid_str, hostname);
ret = dict_set_dynstr (dict, key, hostname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
- "Error setting hostname to dict");
+ gf_log (this->name, GF_LOG_ERROR,
+ "Error setting hostname %s to dict",
+ hostname);
GF_FREE (hostname);
goto out;
}
@@ -2167,8 +3566,99 @@ glusterd_op_volume_dict_uuid_to_hostname (dict_t *dict, const char *key_fmt,
}
out:
- gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+static int
+reassign_defrag_status (dict_t *dict, char *key, gf_defrag_status_t *status)
+{
+ int ret = 0;
+
+ if (!*status)
+ return ret;
+
+ switch (*status) {
+ case GF_DEFRAG_STATUS_STARTED:
+ *status = GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED;
+ break;
+
+ case GF_DEFRAG_STATUS_STOPPED:
+ *status = GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED;
+ break;
+
+ case GF_DEFRAG_STATUS_COMPLETE:
+ *status = GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE;
+ break;
+
+ case GF_DEFRAG_STATUS_FAILED:
+ *status = GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED;
+ break;
+ default:
+ break;
+ }
+
+ ret = dict_set_int32(dict, key, *status);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to reset defrag %s in dict", key);
+
+ return ret;
+}
+
+/* Check and reassign the defrag_status enum got from the rebalance process
+ * of all peers so that the rebalance-status CLI command can display if a
+ * full-rebalance or just a fix-layout was carried out.
+ */
+static int
+glusterd_op_check_peer_defrag_status (dict_t *dict, int count)
+{
+ glusterd_volinfo_t *volinfo = NULL;
+ gf_defrag_status_t status = GF_DEFRAG_STATUS_NOT_STARTED;
+ char key[256] = {0,};
+ char *volname = NULL;
+ int ret = -1;
+ int i = 1;
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_WARNING, "Unable to get volume name");
+ goto out;
+ }
+
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_WARNING, FMTSTR_CHECK_VOL_EXISTS,
+ volname);
+ goto out;
+ }
+
+ if (volinfo->rebal.defrag_cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
+ /* Fix layout was not issued; we don't need to reassign
+ the status */
+ ret = 0;
+ goto out;
+ }
+
+ do {
+ memset (key, 0, 256);
+ snprintf (key, 256, "status-%d", i);
+ ret = dict_get_int32 (dict, key, (int32_t *)&status);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "failed to get defrag %s", key);
+ goto out;
+ }
+ ret = reassign_defrag_status (dict, key, &status);
+ if (ret)
+ goto out;
+ i++;
+ } while (i <= count);
+
+ ret = 0;
+out:
return ret;
+
}
/* This function is used to modify the op_ctx dict before sending it back
@@ -2176,7 +3666,7 @@ out:
* hostnames etc.
*/
void
-glusterd_op_modify_op_ctx (glusterd_op_t op)
+glusterd_op_modify_op_ctx (glusterd_op_t op, void *ctx)
{
int ret = -1;
dict_t *op_ctx = NULL;
@@ -2184,10 +3674,18 @@ glusterd_op_modify_op_ctx (glusterd_op_t op)
int other_count = 0;
int count = 0;
uint32_t cmd = GF_CLI_STATUS_NONE;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ if (ctx)
+ op_ctx = ctx;
+ else
+ op_ctx = glusterd_op_get_ctx();
- op_ctx = glusterd_op_get_ctx();
if (!op_ctx) {
- gf_log (THIS->name, GF_LOG_CRITICAL,
+ gf_log (this->name, GF_LOG_CRITICAL,
"Operation context is not present.");
goto out;
}
@@ -2196,13 +3694,13 @@ glusterd_op_modify_op_ctx (glusterd_op_t op)
case GD_OP_STATUS_VOLUME:
ret = dict_get_uint32 (op_ctx, "cmd", &cmd);
if (ret) {
- gf_log (THIS->name, GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"Failed to get status cmd");
goto out;
}
if (!(cmd & GF_CLI_STATUS_NFS || cmd & GF_CLI_STATUS_SHD ||
(cmd & GF_CLI_STATUS_MASK) == GF_CLI_STATUS_NONE)) {
- gf_log (THIS->name, GF_LOG_INFO,
+ gf_log (this->name, GF_LOG_DEBUG,
"op_ctx modification not required for status "
"operation being performed");
goto out;
@@ -2211,25 +3709,57 @@ glusterd_op_modify_op_ctx (glusterd_op_t op)
ret = dict_get_int32 (op_ctx, "brick-index-max",
&brick_index_max);
if (ret) {
- gf_log (THIS->name, GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"Failed to get brick-index-max");
goto out;
}
ret = dict_get_int32 (op_ctx, "other-count", &other_count);
if (ret) {
- gf_log (THIS->name, GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"Failed to get other-count");
goto out;
}
count = brick_index_max + other_count + 1;
+ /* add 'brick%d.peerid' into op_ctx with value of 'brick%d.path'.
+ nfs/sshd like services have this additional uuid */
+ {
+ char key[1024];
+ char *uuid_str = NULL;
+ char *uuid = NULL;
+ int i;
+
+ for (i = brick_index_max + 1; i < count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d.path", i);
+ ret = dict_get_str (op_ctx, key, &uuid_str);
+ if (!ret) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),
+ "brick%d.peerid", i);
+ uuid = gf_strdup (uuid_str);
+ if (!uuid) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unable to create dup of"
+ " uuid_str");
+ continue;
+ }
+ ret = dict_set_dynstr (op_ctx, key,
+ uuid);
+ if (ret != 0) {
+ GF_FREE (uuid);
+ }
+ }
+ }
+ }
+
ret = glusterd_op_volume_dict_uuid_to_hostname (op_ctx,
"brick%d.path",
0, count);
if (ret)
- gf_log (THIS->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_WARNING,
"Failed uuid to hostname conversion");
break;
@@ -2241,7 +3771,7 @@ glusterd_op_modify_op_ctx (glusterd_op_t op)
ret = dict_get_int32 (op_ctx, "count", &count);
if (ret) {
- gf_log (THIS->name, GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"Failed to get brick count");
goto out;
}
@@ -2250,7 +3780,7 @@ glusterd_op_modify_op_ctx (glusterd_op_t op)
"%d-brick",
1, (count + 1));
if (ret)
- gf_log (THIS->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_WARNING,
"Failed uuid to hostname conversion");
break;
@@ -2261,22 +3791,59 @@ glusterd_op_modify_op_ctx (glusterd_op_t op)
case GD_OP_DEFRAG_BRICK_VOLUME:
ret = dict_get_int32 (op_ctx, "count", &count);
if (ret) {
- gf_log (THIS->name, GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"Failed to get count");
goto out;
}
+ /* add 'node-name-%d' into op_ctx with value uuid_str.
+ this will be used to convert to hostname later */
+ {
+ char key[1024];
+ char *uuid_str = NULL;
+ char *uuid = NULL;
+ int i;
+
+ for (i = 1; i <= count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "node-uuid-%d", i);
+ ret = dict_get_str (op_ctx, key, &uuid_str);
+ if (!ret) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key),
+ "node-name-%d", i);
+ uuid = gf_strdup (uuid_str);
+ if (!uuid) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "unable to create dup of"
+ " uuid_str");
+ continue;
+ }
+ ret = dict_set_dynstr (op_ctx, key,
+ uuid);
+ if (ret != 0) {
+ GF_FREE (uuid);
+ }
+ }
+ }
+ }
+
ret = glusterd_op_volume_dict_uuid_to_hostname (op_ctx,
- "node-uuid-%d",
+ "node-name-%d",
1, (count + 1));
if (ret)
- gf_log (THIS->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_WARNING,
"Failed uuid to hostname conversion");
+
+ ret = glusterd_op_check_peer_defrag_status (op_ctx, count);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to reset defrag status for fix-layout");
break;
default:
ret = 0;
- gf_log (THIS->name, GF_LOG_INFO,
+ gf_log (this->name, GF_LOG_DEBUG,
"op_ctx modification not required");
break;
@@ -2284,7 +3851,7 @@ glusterd_op_modify_op_ctx (glusterd_op_t op)
out:
if (ret)
- gf_log (THIS->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_WARNING,
"op_ctx modification failed");
return;
}
@@ -2366,20 +3933,26 @@ glusterd_op_ac_send_commit_op (glusterd_op_sm_event_t *event, void *ctx)
ret = glusterd_op_build_payload (&dict, &op_errstr, NULL);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Building payload failed");
+ gf_log (this->name, GF_LOG_ERROR, LOGSTR_BUILD_PAYLOAD,
+ gd_op_list[op]);
+ if (op_errstr == NULL)
+ gf_asprintf (&op_errstr, OPERRSTR_BUILD_PAYLOAD);
opinfo.op_errstr = op_errstr;
goto out;
}
- glusterd_op_commit_hook (op, op_dict, GD_COMMIT_HOOK_PRE);
ret = glusterd_op_commit_perform (op, dict, &op_errstr, NULL); //rsp_dict invalid for source
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Commit failed");
+ gf_log (this->name, GF_LOG_ERROR, LOGSTR_COMMIT_FAIL,
+ gd_op_list[op], "localhost", (op_errstr) ? ":" : " ",
+ (op_errstr) ? op_errstr : " ");
+ if (op_errstr == NULL)
+ gf_asprintf (&op_errstr, OPERRSTR_COMMIT_FAIL,
+ "localhost");
opinfo.op_errstr = op_errstr;
goto out;
}
- glusterd_op_commit_hook (op, op_dict, GD_COMMIT_HOOK_POST);
list_for_each_entry (peerinfo, &priv->peers, uuid_list) {
GF_ASSERT (peerinfo);
@@ -2395,41 +3968,48 @@ glusterd_op_ac_send_commit_op (glusterd_op_sm_event_t *event, void *ctx)
if (proc->fn) {
ret = dict_set_static_ptr (dict, "peerinfo", peerinfo);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to set peerinfo");
goto out;
}
ret = proc->fn (NULL, this, dict);
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Failed to "
+ "send commit request for operation "
+ "'Volume %s' to peer %s",
+ gd_op_list[op], peerinfo->hostname);
continue;
+ }
pending_count++;
}
}
opinfo.pending_count = pending_count;
- gf_log (THIS->name, GF_LOG_INFO, "Sent op req to %d peers",
- opinfo.pending_count);
+ gf_log (this->name, GF_LOG_DEBUG, "Sent commit op req for 'Volume %s' "
+ "to %d peers", gd_op_list[op], opinfo.pending_count);
out:
if (dict)
dict_unref (dict);
if (ret) {
- glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, NULL);
+ glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT,
+ &event->txn_id, NULL);
opinfo.op_ret = ret;
}
if (!opinfo.pending_count) {
if (op == GD_OP_REPLACE_BRICK) {
- ret = glusterd_op_start_rb_timer (op_dict);
+ ret = glusterd_op_start_rb_timer (op_dict,
+ &event->txn_id);
} else {
- glusterd_op_modify_op_ctx (op);
- ret = glusterd_op_sm_inject_all_acc ();
+ glusterd_op_modify_op_ctx (op, NULL);
+ ret = glusterd_op_sm_inject_all_acc (&event->txn_id);
}
goto err;
}
err:
- gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
@@ -2448,10 +4028,11 @@ glusterd_op_ac_rcvd_stage_op_acc (glusterd_op_sm_event_t *event, void *ctx)
if (opinfo.pending_count > 0)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_STAGE_ACC, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_STAGE_ACC,
+ &event->txn_id, NULL);
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -2469,10 +4050,11 @@ glusterd_op_ac_stage_op_failed (glusterd_op_sm_event_t *event, void *ctx)
if (opinfo.pending_count > 0)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK,
+ &event->txn_id, NULL);
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -2490,10 +4072,11 @@ glusterd_op_ac_commit_op_failed (glusterd_op_sm_event_t *event, void *ctx)
if (opinfo.pending_count > 0)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK,
+ &event->txn_id, NULL);
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -2504,6 +4087,10 @@ glusterd_op_ac_brick_op_failed (glusterd_op_sm_event_t *event, void *ctx)
int ret = 0;
glusterd_op_brick_rsp_ctx_t *ev_ctx = NULL;
gf_boolean_t free_errstr = _gf_false;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (event);
GF_ASSERT (ctx);
@@ -2511,7 +4098,7 @@ glusterd_op_ac_brick_op_failed (glusterd_op_sm_event_t *event, void *ctx)
ret = glusterd_remove_pending_entry (&opinfo.pending_bricks, ev_ctx->pending_node->node);
if (ret) {
- gf_log ("glusterd", GF_LOG_ERROR, "unknown response received ");
+ gf_log (this->name, GF_LOG_ERROR, "unknown response received ");
ret = -1;
free_errstr = _gf_true;
goto out;
@@ -2529,7 +4116,8 @@ glusterd_op_ac_brick_op_failed (glusterd_op_sm_event_t *event, void *ctx)
if (opinfo.brick_pending_count > 0)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, ev_ctx->commit_ctx);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK,
+ &event->txn_id, ev_ctx->commit_ctx);
out:
if (ev_ctx->rsp_dict)
@@ -2537,7 +4125,7 @@ out:
if (free_errstr && ev_ctx->op_errstr)
GF_FREE (ev_ctx->op_errstr);
GF_FREE (ctx);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -2549,7 +4137,10 @@ glusterd_op_ac_rcvd_commit_op_acc (glusterd_op_sm_event_t *event, void *ctx)
int ret = 0;
gf_boolean_t commit_ack_inject = _gf_true;
glusterd_op_t op = GD_OP_NONE;
+ xlator_t *this = NULL;
+ this = THIS;
+ GF_ASSERT (this);
op = glusterd_op_get_op ();
GF_ASSERT (event);
@@ -2562,15 +4153,15 @@ glusterd_op_ac_rcvd_commit_op_acc (glusterd_op_sm_event_t *event, void *ctx)
if (op == GD_OP_REPLACE_BRICK) {
op_ctx = glusterd_op_get_ctx ();
if (!op_ctx) {
- gf_log (THIS->name, GF_LOG_CRITICAL, "Operation "
+ gf_log (this->name, GF_LOG_CRITICAL, "Operation "
"context is not present.");
ret = -1;
goto out;
}
- ret = glusterd_op_start_rb_timer (op_ctx);
+ ret = glusterd_op_start_rb_timer (op_ctx, &event->txn_id);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Couldn't start "
+ gf_log (this->name, GF_LOG_ERROR, "Couldn't start "
"replace-brick operation.");
goto out;
}
@@ -2583,10 +4174,14 @@ glusterd_op_ac_rcvd_commit_op_acc (glusterd_op_sm_event_t *event, void *ctx)
out:
if (commit_ack_inject) {
if (ret)
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT,
+ &event->txn_id,
+ NULL);
else if (!opinfo.pending_count) {
- glusterd_op_modify_op_ctx (op);
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_ACC, NULL);
+ glusterd_op_modify_op_ctx (op, NULL);
+ ret = glusterd_op_sm_inject_event
+ (GD_OP_EVENT_COMMIT_ACC,
+ &event->txn_id, NULL);
}
/*else do nothing*/
}
@@ -2607,9 +4202,10 @@ glusterd_op_ac_rcvd_unlock_acc (glusterd_op_sm_event_t *event, void *ctx)
if (opinfo.pending_count > 0)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACC,
+ &event->txn_id, NULL);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
out:
return ret;
@@ -2641,7 +4237,7 @@ glusterd_op_reset_ctx ()
}
int32_t
-glusterd_op_txn_complete ()
+glusterd_op_txn_complete (uuid_t *txn_id)
{
int32_t ret = -1;
glusterd_conf_t *priv = NULL;
@@ -2651,9 +4247,13 @@ glusterd_op_txn_complete ()
rpcsvc_request_t *req = NULL;
void *ctx = NULL;
char *op_errstr = NULL;
+ char *volname = NULL;
+ xlator_t *this = NULL;
+ this = THIS;
+ GF_ASSERT (this);
- priv = THIS->private;
+ priv = this->private;
GF_ASSERT (priv);
op = glusterd_op_get_op ();
@@ -2670,32 +4270,56 @@ glusterd_op_txn_complete ()
glusterd_op_reset_ctx ();
glusterd_op_clear_errstr ();
- ret = glusterd_unlock (MY_UUID);
-
- /* unlock cant/shouldnt fail here!! */
- if (ret) {
- gf_log ("glusterd", GF_LOG_CRITICAL,
- "Unable to clear local lock, ret: %d", ret);
+ /* Based on the op-version, we release the cluster or mgmt_v3 lock */
+ if (priv->op_version < GD_OP_VERSION_4) {
+ ret = glusterd_unlock (MY_UUID);
+ /* unlock cant/shouldnt fail here!! */
+ if (ret)
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Unable to clear local lock, ret: %d", ret);
+ else
+ gf_log (this->name, GF_LOG_DEBUG, "Cleared local lock");
} else {
- gf_log ("glusterd", GF_LOG_INFO, "Cleared local lock");
+ ret = dict_get_str (ctx, "volname", &volname);
+ if (ret)
+ gf_log ("", GF_LOG_INFO,
+ "No Volume name present. "
+ "Locks have not been held.");
+
+ if (volname) {
+ ret = glusterd_mgmt_v3_unlock (volname, MY_UUID,
+ "vol");
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to release lock for %s",
+ volname);
+ }
}
ret = glusterd_op_send_cli_response (op, op_ret,
op_errno, req, ctx, op_errstr);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Responding to cli failed, ret: %d",
- ret);
+ gf_log (this->name, GF_LOG_ERROR, "Responding to cli failed, "
+ "ret: %d", ret);
//Ignore this error, else state machine blocks
ret = 0;
}
- glusterd_op_free_ctx (op, ctx);
if (op_errstr && (strcmp (op_errstr, "")))
GF_FREE (op_errstr);
- gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
+ if (priv->pending_quorum_action)
+ glusterd_do_quorum_action ();
+
+ /* Clearing the transaction opinfo */
+ ret = glusterd_clear_txn_opinfo (txn_id);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to clear transaction's opinfo");
+
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -2706,9 +4330,9 @@ glusterd_op_ac_unlocked_all (glusterd_op_sm_event_t *event, void *ctx)
GF_ASSERT (event);
- ret = glusterd_op_txn_complete ();
+ ret = glusterd_op_txn_complete (&event->txn_id);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -2722,7 +4346,11 @@ glusterd_op_ac_stage_op (glusterd_op_sm_event_t *event, void *ctx)
dict_t *rsp_dict = NULL;
char *op_errstr = NULL;
dict_t *dict = NULL;
+ xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (ctx);
req_ctx = ctx;
@@ -2731,8 +4359,8 @@ glusterd_op_ac_stage_op (glusterd_op_sm_event_t *event, void *ctx)
rsp_dict = dict_new ();
if (!rsp_dict) {
- gf_log ("", GF_LOG_DEBUG,
- "Out of memory");
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get new dictionary");
return -1;
}
@@ -2740,16 +4368,36 @@ glusterd_op_ac_stage_op (glusterd_op_sm_event_t *event, void *ctx)
rsp_dict);
if (status) {
- gf_log ("", GF_LOG_ERROR, "Validate failed: %d", status);
+ gf_log (this->name, GF_LOG_ERROR, "Stage failed on operation"
+ " 'Volume %s', Status : %d", gd_op_list[req_ctx->op],
+ status);
+ }
+
+ txn_id = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t);
+
+ if (txn_id)
+ uuid_copy (*txn_id, event->txn_id);
+ else {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_bin (rsp_dict, "transaction_id",
+ txn_id, sizeof(*txn_id));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set transaction id.");
+ goto out;
}
ret = glusterd_op_stage_send_resp (req_ctx->req, req_ctx->op,
status, op_errstr, rsp_dict);
+out:
if (op_errstr && (strcmp (op_errstr, "")))
GF_FREE (op_errstr);
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
if (rsp_dict)
dict_unref (rsp_dict);
@@ -2806,7 +4454,11 @@ glusterd_op_ac_commit_op (glusterd_op_sm_event_t *event, void *ctx)
char *op_errstr = NULL;
dict_t *dict = NULL;
dict_t *rsp_dict = NULL;
+ xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (ctx);
req_ctx = ctx;
@@ -2817,7 +4469,6 @@ glusterd_op_ac_commit_op (glusterd_op_sm_event_t *event, void *ctx)
if (NULL == rsp_dict)
return -1;
- glusterd_op_commit_hook (req_ctx->op, dict, GD_COMMIT_HOOK_PRE);
if (GD_OP_CLEARLOCKS_VOLUME == req_ctx->op) {
/*clear locks should be run only on
@@ -2829,25 +4480,39 @@ glusterd_op_ac_commit_op (glusterd_op_sm_event_t *event, void *ctx)
&op_errstr, rsp_dict);
}
- if (status) {
- gf_log (THIS->name, GF_LOG_ERROR, "Commit failed: %d", status);
- } else {
- /* On successful commit */
- glusterd_op_commit_hook (req_ctx->op, dict,
- GD_COMMIT_HOOK_POST);
+ if (status)
+ gf_log (this->name, GF_LOG_ERROR, "Commit of operation "
+ "'Volume %s' failed: %d", gd_op_list[req_ctx->op],
+ status);
+
+ txn_id = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t);
+
+ if (txn_id)
+ uuid_copy (*txn_id, event->txn_id);
+ else {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_bin (rsp_dict, "transaction_id",
+ txn_id, sizeof(*txn_id));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set transaction id.");
+ goto out;
}
ret = glusterd_op_commit_send_resp (req_ctx->req, req_ctx->op,
status, op_errstr, rsp_dict);
- glusterd_op_fini_ctx ();
+out:
if (op_errstr && (strcmp (op_errstr, "")))
GF_FREE (op_errstr);
if (rsp_dict)
dict_unref (rsp_dict);
- gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
}
@@ -2869,13 +4534,12 @@ glusterd_op_ac_send_commit_failed (glusterd_op_sm_event_t *event, void *ctx)
opinfo.op_ret, opinfo.op_errstr,
op_ctx);
- glusterd_op_fini_ctx ();
if (opinfo.op_errstr && (strcmp (opinfo.op_errstr, ""))) {
GF_FREE (opinfo.op_errstr);
opinfo.op_errstr = NULL;
}
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
}
@@ -2906,6 +4570,7 @@ glusterd_op_stage_validate (glusterd_op_t op, dict_t *dict, char **op_errstr,
dict_t *rsp_dict)
{
int ret = -1;
+ xlator_t *this = THIS;
switch (op) {
case GD_OP_CREATE_VOLUME:
@@ -2953,6 +4618,10 @@ glusterd_op_stage_validate (glusterd_op_t op, dict_t *dict, char **op_errstr,
ret = glusterd_op_stage_sync_volume (dict, op_errstr);
break;
+ case GD_OP_GSYNC_CREATE:
+ ret = glusterd_op_stage_gsync_create (dict, op_errstr);
+ break;
+
case GD_OP_GSYNC_SET:
ret = glusterd_op_stage_gsync_set (dict, op_errstr);
break;
@@ -2962,7 +4631,8 @@ glusterd_op_stage_validate (glusterd_op_t op, dict_t *dict, char **op_errstr,
break;
case GD_OP_QUOTA:
- ret = glusterd_op_stage_quota (dict, op_errstr);
+ ret = glusterd_op_stage_quota (dict, op_errstr,
+ rsp_dict);
break;
case GD_OP_STATUS_VOLUME:
@@ -2987,13 +4657,20 @@ glusterd_op_stage_validate (glusterd_op_t op, dict_t *dict, char **op_errstr,
op_errstr);
break;
+ case GD_OP_COPY_FILE:
+ ret = glusterd_op_stage_copy_file (dict, op_errstr);
+ break;
+
+ case GD_OP_SYS_EXEC:
+ ret = glusterd_op_stage_sys_exec (dict, op_errstr);
+ break;
+
default:
- gf_log ("", GF_LOG_ERROR, "Unknown op %d",
- op);
+ gf_log (this->name, GF_LOG_ERROR, "Unknown op %s",
+ gd_op_list[op]);
}
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
-
+ gf_log (this->name, GF_LOG_DEBUG, "OP = %d. Returning %d", op, ret);
return ret;
}
@@ -3003,7 +4680,9 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr,
dict_t *rsp_dict)
{
int ret = -1;
+ xlator_t *this = THIS;
+ glusterd_op_commit_hook (op, dict, GD_COMMIT_HOOK_PRE);
switch (op) {
case GD_OP_CREATE_VOLUME:
ret = glusterd_op_create_volume (dict, op_errstr);
@@ -3049,6 +4728,11 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr,
ret = glusterd_op_sync_volume (dict, op_errstr, rsp_dict);
break;
+ case GD_OP_GSYNC_CREATE:
+ ret = glusterd_op_gsync_create (dict, op_errstr,
+ rsp_dict);
+ break;
+
case GD_OP_GSYNC_SET:
ret = glusterd_op_gsync_set (dict, op_errstr, rsp_dict);
break;
@@ -3059,7 +4743,7 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr,
break;
case GD_OP_QUOTA:
- ret = glusterd_op_quota (dict, op_errstr);
+ ret = glusterd_op_quota (dict, op_errstr, rsp_dict);
break;
case GD_OP_STATUS_VOLUME:
@@ -3080,371 +4764,35 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *dict, char **op_errstr,
break;
case GD_OP_CLEARLOCKS_VOLUME:
- ret = glusterd_op_clearlocks_volume (dict, op_errstr);
+ ret = glusterd_op_clearlocks_volume (dict, op_errstr,
+ rsp_dict);
break;
- default:
- gf_log ("", GF_LOG_ERROR, "Unknown op %d",
- op);
+ case GD_OP_COPY_FILE:
+ ret = glusterd_op_copy_file (dict, op_errstr);
break;
- }
-
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
-
- return ret;
-}
-
-int
-_profile_volume_add_brick_rsp (dict_t *this, char *key, data_t *value,
- void *data)
-{
- char new_key[256] = {0};
- glusterd_pr_brick_rsp_conv_t *rsp_ctx = NULL;
- data_t *new_value = NULL;
-
- rsp_ctx = data;
- new_value = data_copy (value);
- GF_ASSERT (new_value);
- snprintf (new_key, sizeof (new_key), "%d-%s", rsp_ctx->count, key);
- dict_set (rsp_ctx->dict, new_key, new_value);
- return 0;
-}
-
-int
-glusterd_profile_volume_brick_rsp (void *pending_entry,
- dict_t *rsp_dict, dict_t *op_ctx,
- char **op_errstr, gd_node_type type)
-{
- int ret = 0;
- glusterd_pr_brick_rsp_conv_t rsp_ctx = {0};
- int32_t count = 0;
- char brick[PATH_MAX+1024] = {0};
- char key[256] = {0};
- char *full_brick = NULL;
- glusterd_brickinfo_t *brickinfo = NULL;
- xlator_t *this = NULL;
- glusterd_conf_t *priv = NULL;
- GF_ASSERT (rsp_dict);
- GF_ASSERT (op_ctx);
- GF_ASSERT (op_errstr);
- GF_ASSERT (pending_entry);
-
- this = THIS;
- GF_ASSERT (this);
- priv = this->private;
- GF_ASSERT (priv);
-
- ret = dict_get_int32 (op_ctx, "count", &count);
- if (ret) {
- count = 1;
- } else {
- count++;
- }
- snprintf (key, sizeof (key), "%d-brick", count);
- if (type == GD_NODE_BRICK) {
- brickinfo = pending_entry;
- snprintf (brick, sizeof (brick), "%s:%s", brickinfo->hostname,
- brickinfo->path);
- } else if (type == GD_NODE_NFS) {
- snprintf (brick, sizeof (brick), "%s", uuid_utoa (MY_UUID));
- }
- full_brick = gf_strdup (brick);
- GF_ASSERT (full_brick);
- ret = dict_set_dynstr (op_ctx, key, full_brick);
-
- rsp_ctx.count = count;
- rsp_ctx.dict = op_ctx;
- dict_foreach (rsp_dict, _profile_volume_add_brick_rsp, &rsp_ctx);
- dict_del (op_ctx, "count");
- ret = dict_set_int32 (op_ctx, "count", count);
- return ret;
-}
-
-//input-key: <replica-id>:<child-id>-*
-//output-key: <brick-id>-*
-int
-_heal_volume_add_shd_rsp (dict_t *this, char *key, data_t *value, void *data)
-{
- char new_key[256] = {0,};
- char int_str[16] = {0};
- data_t *new_value = NULL;
- char *rxl_end = NULL;
- char *rxl_child_end = NULL;
- glusterd_volinfo_t *volinfo = NULL;
- int rxl_id = 0;
- int rxl_child_id = 0;
- int brick_id = 0;
- int int_len = 0;
- int ret = 0;
- glusterd_heal_rsp_conv_t *rsp_ctx = NULL;
- glusterd_brickinfo_t *brickinfo = NULL;
-
- rsp_ctx = data;
- rxl_end = strchr (key, '-');
- if (!rxl_end)
- goto out;
-
- int_len = strlen (key) - strlen (rxl_end);
- strncpy (int_str, key, int_len);
- int_str[int_len] = '\0';
- ret = gf_string2int (int_str, &rxl_id);
- if (ret)
- goto out;
-
- rxl_child_end = strchr (rxl_end + 1, '-');
- if (!rxl_child_end)
- goto out;
-
- int_len = strlen (rxl_end) - strlen (rxl_child_end) - 1;
- strncpy (int_str, rxl_end + 1, int_len);
- int_str[int_len] = '\0';
- ret = gf_string2int (int_str, &rxl_child_id);
- if (ret)
- goto out;
-
- volinfo = rsp_ctx->volinfo;
- brick_id = rxl_id * volinfo->replica_count + rxl_child_id;
-
- if (!strcmp (rxl_child_end, "-status")) {
- brickinfo = glusterd_get_brickinfo_by_position (volinfo,
- brick_id);
- if (!brickinfo)
- goto out;
- if (!glusterd_is_local_brick (rsp_ctx->this, volinfo,
- brickinfo))
- goto out;
- }
- new_value = data_copy (value);
- snprintf (new_key, sizeof (new_key), "%d%s", brick_id, rxl_child_end);
- dict_set (rsp_ctx->dict, new_key, new_value);
-
-out:
- return 0;
-}
-
-int
-glusterd_heal_volume_brick_rsp (dict_t *req_dict, dict_t *rsp_dict,
- dict_t *op_ctx, char **op_errstr)
-{
- int ret = 0;
- glusterd_heal_rsp_conv_t rsp_ctx = {0};
- char *volname = NULL;
- glusterd_volinfo_t *volinfo = NULL;
-
- GF_ASSERT (rsp_dict);
- GF_ASSERT (op_ctx);
- GF_ASSERT (op_errstr);
-
- ret = dict_get_str (req_dict, "volname", &volname);
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
- goto out;
- }
-
- ret = glusterd_volinfo_find (volname, &volinfo);
-
- if (ret)
- goto out;
-
- rsp_ctx.dict = op_ctx;
- rsp_ctx.volinfo = volinfo;
- rsp_ctx.this = THIS;
- dict_foreach (rsp_dict, _heal_volume_add_shd_rsp, &rsp_ctx);
-
-out:
- return ret;
-}
-
-int
-_status_volume_add_brick_rsp (dict_t *this, char *key, data_t *value,
- void *data)
-{
- char new_key[256] = {0,};
- data_t *new_value = 0;
- glusterd_pr_brick_rsp_conv_t *rsp_ctx = NULL;
-
- rsp_ctx = data;
- new_value = data_copy (value);
- snprintf (new_key, sizeof (new_key), "brick%d.%s", rsp_ctx->count, key);
- dict_set (rsp_ctx->dict, new_key, new_value);
-
- return 0;
-}
-
-int
-glusterd_status_volume_brick_rsp (dict_t *rsp_dict, dict_t *op_ctx,
- char **op_errstr)
-{
- int ret = 0;
- glusterd_pr_brick_rsp_conv_t rsp_ctx = {0};
- int32_t count = 0;
- int index = 0;
-
- GF_ASSERT (rsp_dict);
- GF_ASSERT (op_ctx);
- GF_ASSERT (op_errstr);
-
- ret = dict_get_int32 (op_ctx, "count", &count);
- if (ret) {
- count = 0;
- } else {
- count++;
- }
- ret = dict_get_int32 (rsp_dict, "index", &index);
- if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Couldn't get node index");
- goto out;
- }
- dict_del (rsp_dict, "index");
-
- rsp_ctx.count = index;
- rsp_ctx.dict = op_ctx;
- dict_foreach (rsp_dict, _status_volume_add_brick_rsp, &rsp_ctx);
- ret = dict_set_int32 (op_ctx, "count", count);
-
-out:
- return ret;
-}
-
-int
-glusterd_defrag_volume_node_rsp (dict_t *req_dict, dict_t *rsp_dict,
- dict_t *op_ctx)
-{
- int ret = 0;
- char *volname = NULL;
- glusterd_volinfo_t *volinfo = NULL;
- char key[256] = {0,};
- int32_t i = 0;
- char buf[1024] = {0,};
- char *node_str = NULL;
- glusterd_conf_t *priv = NULL;
-
- priv = THIS->private;
- GF_ASSERT (req_dict);
-
- ret = dict_get_str (req_dict, "volname", &volname);
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
- goto out;
- }
-
- ret = glusterd_volinfo_find (volname, &volinfo);
-
- if (ret)
- goto out;
-
- if (rsp_dict) {
- ret = glusterd_defrag_volume_status_update (volinfo,
- rsp_dict);
- }
+ case GD_OP_SYS_EXEC:
+ ret = glusterd_op_sys_exec (dict, op_errstr, rsp_dict);
+ break;
- if (!op_ctx) {
- dict_copy (rsp_dict, op_ctx);
- goto out;
+ default:
+ gf_log (this->name, GF_LOG_ERROR, "Unknown op %s",
+ gd_op_list[op]);
+ break;
}
- ret = dict_get_int32 (op_ctx, "count", &i);
- i++;
-
- ret = dict_set_int32 (op_ctx, "count", i);
- if (ret)
- gf_log (THIS->name, GF_LOG_ERROR, "Failed to set count");
-
- snprintf (buf, 1024, "%s", uuid_utoa (MY_UUID));
- node_str = gf_strdup (buf);
-
- snprintf (key, 256, "node-uuid-%d",i);
- ret = dict_set_dynstr (op_ctx, key, node_str);
- if (ret)
- gf_log (THIS->name, GF_LOG_ERROR,
- "failed to set node-uuid");
-
- memset (key, 0 , 256);
- snprintf (key, 256, "files-%d", i);
- ret = dict_set_uint64 (op_ctx, key, volinfo->rebalance_files);
- if (ret)
- gf_log (THIS->name, GF_LOG_ERROR,
- "failed to set file count");
-
- memset (key, 0 , 256);
- snprintf (key, 256, "size-%d", i);
- ret = dict_set_uint64 (op_ctx, key, volinfo->rebalance_data);
- if (ret)
- gf_log (THIS->name, GF_LOG_ERROR,
- "failed to set size of xfer");
-
- memset (key, 0 , 256);
- snprintf (key, 256, "lookups-%d", i);
- ret = dict_set_uint64 (op_ctx, key, volinfo->lookedup_files);
- if (ret)
- gf_log (THIS->name, GF_LOG_ERROR,
- "failed to set lookedup file count");
-
- memset (key, 0 , 256);
- snprintf (key, 256, "status-%d", i);
- ret = dict_set_int32 (op_ctx, key, volinfo->defrag_status);
- if (ret)
- gf_log (THIS->name, GF_LOG_ERROR,
- "failed to set status");
-
- memset (key, 0 , 256);
- snprintf (key, 256, "failures-%d", i);
- ret = dict_set_uint64 (op_ctx, key, volinfo->rebalance_failures);
- if (ret)
- gf_log (THIS->name, GF_LOG_ERROR,
- "failed to set failure count");
-
- memset (key, 0, 256);
- snprintf (key, 256, "run-time-%d", i);
- ret = dict_set_double (op_ctx, key, volinfo->rebalance_time);
- if (ret)
- gf_log (THIS->name, GF_LOG_ERROR,
- "failed to set run-time");
+ if (ret == 0)
+ glusterd_op_commit_hook (op, dict, GD_COMMIT_HOOK_POST);
-out:
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-int32_t
-glusterd_handle_node_rsp (glusterd_req_ctx_t *req_ctx, void *pending_entry,
- glusterd_op_t op, dict_t *rsp_dict, dict_t *op_ctx,
- char **op_errstr, gd_node_type type)
-{
- int ret = 0;
-
- GF_ASSERT (op_errstr);
-
- switch (op) {
- case GD_OP_PROFILE_VOLUME:
- ret = glusterd_profile_volume_brick_rsp (pending_entry,
- rsp_dict, op_ctx,
- op_errstr, type);
- break;
- case GD_OP_STATUS_VOLUME:
- ret = glusterd_status_volume_brick_rsp (rsp_dict, op_ctx,
- op_errstr);
- break;
-
- case GD_OP_DEFRAG_BRICK_VOLUME:
- glusterd_defrag_volume_node_rsp (req_ctx->dict,
- rsp_dict, op_ctx);
- break;
-
- case GD_OP_HEAL_VOLUME:
- ret = glusterd_heal_volume_brick_rsp (req_ctx->dict, rsp_dict,
- op_ctx, op_errstr);
- break;
- default:
- break;
- }
-
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
- return ret;
-}
static int
-glusterd_bricks_select_stop_volume (dict_t *dict, char **op_errstr)
+glusterd_bricks_select_stop_volume (dict_t *dict, char **op_errstr,
+ struct list_head *selected)
{
int ret = 0;
int flags = 0;
@@ -3453,15 +4801,17 @@ glusterd_bricks_select_stop_volume (dict_t *dict, char **op_errstr)
glusterd_brickinfo_t *brickinfo = NULL;
glusterd_pending_node_t *pending_node = NULL;
-
ret = glusterd_op_stop_volume_args_get (dict, &volname, &flags);
if (ret)
goto out;
ret = glusterd_volinfo_find (volname, &volinfo);
-
- if (ret)
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, FMTSTR_CHECK_VOL_EXISTS,
+ volname);
+ gf_asprintf (op_errstr, FMTSTR_CHECK_VOL_EXISTS, volname);
goto out;
+ }
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
if (glusterd_is_brick_started (brickinfo)) {
@@ -3473,7 +4823,7 @@ glusterd_bricks_select_stop_volume (dict_t *dict, char **op_errstr)
} else {
pending_node->node = brickinfo;
pending_node->type = GD_NODE_BRICK;
- list_add_tail (&pending_node->list, &opinfo.pending_bricks);
+ list_add_tail (&pending_node->list, selected);
pending_node = NULL;
}
}
@@ -3484,7 +4834,8 @@ out:
}
static int
-glusterd_bricks_select_remove_brick (dict_t *dict, char **op_errstr)
+glusterd_bricks_select_remove_brick (dict_t *dict, char **op_errstr,
+ struct list_head *selected)
{
int ret = -1;
char *volname = NULL;
@@ -3497,6 +4848,8 @@ glusterd_bricks_select_remove_brick (dict_t *dict, char **op_errstr)
glusterd_pending_node_t *pending_node = NULL;
int32_t force = 0;
+
+
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
@@ -3545,7 +4898,7 @@ glusterd_bricks_select_remove_brick (dict_t *dict, char **op_errstr)
} else {
pending_node->node = brickinfo;
pending_node->type = GD_NODE_BRICK;
- list_add_tail (&pending_node->list, &opinfo.pending_bricks);
+ list_add_tail (&pending_node->list, selected);
pending_node = NULL;
}
}
@@ -3557,7 +4910,8 @@ out:
}
static int
-glusterd_bricks_select_profile_volume (dict_t *dict, char **op_errstr)
+glusterd_bricks_select_profile_volume (dict_t *dict, char **op_errstr,
+ struct list_head *selected)
{
int ret = -1;
char *volname = NULL;
@@ -3570,6 +4924,8 @@ glusterd_bricks_select_profile_volume (dict_t *dict, char **op_errstr)
glusterd_pending_node_t *pending_node = NULL;
char *brick = NULL;
+
+
this = THIS;
GF_ASSERT (this);
priv = this->private;
@@ -3606,7 +4962,7 @@ glusterd_bricks_select_profile_volume (dict_t *dict, char **op_errstr)
case GF_CLI_STATS_INFO:
ret = dict_get_str_boolean (dict, "nfs", _gf_false);
if (ret) {
- if (!glusterd_nodesvc_is_running ("nfs")) {
+ if (!glusterd_is_nodesvc_online ("nfs")) {
ret = -1;
gf_log (this->name, GF_LOG_ERROR, "NFS server"
" is not running");
@@ -3620,8 +4976,7 @@ glusterd_bricks_select_profile_volume (dict_t *dict, char **op_errstr)
}
pending_node->node = priv->nfs;
pending_node->type = GD_NODE_NFS;
- list_add_tail (&pending_node->list,
- &opinfo.pending_bricks);
+ list_add_tail (&pending_node->list, selected);
pending_node = NULL;
ret = 0;
@@ -3639,7 +4994,7 @@ glusterd_bricks_select_profile_volume (dict_t *dict, char **op_errstr)
pending_node->node = brickinfo;
pending_node->type = GD_NODE_BRICK;
list_add_tail (&pending_node->list,
- &opinfo.pending_bricks);
+ selected);
pending_node = NULL;
}
}
@@ -3649,7 +5004,7 @@ glusterd_bricks_select_profile_volume (dict_t *dict, char **op_errstr)
case GF_CLI_STATS_TOP:
ret = dict_get_str_boolean (dict, "nfs", _gf_false);
if (ret) {
- if (!glusterd_nodesvc_is_running ("nfs")) {
+ if (!glusterd_is_nodesvc_online ("nfs")) {
ret = -1;
gf_log (this->name, GF_LOG_ERROR, "NFS server"
" is not running");
@@ -3663,8 +5018,7 @@ glusterd_bricks_select_profile_volume (dict_t *dict, char **op_errstr)
}
pending_node->node = priv->nfs;
pending_node->type = GD_NODE_NFS;
- list_add_tail (&pending_node->list,
- &opinfo.pending_bricks);
+ list_add_tail (&pending_node->list, selected);
pending_node = NULL;
ret = 0;
@@ -3690,7 +5044,7 @@ glusterd_bricks_select_profile_volume (dict_t *dict, char **op_errstr)
pending_node->node = brickinfo;
pending_node->type = GD_NODE_BRICK;
list_add_tail (&pending_node->list,
- &opinfo.pending_bricks);
+ selected);
pending_node = NULL;
goto out;
}
@@ -3707,7 +5061,7 @@ glusterd_bricks_select_profile_volume (dict_t *dict, char **op_errstr)
pending_node->node = brickinfo;
pending_node->type = GD_NODE_BRICK;
list_add_tail (&pending_node->list,
- &opinfo.pending_bricks);
+ selected);
pending_node = NULL;
}
}
@@ -3752,24 +5106,95 @@ out:
}
int
+get_replica_index_for_per_replica_cmd (glusterd_volinfo_t *volinfo,
+ dict_t *dict) {
+ int ret = 0;
+ char *hostname = NULL;
+ char *path = NULL;
+ int index = 0;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ int cmd_replica_index = -1;
+ int replica_count = -1;
+
+
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "per-replica-cmd-hostname", &hostname);
+ if (ret)
+ goto out;
+ ret = dict_get_str (dict, "per-replica-cmd-path", &path);
+ if (ret)
+ goto out;
+
+ replica_count = volinfo->replica_count;
+
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ if (uuid_is_null (brickinfo->uuid))
+ (void)glusterd_resolve_brick (brickinfo);
+ if (!strcmp (brickinfo->path, path) &&
+ !strcmp (brickinfo->hostname, hostname)) {
+ cmd_replica_index = index/(replica_count);
+ goto out;
+ }
+ index++;
+ }
+
+
+out:
+ if (ret)
+ cmd_replica_index = -1;
+
+ return cmd_replica_index;
+}
+
+int
_select_rxlators_with_local_bricks (xlator_t *this, glusterd_volinfo_t *volinfo,
- dict_t *dict)
+ dict_t *dict, cli_cmd_type type)
{
glusterd_brickinfo_t *brickinfo = NULL;
glusterd_conf_t *priv = NULL;
- int index = 1;
+ int index = 0;
int rxlator_count = 0;
int replica_count = 0;
gf_boolean_t add = _gf_false;
+ int ret = 0;
+ int cmd_replica_index = -1;
priv = this->private;
replica_count = volinfo->replica_count;
+
+ if (type == PER_REPLICA) {
+
+ cmd_replica_index = get_replica_index_for_per_replica_cmd
+ (volinfo, dict);
+ if (cmd_replica_index == -1) {
+ ret = -1;
+ goto err;
+ }
+ }
+
+ index = 1;
+
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
if (uuid_is_null (brickinfo->uuid))
(void)glusterd_resolve_brick (brickinfo);
- if (!uuid_compare (MY_UUID, brickinfo->uuid))
- add = _gf_true;
+ switch (type) {
+ case ALL_REPLICA:
+ if (!uuid_compare (MY_UUID, brickinfo->uuid))
+ add = _gf_true;
+ break;
+ case PER_REPLICA:
+ if (!uuid_compare (MY_UUID, brickinfo->uuid) &&
+ ((index-1)/replica_count == cmd_replica_index))
+
+ add = _gf_true;
+ break;
+ }
+
if (index % replica_count == 0) {
if (add) {
_add_rxlator_to_dict (dict, volinfo->volname,
@@ -3782,6 +5207,10 @@ _select_rxlators_with_local_bricks (xlator_t *this, glusterd_volinfo_t *volinfo,
index++;
}
+err:
+ if (ret)
+ rxlator_count = -1;
+
return rxlator_count;
}
@@ -3822,8 +5251,135 @@ _select_rxlators_for_full_self_heal (xlator_t *this,
return rxlator_count;
}
+
+static int
+glusterd_bricks_select_snap (dict_t *dict, char **op_errstr,
+ struct list_head *selected)
+{
+ int ret = -1;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+ glusterd_pending_node_t *pending_node = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ char *volname = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ int brick_index = -1;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get"
+ " volname");
+ goto out;
+ }
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret)
+ goto out;
+
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ brick_index++;
+ if (uuid_compare (brickinfo->uuid, MY_UUID) ||
+ !glusterd_is_brick_started (brickinfo)) {
+ continue;
+ }
+ pending_node = GF_CALLOC (1, sizeof (*pending_node),
+ gf_gld_mt_pending_node_t);
+ if (!pending_node) {
+ ret = -1;
+ goto out;
+ }
+ pending_node->node = brickinfo;
+ pending_node->type = GD_NODE_BRICK;
+ pending_node->index = brick_index;
+ list_add_tail (&pending_node->list,
+ selected);
+ pending_node = NULL;
+ }
+
+ ret = 0;
+
+out:
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning ret %d", ret);
+ return ret;
+}
+
+static int
+fill_shd_status_for_local_bricks (dict_t *dict, glusterd_volinfo_t *volinfo,
+ cli_cmd_type type, dict_t *req_dict)
+{
+ glusterd_brickinfo_t *brickinfo = NULL;
+ char msg[1024] = {0,};
+ char key[1024] = {0,};
+ char value[1024] = {0,};
+ int index = 0;
+ int ret = 0;
+ xlator_t *this = NULL;
+ int cmd_replica_index = -1;
+
+ this = THIS;
+ snprintf (msg, sizeof (msg), "self-heal-daemon is not running on");
+
+ if (type == PER_REPLICA) {
+ cmd_replica_index = get_replica_index_for_per_replica_cmd
+ (volinfo, req_dict);
+ if (cmd_replica_index == -1) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Could not find the "
+ "replica index for per replica type command");
+ ret = -1;
+ goto out;
+ }
+ }
+
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ if (uuid_is_null (brickinfo->uuid))
+ (void)glusterd_resolve_brick (brickinfo);
+
+ if (uuid_compare (MY_UUID, brickinfo->uuid)) {
+ index++;
+ continue;
+ }
+
+ if (type == PER_REPLICA) {
+ if (cmd_replica_index != (index/volinfo->replica_count)) {
+ index++;
+ continue;
+ }
+
+ }
+ snprintf (key, sizeof (key), "%d-status",index);
+ snprintf (value, sizeof (value), "%s %s",msg,
+ uuid_utoa(MY_UUID));
+ ret = dict_set_dynstr (dict, key, gf_strdup(value));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to"
+ "set the dictionary for shd status msg");
+ goto out;
+ }
+ snprintf (key, sizeof (key), "%d-shd-status",index);
+ ret = dict_set_str (dict, key, "off");
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to"
+ " set dictionary for shd status msg");
+ goto out;
+ }
+
+ index++;
+ }
+
+out:
+ return ret;
+
+}
+
+
static int
-glusterd_bricks_select_heal_volume (dict_t *dict, char **op_errstr)
+glusterd_bricks_select_heal_volume (dict_t *dict, char **op_errstr,
+ struct list_head *selected,
+ dict_t *rsp_dict)
{
int ret = -1;
char *volname = NULL;
@@ -3863,19 +5419,79 @@ glusterd_bricks_select_heal_volume (dict_t *dict, char **op_errstr)
}
switch (heal_op) {
+ case GF_AFR_OP_INDEX_SUMMARY:
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT:
+ if (!glusterd_is_nodesvc_online ("glustershd")) {
+ if (!rsp_dict) {
+ gf_log (this->name, GF_LOG_ERROR, "Received "
+ "empty ctx.");
+ goto out;
+ }
+
+ ret = fill_shd_status_for_local_bricks (rsp_dict,
+ volinfo,
+ ALL_REPLICA,
+ dict);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Unable to "
+ "fill the shd status for the local "
+ "bricks");
+ goto out;
+
+ }
+ break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ if (!glusterd_is_nodesvc_online ("glustershd")) {
+ if (!rsp_dict) {
+ gf_log (this->name, GF_LOG_ERROR, "Received "
+ "empty ctx.");
+ goto out;
+ }
+ ret = fill_shd_status_for_local_bricks (rsp_dict,
+ volinfo,
+ PER_REPLICA,
+ dict);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Unable to "
+ "fill the shd status for the local"
+ " bricks.");
+ goto out;
+
+ }
+ break;
+ default:
+ break;
+ }
+
+
+ switch (heal_op) {
case GF_AFR_OP_HEAL_FULL:
rxlator_count = _select_rxlators_for_full_self_heal (this,
volinfo,
dict);
break;
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ rxlator_count = _select_rxlators_with_local_bricks (this,
+ volinfo,
+ dict,
+ PER_REPLICA);
+ break;
default:
rxlator_count = _select_rxlators_with_local_bricks (this,
volinfo,
- dict);
+ dict,
+ ALL_REPLICA);
break;
}
if (!rxlator_count)
goto out;
+ if (rxlator_count == -1){
+ gf_log (this->name, GF_LOG_ERROR, "Could not determine the"
+ "translator count");
+ ret = -1;
+ goto out;
+ }
+
ret = dict_set_int32 (dict, "count", rxlator_count);
if (ret)
goto out;
@@ -3888,8 +5504,7 @@ glusterd_bricks_select_heal_volume (dict_t *dict, char **op_errstr)
} else {
pending_node->node = priv->shd;
pending_node->type = GD_NODE_SHD;
- list_add_tail (&pending_node->list,
- &opinfo.pending_bricks);
+ list_add_tail (&pending_node->list, selected);
pending_node = NULL;
}
@@ -3899,9 +5514,9 @@ out:
}
-
static int
-glusterd_bricks_select_rebalance_volume (dict_t *dict, char **op_errstr)
+glusterd_bricks_select_rebalance_volume (dict_t *dict, char **op_errstr,
+ struct list_head *selected)
{
int ret = -1;
char *volname = NULL;
@@ -3913,7 +5528,6 @@ glusterd_bricks_select_rebalance_volume (dict_t *dict, char **op_errstr)
this = THIS;
GF_ASSERT (this);
-
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
gf_log ("glusterd", GF_LOG_ERROR, "volume name get failed");
@@ -3946,11 +5560,9 @@ out:
return ret;
}
-
-
-
static int
-glusterd_bricks_select_status_volume (dict_t *dict, char **op_errstr)
+glusterd_bricks_select_status_volume (dict_t *dict, char **op_errstr,
+ struct list_head *selected)
{
int ret = -1;
int cmd = 0;
@@ -3987,6 +5599,7 @@ glusterd_bricks_select_status_volume (dict_t *dict, char **op_errstr)
case GF_CLI_STATUS_CALLPOOL:
case GF_CLI_STATUS_NFS:
case GF_CLI_STATUS_SHD:
+ case GF_CLI_STATUS_QUOTAD:
break;
default:
goto out;
@@ -4027,11 +5640,11 @@ glusterd_bricks_select_status_volume (dict_t *dict, char **op_errstr)
pending_node->node = brickinfo;
pending_node->type = GD_NODE_BRICK;
pending_node->index = 0;
- list_add_tail (&pending_node->list, &opinfo.pending_bricks);
+ list_add_tail (&pending_node->list, selected);
ret = 0;
} else if ((cmd & GF_CLI_STATUS_NFS) != 0) {
- if (!glusterd_nodesvc_is_running ("nfs")) {
+ if (!glusterd_is_nodesvc_online ("nfs")) {
ret = -1;
gf_log (this->name, GF_LOG_ERROR,
"NFS server is not running");
@@ -4046,11 +5659,11 @@ glusterd_bricks_select_status_volume (dict_t *dict, char **op_errstr)
pending_node->node = priv->nfs;
pending_node->type = GD_NODE_NFS;
pending_node->index = 0;
- list_add_tail (&pending_node->list, &opinfo.pending_bricks);
+ list_add_tail (&pending_node->list, selected);
ret = 0;
} else if ((cmd & GF_CLI_STATUS_SHD) != 0) {
- if (!glusterd_nodesvc_is_running ("glustershd")) {
+ if (!glusterd_is_nodesvc_online ("glustershd")) {
ret = -1;
gf_log (this->name, GF_LOG_ERROR,
"Self-heal daemon is not running");
@@ -4065,7 +5678,26 @@ glusterd_bricks_select_status_volume (dict_t *dict, char **op_errstr)
pending_node->node = priv->shd;
pending_node->type = GD_NODE_SHD;
pending_node->index = 0;
- list_add_tail (&pending_node->list, &opinfo.pending_bricks);
+ list_add_tail (&pending_node->list, selected);
+
+ ret = 0;
+ } else if ((cmd & GF_CLI_STATUS_QUOTAD) != 0) {
+ if (!glusterd_is_nodesvc_online ("quotad")) {
+ gf_log (this->name, GF_LOG_ERROR, "Quotad is not "
+ "running");
+ ret = -1;
+ goto out;
+ }
+ pending_node = GF_CALLOC (1, sizeof (*pending_node),
+ gf_gld_mt_pending_node_t);
+ if (!pending_node) {
+ ret = -1;
+ goto out;
+ }
+ pending_node->node = priv->quotad;
+ pending_node->type = GD_NODE_QUOTAD;
+ pending_node->index = 0;
+ list_add_tail (&pending_node->list, selected);
ret = 0;
} else {
@@ -4086,8 +5718,7 @@ glusterd_bricks_select_status_volume (dict_t *dict, char **op_errstr)
pending_node->node = brickinfo;
pending_node->type = GD_NODE_BRICK;
pending_node->index = brick_index;
- list_add_tail (&pending_node->list,
- &opinfo.pending_bricks);
+ list_add_tail (&pending_node->list, selected);
pending_node = NULL;
}
}
@@ -4120,8 +5751,11 @@ glusterd_op_ac_send_brick_op (glusterd_op_sm_event_t *event, void *ctx)
ret = glusterd_op_build_payload (&req_ctx->dict, &op_errstr,
NULL);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "Building payload failed");
+ gf_log (this->name, GF_LOG_ERROR, LOGSTR_BUILD_PAYLOAD,
+ gd_op_list[op]);
+ if (op_errstr == NULL)
+ gf_asprintf (&op_errstr,
+ OPERRSTR_BUILD_PAYLOAD);
opinfo.op_errstr = op_errstr;
goto out;
}
@@ -4136,11 +5770,12 @@ glusterd_op_ac_send_brick_op (glusterd_op_sm_event_t *event, void *ctx)
if (!opinfo.pending_count && !opinfo.brick_pending_count) {
glusterd_clear_pending_nodes (&opinfo.pending_bricks);
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, req_ctx);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK,
+ &event->txn_id, req_ctx);
}
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
}
@@ -4157,7 +5792,10 @@ glusterd_op_ac_rcvd_brick_op_acc (glusterd_op_sm_event_t *event, void *ctx)
dict_t *op_ctx = NULL;
glusterd_req_ctx_t *req_ctx = NULL;
void *pending_entry = NULL;
+ xlator_t *this = NULL;
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (event);
GF_ASSERT (ctx);
ev_ctx = ctx;
@@ -4173,7 +5811,7 @@ glusterd_op_ac_rcvd_brick_op_acc (glusterd_op_sm_event_t *event, void *ctx)
ret = glusterd_remove_pending_entry (&opinfo.pending_bricks,
pending_entry);
if (ret) {
- gf_log ("glusterd", GF_LOG_ERROR, "unknown response received ");
+ gf_log (this->name, GF_LOG_ERROR, "unknown response received ");
ret = -1;
goto out;
}
@@ -4181,25 +5819,27 @@ glusterd_op_ac_rcvd_brick_op_acc (glusterd_op_sm_event_t *event, void *ctx)
if (opinfo.brick_pending_count > 0)
opinfo.brick_pending_count--;
- glusterd_handle_node_rsp (req_ctx, pending_entry, op, ev_ctx->rsp_dict,
+ glusterd_handle_node_rsp (req_ctx->dict, pending_entry, op, ev_ctx->rsp_dict,
op_ctx, &op_errstr, type);
if (opinfo.brick_pending_count > 0)
goto out;
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, ev_ctx->commit_ctx);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_ALL_ACK, &event->txn_id,
+ ev_ctx->commit_ctx);
out:
if (ev_ctx->rsp_dict)
dict_unref (ev_ctx->rsp_dict);
GF_FREE (ev_ctx);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
int32_t
-glusterd_op_bricks_select (glusterd_op_t op, dict_t *dict, char **op_errstr)
+glusterd_op_bricks_select (glusterd_op_t op, dict_t *dict, char **op_errstr,
+ struct list_head *selected, dict_t *rsp_dict)
{
int ret = 0;
@@ -4210,33 +5850,42 @@ glusterd_op_bricks_select (glusterd_op_t op, dict_t *dict, char **op_errstr)
switch (op) {
case GD_OP_STOP_VOLUME:
- ret = glusterd_bricks_select_stop_volume (dict, op_errstr);
+ ret = glusterd_bricks_select_stop_volume (dict, op_errstr,
+ selected);
break;
case GD_OP_REMOVE_BRICK:
- ret = glusterd_bricks_select_remove_brick (dict, op_errstr);
+ ret = glusterd_bricks_select_remove_brick (dict, op_errstr,
+ selected);
break;
case GD_OP_PROFILE_VOLUME:
- ret = glusterd_bricks_select_profile_volume (dict, op_errstr);
+ ret = glusterd_bricks_select_profile_volume (dict, op_errstr,
+ selected);
break;
case GD_OP_HEAL_VOLUME:
- ret = glusterd_bricks_select_heal_volume (dict, op_errstr);
+ ret = glusterd_bricks_select_heal_volume (dict, op_errstr,
+ selected, rsp_dict);
break;
case GD_OP_STATUS_VOLUME:
- ret = glusterd_bricks_select_status_volume (dict, op_errstr);
+ ret = glusterd_bricks_select_status_volume (dict, op_errstr,
+ selected);
break;
case GD_OP_DEFRAG_BRICK_VOLUME:
- ret = glusterd_bricks_select_rebalance_volume (dict, op_errstr);
+ ret = glusterd_bricks_select_rebalance_volume (dict, op_errstr,
+ selected);
+ break;
+ case GD_OP_SNAP:
+ ret = glusterd_bricks_select_snap (dict, op_errstr, selected);
break;
default:
break;
}
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -4553,7 +6202,7 @@ glusterd_op_sm_new_event (glusterd_op_sm_event_type_t event_type,
int
glusterd_op_sm_inject_event (glusterd_op_sm_event_type_t event_type,
- void *ctx)
+ uuid_t *txn_id, void *ctx)
{
int32_t ret = -1;
glusterd_op_sm_event_t *event = NULL;
@@ -4568,7 +6217,10 @@ glusterd_op_sm_inject_event (glusterd_op_sm_event_type_t event_type,
event->ctx = ctx;
- gf_log ("glusterd", GF_LOG_DEBUG, "Enqueue event: '%s'",
+ if (txn_id)
+ uuid_copy (event->txn_id, *txn_id);
+
+ gf_log (THIS->name, GF_LOG_DEBUG, "Enqueue event: '%s'",
glusterd_op_sm_event_name_get (event->event));
list_add_tail (&event->list, &gd_op_sm_queue);
@@ -4627,9 +6279,14 @@ glusterd_op_sm ()
glusterd_op_sm_ac_fn handler = NULL;
glusterd_op_sm_t *state = NULL;
glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
+ xlator_t *this = NULL;
+ glusterd_op_info_t txn_op_info;
+
+ this = THIS;
+ GF_ASSERT (this);
if ((lock_err = pthread_mutex_trylock (&gd_op_sm_lock))) {
- gf_log (THIS->name, GF_LOG_DEBUG, "lock failed due to %s",
+ gf_log (this->name, GF_LOG_ERROR, "lock failed due to %s",
strerror (lock_err));
goto lock_failed;
}
@@ -4640,9 +6297,24 @@ glusterd_op_sm ()
list_del_init (&event->list);
event_type = event->event;
- gf_log ("", GF_LOG_DEBUG, "Dequeued event of type: '%s'",
+ gf_log (this->name, GF_LOG_DEBUG, "Dequeued event of "
+ "type: '%s'",
glusterd_op_sm_event_name_get(event_type));
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s",
+ uuid_utoa (event->txn_id));
+
+ ret = glusterd_get_txn_opinfo (&event->txn_id,
+ &txn_op_info);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get transaction's opinfo");
+ glusterd_destroy_op_event_ctx (event);
+ GF_FREE (event);
+ continue;
+ } else
+ opinfo = txn_op_info;
+
state = glusterd_op_state_table[opinfo.state.state];
GF_ASSERT (state);
@@ -4653,7 +6325,7 @@ glusterd_op_sm ()
ret = handler (event, event->ctx);
if (ret) {
- gf_log ("glusterd", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"handler returned: %d", ret);
glusterd_destroy_op_event_ctx (event);
GF_FREE (event);
@@ -4664,7 +6336,7 @@ glusterd_op_sm ()
event_type);
if (ret) {
- gf_log ("glusterd", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"Unable to transition"
"state from '%s' to '%s'",
glusterd_op_sm_state_name_get(opinfo.state.state),
@@ -4673,8 +6345,27 @@ glusterd_op_sm ()
return ret;
}
+ if ((state[event_type].next_state ==
+ GD_OP_STATE_DEFAULT) &&
+ (event_type == GD_OP_EVENT_UNLOCK)) {
+ /* Clearing the transaction opinfo */
+ ret = glusterd_clear_txn_opinfo(&event->txn_id);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to clear "
+ "transaction's opinfo");
+ } else {
+ ret = glusterd_set_txn_opinfo (&event->txn_id,
+ &opinfo);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set "
+ "transaction's opinfo");
+ }
+
glusterd_destroy_op_event_ctx (event);
GF_FREE (event);
+
}
}
@@ -4728,48 +6419,6 @@ glusterd_op_clear_op (glusterd_op_t op)
}
int32_t
-glusterd_op_init_ctx (glusterd_op_t op)
-{
- int ret = 0;
- dict_t *dict = NULL;
-
- GF_ASSERT (GD_OP_NONE < op && op < GD_OP_MAX);
-
- if (_gf_false == glusterd_need_brick_op (op)) {
- gf_log ("", GF_LOG_DEBUG, "Received op: %d, returning", op);
- goto out;
- }
- dict = dict_new ();
- if (dict == NULL) {
- ret = -1;
- goto out;
- }
- ret = glusterd_op_set_ctx (dict);
- if (ret)
- goto out;
-out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
- return ret;
-}
-
-
-
-int32_t
-glusterd_op_fini_ctx ()
-{
- dict_t *dict = NULL;
-
- dict = glusterd_op_get_ctx ();
- if (dict)
- dict_unref (dict);
-
- glusterd_op_reset_ctx ();
- return 0;
-}
-
-
-
-int32_t
glusterd_op_free_ctx (glusterd_op_t op, void *ctx)
{
@@ -4823,4 +6472,3 @@ glusterd_op_sm_init ()
pthread_mutex_init (&gd_op_sm_lock, NULL);
return 0;
}
-
diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.h b/xlators/mgmt/glusterd/src/glusterd-op-sm.h
index 558c2d1f7..4a73b08f4 100644
--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.h
+++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.h
@@ -15,8 +15,8 @@
#include "config.h"
#endif
-#ifndef GSYNC_CONF
-#define GSYNC_CONF GEOREP"/gsyncd.conf"
+#ifndef GSYNC_CONF_TEMPLATE
+#define GSYNC_CONF_TEMPLATE GEOREP"/gsyncd_template.conf"
#endif
#include <pthread.h>
@@ -32,6 +32,8 @@
#include "protocol-common.h"
#define GD_VOLUME_NAME_MAX 256
+#define GD_OP_PROTECTED (0x02)
+#define GD_OP_UNPROTECTED (0x04)
typedef enum glusterd_op_sm_state_ {
GD_OP_STATE_DEFAULT = 0,
@@ -75,6 +77,7 @@ struct glusterd_op_sm_event_ {
struct list_head list;
void *ctx;
glusterd_op_sm_event_type_t event;
+ uuid_t txn_id;
};
typedef struct glusterd_op_sm_event_ glusterd_op_sm_event_t;
@@ -117,6 +120,7 @@ typedef struct glusterd_op_log_filename_ctx_ glusterd_op_log_filename_ctx_t;
struct glusterd_op_lock_ctx_ {
uuid_t uuid;
+ dict_t *dict;
rpcsvc_request_t *req;
};
@@ -160,13 +164,29 @@ typedef struct glusterd_status_rsp_conv_ {
typedef struct glusterd_gsync_status_temp {
dict_t *rsp_dict;
glusterd_volinfo_t *volinfo;
+ char *node;
}glusterd_gsync_status_temp_t;
+
+typedef struct gsync_status_param {
+ int is_active;
+ glusterd_volinfo_t *volinfo;
+}gsync_status_param_t;
+
+typedef struct glusterd_txn_opinfo_object_ {
+ glusterd_op_info_t opinfo;
+} glusterd_txn_opinfo_obj;
+
+typedef enum cli_cmd_type_ {
+ PER_REPLICA,
+ ALL_REPLICA,
+ } cli_cmd_type;
+
int
glusterd_op_sm_new_event (glusterd_op_sm_event_type_t event_type,
glusterd_op_sm_event_t **new_event);
int
glusterd_op_sm_inject_event (glusterd_op_sm_event_type_t event_type,
- void *ctx);
+ uuid_t *txn_id, void *ctx);
int
glusterd_op_sm_init ();
@@ -192,7 +212,8 @@ glusterd_op_commit_perform (glusterd_op_t op, dict_t *req, char **op_errstr,
dict_t* dict);
int32_t
-glusterd_op_txn_begin (rpcsvc_request_t *req, glusterd_op_t op, void *ctx);
+glusterd_op_txn_begin (rpcsvc_request_t *req, glusterd_op_t op, void *ctx,
+ char *err_str, size_t err_len);
int32_t
glusterd_op_txn_complete ();
@@ -231,7 +252,8 @@ glusterd_op_sm_state_name_get (int state);
char*
glusterd_op_sm_event_name_get (int event);
int32_t
-glusterd_op_bricks_select (glusterd_op_t op, dict_t *dict, char **op_errstr);
+glusterd_op_bricks_select (glusterd_op_t op, dict_t *dict, char **op_errstr,
+ struct list_head *selected, dict_t *rsp_dict);
int
glusterd_brick_op_build_payload (glusterd_op_t op, glusterd_brickinfo_t *brickinfo,
gd1_mgmt_brick_op_req **req, dict_t *dict);
@@ -246,10 +268,9 @@ glusterd_handle_brick_rsp (void *pending_entry, glusterd_op_t op,
dict_t*
glusterd_op_init_commit_rsp_dict (glusterd_op_t op);
-int32_t
-glusterd_op_init_ctx (glusterd_op_t op);
-int32_t
-glusterd_op_fini_ctx ();
+void
+glusterd_op_modify_op_ctx (glusterd_op_t op, void *op_ctx);
+
int32_t
glusterd_volume_stats_read_perf (char *brick_path, int32_t blk_size,
int32_t blk_count, double *throughput, double *time);
@@ -265,7 +286,7 @@ glusterd_are_all_volumes_stopped ();
int
glusterd_stop_bricks (glusterd_volinfo_t *volinfo);
int
-gsync_status (char *master, char *slave, int *status);
+gsync_status (char *master, char *slave, char *conf_path, int *status);
int
glusterd_check_gsync_running (glusterd_volinfo_t *volinfo, gf_boolean_t *flag);
@@ -273,4 +294,21 @@ glusterd_check_gsync_running (glusterd_volinfo_t *volinfo, gf_boolean_t *flag);
int
glusterd_defrag_volume_node_rsp (dict_t *req_dict, dict_t *rsp_dict,
dict_t *op_ctx);
+#ifdef HAVE_BD_XLATOR
+int
+glusterd_is_valid_vg (glusterd_brickinfo_t *brick, int check_tag, char *msg);
+#endif
+
+int32_t
+glusterd_get_txn_opinfo (uuid_t *txn_id, glusterd_op_info_t *opinfo);
+
+int32_t
+glusterd_set_txn_opinfo (uuid_t *txn_id, glusterd_op_info_t *opinfo);
+
+int32_t
+glusterd_clear_txn_opinfo (uuid_t *txn_id);
+
+int32_t
+glusterd_generate_txn_id (dict_t *dict, uuid_t **txn_id);
+
#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-pmap.c b/xlators/mgmt/glusterd/src/glusterd-pmap.c
index 00e589320..a153ca1a9 100644
--- a/xlators/mgmt/glusterd/src/glusterd-pmap.c
+++ b/xlators/mgmt/glusterd/src/glusterd-pmap.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2010-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2010-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -52,8 +52,8 @@ pmap_port_isfree (int port)
}
-struct pmap_registry *
-pmap_registry_new (void)
+static struct pmap_registry *
+pmap_registry_new (xlator_t *this)
{
struct pmap_registry *pmap = NULL;
int i = 0;
@@ -69,8 +69,8 @@ pmap_registry_new (void)
pmap->ports[i].type = GF_PMAP_PORT_FOREIGN;
}
- pmap->base_port = GF_IANA_PRIV_PORTS_START;
- pmap->last_alloc = GF_IANA_PRIV_PORTS_START;
+ pmap->base_port = pmap->last_alloc =
+ ((glusterd_conf_t *)(this->private))->base_port;
return pmap;
}
@@ -86,7 +86,7 @@ pmap_registry_get (xlator_t *this)
pmap = priv->pmap;
if (!pmap) {
- pmap = pmap_registry_new ();
+ pmap = pmap_registry_new (this);
if (!pmap)
return NULL;
priv->pmap = pmap;
@@ -280,15 +280,17 @@ out:
}
int
-gluster_pmap_portbybrick (rpcsvc_request_t *req)
+__gluster_pmap_portbybrick (rpcsvc_request_t *req)
{
pmap_port_by_brick_req args = {0,};
pmap_port_by_brick_rsp rsp = {0,};
char *brick = NULL;
int port = 0;
+ int ret = -1;
- if (!xdr_to_generic (req->msg[0], &args,
- (xdrproc_t)xdr_pmap_port_by_brick_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_pmap_port_by_brick_req);
+ if (ret < 0) {
req->rpc_err = GARBAGE_ARGS;
goto fail;
}
@@ -312,13 +314,22 @@ fail:
int
-gluster_pmap_brickbyport (rpcsvc_request_t *req)
+gluster_pmap_portbybrick (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __gluster_pmap_portbybrick);
+}
+
+
+int
+__gluster_pmap_brickbyport (rpcsvc_request_t *req)
{
pmap_brick_by_port_req args = {0,};
pmap_brick_by_port_rsp rsp = {0,};
+ int ret = -1;
- if (!xdr_to_generic (req->msg[0], &args,
- (xdrproc_t)xdr_pmap_brick_by_port_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_pmap_brick_by_port_req);
+ if (ret < 0) {
req->rpc_err = GARBAGE_ARGS;
goto fail;
}
@@ -336,6 +347,14 @@ fail:
return 0;
}
+
+int
+gluster_pmap_brickbyport (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __gluster_pmap_brickbyport);
+}
+
+
static int
glusterd_brick_update_signin (glusterd_brickinfo_t *brickinfo,
gf_boolean_t value)
@@ -346,14 +365,16 @@ glusterd_brick_update_signin (glusterd_brickinfo_t *brickinfo,
}
int
-gluster_pmap_signup (rpcsvc_request_t *req)
+__gluster_pmap_signup (rpcsvc_request_t *req)
{
pmap_signup_req args = {0,};
pmap_signup_rsp rsp = {0,};
+ int ret = -1;
- if (!xdr_to_generic (req->msg[0], &args,
- (xdrproc_t)xdr_pmap_signup_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_pmap_signup_req);
+ if (ret < 0) {
req->rpc_err = GARBAGE_ARGS;
goto fail;
}
@@ -370,15 +391,22 @@ fail:
}
int
-gluster_pmap_signin (rpcsvc_request_t *req)
+gluster_pmap_signup (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __gluster_pmap_signup);
+}
+
+int
+__gluster_pmap_signin (rpcsvc_request_t *req)
{
pmap_signin_req args = {0,};
pmap_signin_rsp rsp = {0,};
glusterd_brickinfo_t *brickinfo = NULL;
int ret = -1;
- if (!xdr_to_generic (req->msg[0], &args,
- (xdrproc_t)xdr_pmap_signin_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_pmap_signin_req);
+ if (ret < 0) {
req->rpc_err = GARBAGE_ARGS;
goto fail;
}
@@ -400,17 +428,24 @@ fail:
}
+int
+gluster_pmap_signin (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __gluster_pmap_signin);
+}
+
int
-gluster_pmap_signout (rpcsvc_request_t *req)
+__gluster_pmap_signout (rpcsvc_request_t *req)
{
pmap_signout_req args = {0,};
pmap_signout_rsp rsp = {0,};
int ret = -1;
glusterd_brickinfo_t *brickinfo = NULL;
- if (!xdr_to_generic (req->msg[0], &args,
- (xdrproc_t)xdr_pmap_signout_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_pmap_signout_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto fail;
@@ -432,18 +467,19 @@ fail:
return 0;
}
+int
+gluster_pmap_signout (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __gluster_pmap_signout);
+}
+
rpcsvc_actor_t gluster_pmap_actors[] = {
- [GF_PMAP_NULL] = {"NULL", GF_PMAP_NULL, NULL, NULL, 0},
- [GF_PMAP_PORTBYBRICK] = {"PORTBYBRICK", GF_PMAP_PORTBYBRICK,
- gluster_pmap_portbybrick, NULL, 0},
- [GF_PMAP_BRICKBYPORT] = {"BRICKBYPORT", GF_PMAP_BRICKBYPORT,
- gluster_pmap_brickbyport, NULL, 0},
- [GF_PMAP_SIGNIN] = {"SIGNIN", GF_PMAP_SIGNIN,
- gluster_pmap_signin, NULL, 0},
- [GF_PMAP_SIGNOUT] = {"SIGNOUT", GF_PMAP_SIGNOUT,
- gluster_pmap_signout, NULL, 0},
- [GF_PMAP_SIGNUP] = {"SIGNUP", GF_PMAP_SIGNUP,
- gluster_pmap_signup, NULL, 0},
+ [GF_PMAP_NULL] = {"NULL", GF_PMAP_NULL, NULL, NULL, 0, DRC_NA},
+ [GF_PMAP_PORTBYBRICK] = {"PORTBYBRICK", GF_PMAP_PORTBYBRICK, gluster_pmap_portbybrick, NULL, 0, DRC_NA},
+ [GF_PMAP_BRICKBYPORT] = {"BRICKBYPORT", GF_PMAP_BRICKBYPORT, gluster_pmap_brickbyport, NULL, 0, DRC_NA},
+ [GF_PMAP_SIGNIN] = {"SIGNIN", GF_PMAP_SIGNIN, gluster_pmap_signin, NULL, 0, DRC_NA},
+ [GF_PMAP_SIGNOUT] = {"SIGNOUT", GF_PMAP_SIGNOUT, gluster_pmap_signout, NULL, 0, DRC_NA},
+ [GF_PMAP_SIGNUP] = {"SIGNUP", GF_PMAP_SIGNUP, gluster_pmap_signup, NULL, 0, DRC_NA},
};
diff --git a/xlators/mgmt/glusterd/src/glusterd-quota.c b/xlators/mgmt/glusterd/src/glusterd-quota.c
index 103f88e7a..7f798ad26 100644
--- a/xlators/mgmt/glusterd/src/glusterd-quota.c
+++ b/xlators/mgmt/glusterd/src/glusterd-quota.c
@@ -21,24 +21,54 @@
#include "glusterd-utils.h"
#include "glusterd-volgen.h"
#include "run.h"
+#include "syscall.h"
+#include "byte-order.h"
+#include "compat-errno.h"
#include <sys/wait.h>
+#include <dlfcn.h>
+
+/* Any negative pid to make it special client */
+#define QUOTA_CRAWL_PID "-100"
+
+const char *gd_quota_op_list[GF_QUOTA_OPTION_TYPE_DEFAULT_SOFT_LIMIT+1] = {
+ [GF_QUOTA_OPTION_TYPE_NONE] = "none",
+ [GF_QUOTA_OPTION_TYPE_ENABLE] = "enable",
+ [GF_QUOTA_OPTION_TYPE_DISABLE] = "disable",
+ [GF_QUOTA_OPTION_TYPE_LIMIT_USAGE] = "limit-usage",
+ [GF_QUOTA_OPTION_TYPE_REMOVE] = "remove",
+ [GF_QUOTA_OPTION_TYPE_LIST] = "list",
+ [GF_QUOTA_OPTION_TYPE_VERSION] = "version",
+ [GF_QUOTA_OPTION_TYPE_ALERT_TIME] = "alert-time",
+ [GF_QUOTA_OPTION_TYPE_SOFT_TIMEOUT] = "soft-timeout",
+ [GF_QUOTA_OPTION_TYPE_HARD_TIMEOUT] = "hard-timeout",
+ [GF_QUOTA_OPTION_TYPE_DEFAULT_SOFT_LIMIT] = "default-soft-limit",
+};
int
-glusterd_handle_quota (rpcsvc_request_t *req)
+glusterd_store_quota_config (glusterd_volinfo_t *volinfo, char *path,
+ char *gfid_str, int opcode, char **op_errstr);
+int
+__glusterd_handle_quota (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
dict_t *dict = NULL;
glusterd_op_t cli_op = GD_OP_QUOTA;
- char operation[256] = {0, };
char *volname = NULL;
int32_t type = 0;
+ char msg[2048] = {0,};
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
GF_ASSERT (req);
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
@@ -52,8 +82,10 @@ glusterd_handle_quota (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR, "failed to "
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
"unserialize req-buffer to dictionary");
+ snprintf (msg, sizeof (msg), "Unable to decode the "
+ "command");
goto out;
} else {
dict->extra_stdfree = cli_req.dict.dict_val;
@@ -62,51 +94,49 @@ glusterd_handle_quota (rpcsvc_request_t *req)
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_WARNING, "Unable to get volume name, while"
- "handling quota command");
+ snprintf (msg, sizeof (msg), "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name, "
+ "while handling quota command");
goto out;
}
ret = dict_get_int32 (dict, "type", &type);
if (ret) {
- gf_log ("", GF_LOG_WARNING, "Unable to get type of cmd. , while"
- "handling quota command");
- goto out;
+ snprintf (msg, sizeof (msg), "Unable to get type of command");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get type of cmd, "
+ "while handling quota command");
+ goto out;
}
- switch (type) {
- case GF_QUOTA_OPTION_TYPE_ENABLE:
- strncpy (operation, "enable", sizeof (operation));
- break;
-
- case GF_QUOTA_OPTION_TYPE_DISABLE:
- strncpy (operation, "disable", sizeof (operation));
- break;
-
- case GF_QUOTA_OPTION_TYPE_LIMIT_USAGE:
- strncpy (operation, "limit-usage", sizeof (operation));
- break;
+ if ((conf->op_version == GD_OP_VERSION_MIN) &&
+ (type > GF_QUOTA_OPTION_TYPE_VERSION)) {
+ snprintf (msg, sizeof (msg), "Cannot execute command. The "
+ "cluster is operating at version %d. Quota command %s "
+ "is unavailable in this version", conf->op_version,
+ gd_quota_op_list[type]);
+ ret = -1;
+ goto out;
+ }
- case GF_QUOTA_OPTION_TYPE_REMOVE:
- strncpy (operation, "remove", sizeof (operation));
- break;
- }
- ret = glusterd_op_begin (req, GD_OP_QUOTA, dict);
+ ret = glusterd_op_begin_synctask (req, GD_OP_QUOTA, dict);
out:
- glusterd_friend_sm ();
- glusterd_op_sm ();
-
if (ret) {
+ if (msg[0] == '\0')
+ snprintf (msg, sizeof (msg), "Operation failed");
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
- dict, "operation failed");
- if (dict)
- dict_unref (dict);
+ dict, msg);
}
return ret;
}
+int
+glusterd_handle_quota (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_quota);
+}
+
int32_t
glusterd_check_if_quota_trans_enabled (glusterd_volinfo_t *volinfo)
{
@@ -121,7 +151,6 @@ glusterd_check_if_quota_trans_enabled (glusterd_volinfo_t *volinfo)
}
if (flag == _gf_false) {
- gf_log ("", GF_LOG_ERROR, "first enable the quota translator");
ret = -1;
goto out;
}
@@ -130,107 +159,9 @@ out:
return ret;
}
-/* At the end of the function, the variable found will be set
- * to true if the path to be removed was present in the limit-list,
- * else will be false.
- */
int32_t
-_glusterd_quota_remove_limits (char **quota_limits, char *path,
- gf_boolean_t *found)
-{
- int ret = 0;
- int i = 0;
- int size = 0;
- int len = 0;
- int pathlen = 0;
- int skiplen = 0;
- int flag = 0;
- char *limits = NULL;
- char *qlimits = NULL;
-
- if (found != NULL)
- *found = _gf_false;
-
- if (*quota_limits == NULL)
- return -1;
-
- qlimits = *quota_limits;
-
- pathlen = strlen (path);
-
- len = strlen (qlimits);
-
- limits = GF_CALLOC (len + 1, sizeof (char), gf_gld_mt_char);
- if (!limits)
- return -1;
-
- while (i < len) {
- if (!memcmp ((void *) &qlimits [i], (void *)path, pathlen))
- if (qlimits [i + pathlen] == ':') {
- flag = 1;
- if (found != NULL)
- *found = _gf_true;
- }
-
- while (qlimits [i + size] != ',' &&
- qlimits [i + size] != '\0')
- size++;
-
- if (!flag) {
- memcpy ((void *) &limits [i], (void *) &qlimits [i], size + 1);
- } else {
- skiplen = size + 1;
- size = len - i - size;
- memcpy ((void *) &limits [i], (void *) &qlimits [i + skiplen], size);
- break;
- }
-
- i += size + 1;
- size = 0;
- }
-
- if (!flag) {
- ret = 1;
- } else {
- len = strlen (limits);
-
- if (len == 0) {
- GF_FREE (qlimits);
-
- *quota_limits = NULL;
-
- goto out;
- }
-
- if (limits[len - 1] == ',') {
- limits[len - 1] = '\0';
- len --;
- }
-
- GF_FREE (qlimits);
-
- qlimits = GF_CALLOC (len + 1, sizeof (char), gf_gld_mt_char);
-
- if (!qlimits) {
- ret = -1;
- goto out;
- }
-
- memcpy ((void *) qlimits, (void *) limits, len + 1);
-
- *quota_limits = qlimits;
-
- ret = 0;
- }
-
-out:
- GF_FREE (limits);
-
- return ret;
-}
-
-int32_t
-glusterd_quota_initiate_fs_crawl (glusterd_conf_t *priv, char *volname)
+glusterd_quota_initiate_fs_crawl (glusterd_conf_t *priv, char *volname,
+ int type)
{
pid_t pid;
int32_t ret = 0;
@@ -249,10 +180,14 @@ glusterd_quota_initiate_fs_crawl (glusterd_conf_t *priv, char *volname)
runner_add_args (&runner, SBIN_DIR"/glusterfs",
"-s", "localhost",
"--volfile-id", volname,
+ "--use-readdirp=no",
+ "--client-pid", QUOTA_CRAWL_PID,
"-l", DEFAULT_LOG_FILE_DIRECTORY"/quota-crawl.log",
mountdir, NULL);
+ synclock_unlock (&priv->big_lock);
ret = runner_run_reuse (&runner);
+ synclock_lock (&priv->big_lock);
if (ret == -1) {
runner_log (&runner, "glusterd", GF_LOG_DEBUG, "command failed");
runner_end (&runner);
@@ -279,7 +214,19 @@ glusterd_quota_initiate_fs_crawl (glusterd_conf_t *priv, char *volname)
exit (EXIT_FAILURE);
}
runinit (&runner);
- runner_add_args (&runner, "/usr/bin/find", "find", ".", NULL);
+
+ if (type == GF_QUOTA_OPTION_TYPE_ENABLE)
+
+ runner_add_args (&runner, "/usr/bin/find", "find", ".",
+ NULL);
+
+ else if (type == GF_QUOTA_OPTION_TYPE_DISABLE)
+
+ runner_add_args (&runner, "/usr/bin/find", ".",
+ "-exec", "/usr/bin/setfattr", "-n",
+ VIRTUAL_QUOTA_XATTR_CLEANUP_KEY, "-v",
+ "1", "{}", "\\", ";", NULL);
+
if (runner_start (&runner) == -1)
_exit (EXIT_FAILURE);
@@ -299,115 +246,39 @@ out:
return ret;
}
-char *
-glusterd_quota_get_limit_value (char *quota_limits, char *path)
+int32_t
+glusterd_quota_get_default_soft_limit (glusterd_volinfo_t *volinfo,
+ dict_t *rsp_dict)
{
- int32_t i, j, k, l, len;
- int32_t pat_len, diff;
- char *ret_str = NULL;
-
- len = strlen (quota_limits);
- pat_len = strlen (path);
- i = 0;
- j = 0;
-
- while (i < len) {
- j = i;
- k = 0;
- while (path [k] == quota_limits [j]) {
- j++;
- k++;
- }
-
- l = j;
-
- while (quota_limits [j] != ',' &&
- quota_limits [j] != '\0')
- j++;
-
- if (quota_limits [l] == ':' && pat_len == (l - i)) {
- diff = j - i;
- ret_str = GF_CALLOC (diff + 1, sizeof (char),
- gf_gld_mt_char);
+ int32_t ret = 0;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ char *default_limit = NULL;
+ char *val = NULL;
- strncpy (ret_str, &quota_limits [i], diff);
-
- break;
- }
- i = ++j; //skip ','
- }
-
- return ret_str;
-}
-
-char*
-_glusterd_quota_get_limit_usages (glusterd_volinfo_t *volinfo,
- char *path, char **op_errstr)
-{
- int32_t ret = 0;
- char *quota_limits = NULL;
- char *ret_str = NULL;
+ if (rsp_dict == NULL)
+ return -1;
- if (volinfo == NULL)
- return NULL;
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
- ret = glusterd_volinfo_get (volinfo, VKEY_FEATURES_LIMIT_USAGE,
- &quota_limits);
- if (ret)
- return NULL;
- if (quota_limits == NULL) {
- ret_str = NULL;
- *op_errstr = gf_strdup ("Limit not set on any directory");
- } else if (path == NULL)
- ret_str = gf_strdup (quota_limits);
+ ret = glusterd_volinfo_get (volinfo, "features.default-soft-limit",
+ &default_limit);
+ if (default_limit)
+ val = gf_strdup (default_limit);
else
- ret_str = glusterd_quota_get_limit_value (quota_limits, path);
-
- return ret_str;
-}
-
-int32_t
-glusterd_quota_get_limit_usages (glusterd_conf_t *priv,
- glusterd_volinfo_t *volinfo,
- char *volname,
- dict_t *dict,
- char **op_errstr)
-{
- int32_t i = 0;
- int32_t ret = 0;
- int32_t count = 0;
- char *path = NULL;
- dict_t *ctx = NULL;
- char cmd_str [1024] = {0, };
- char *ret_str = NULL;
+ val = gf_strdup ("80%");
- ctx = glusterd_op_get_ctx ();
- if (ctx == NULL)
- return 0;
-
- ret = dict_get_int32 (dict, "count", &count);
- if (ret < 0)
+ ret = dict_set_dynstr (rsp_dict, "default-soft-limit", val);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set default "
+ "soft-limit into dict");
goto out;
-
- if (count == 0) {
- ret_str = _glusterd_quota_get_limit_usages (volinfo, NULL,
- op_errstr);
- } else {
- i = 0;
- while (count--) {
- snprintf (cmd_str, 1024, "path%d", i++);
-
- ret = dict_get_str (dict, cmd_str, &path);
- if (ret < 0)
- goto out;
-
- ret_str = _glusterd_quota_get_limit_usages (volinfo, path, op_errstr);
- }
}
+ ret = 0;
- if (ret_str) {
- ret = dict_set_dynstr (ctx, "limit_list", ret_str);
- }
out:
return ret;
}
@@ -418,54 +289,79 @@ glusterd_quota_enable (glusterd_volinfo_t *volinfo, char **op_errstr,
{
int32_t ret = -1;
char *quota_status = NULL;
+ xlator_t *this = NULL;
- GF_VALIDATE_OR_GOTO ("glusterd", volinfo, out);
- GF_VALIDATE_OR_GOTO ("glusterd", crawl, out);
- GF_VALIDATE_OR_GOTO ("glusterd", op_errstr, out);
+ this = THIS;
+ GF_ASSERT (this);
+
+ GF_VALIDATE_OR_GOTO (this->name, volinfo, out);
+ GF_VALIDATE_OR_GOTO (this->name, crawl, out);
+ GF_VALIDATE_OR_GOTO (this->name, op_errstr, out);
if (glusterd_is_volume_started (volinfo) == 0) {
*op_errstr = gf_strdup ("Volume is stopped, start volume "
"to enable quota.");
+ ret = -1;
goto out;
}
ret = glusterd_check_if_quota_trans_enabled (volinfo);
if (ret == 0) {
*op_errstr = gf_strdup ("Quota is already enabled");
+ ret = -1;
goto out;
}
quota_status = gf_strdup ("on");
if (!quota_status) {
- gf_log ("", GF_LOG_ERROR, "memory allocation failed");
- *op_errstr = gf_strdup ("Enabling quota has been unsuccessful");
+ gf_log (this->name, GF_LOG_ERROR, "memory allocation failed");
+ ret = -1;
goto out;
}
- ret = dict_set_dynstr (volinfo->dict, VKEY_FEATURES_QUOTA, quota_status);
+ ret = dict_set_dynstr (volinfo->dict, VKEY_FEATURES_QUOTA,
+ quota_status);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "dict set failed");
- *op_errstr = gf_strdup ("Enabling quota has been unsuccessful");
+ gf_log (this->name, GF_LOG_ERROR, "dict set failed");
goto out;
}
- *op_errstr = gf_strdup ("Enabling quota has been successful");
-
*crawl = _gf_true;
+ ret = glusterd_store_quota_config (volinfo, NULL, NULL,
+ GF_QUOTA_OPTION_TYPE_ENABLE,
+ op_errstr);
+
ret = 0;
out:
+ if (ret && op_errstr && !*op_errstr)
+ gf_asprintf (op_errstr, "Enabling quota on volume %s has been "
+ "unsuccessful", volinfo->volname);
return ret;
}
int32_t
-glusterd_quota_disable (glusterd_volinfo_t *volinfo, char **op_errstr)
+glusterd_quota_disable (glusterd_volinfo_t *volinfo, char **op_errstr,
+ gf_boolean_t *crawl)
{
- int32_t ret = -1;
- char *quota_status = NULL, *quota_limits = NULL;
-
- GF_VALIDATE_OR_GOTO ("glusterd", volinfo, out);
- GF_VALIDATE_OR_GOTO ("glusterd", op_errstr, out);
+ int32_t ret = -1;
+ int i = 0;
+ char *quota_status = NULL;
+ char *value = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ char *quota_options[] = {"features.soft-timeout",
+ "features.hard-timeout",
+ "features.alert-time",
+ "features.default-soft-limit", NULL};
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ GF_VALIDATE_OR_GOTO (this->name, volinfo, out);
+ GF_VALIDATE_OR_GOTO (this->name, op_errstr, out);
ret = glusterd_check_if_quota_trans_enabled (volinfo);
if (ret == -1) {
@@ -475,271 +371,768 @@ glusterd_quota_disable (glusterd_volinfo_t *volinfo, char **op_errstr)
quota_status = gf_strdup ("off");
if (!quota_status) {
- gf_log ("", GF_LOG_ERROR, "memory allocation failed");
- *op_errstr = gf_strdup ("Disabling quota has been unsuccessful");
+ gf_log (this->name, GF_LOG_ERROR, "memory allocation failed");
+ ret = -1;
goto out;
}
ret = dict_set_dynstr (volinfo->dict, VKEY_FEATURES_QUOTA, quota_status);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "dict set failed");
- *op_errstr = gf_strdup ("Disabling quota has been unsuccessful");
+ gf_log (this->name, GF_LOG_ERROR, "dict set failed");
goto out;
}
- *op_errstr = gf_strdup ("Disabling quota has been successful");
+ for (i = 0; quota_options [i]; i++) {
+ ret = glusterd_volinfo_get (volinfo, quota_options[i], &value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_INFO, "failed to get option"
+ " %s",
+ quota_options[i]);
+ } else {
+ dict_del (volinfo->dict, quota_options[i]);
+ }
+ }
+
+ //Remove aux mount of the volume on every node in the cluster
+ ret = glusterd_remove_auxiliary_mount (volinfo->volname);
+ if (ret)
+ goto out;
+
+ *crawl = _gf_true;
+
+ (void) glusterd_clean_up_quota_store (volinfo);
+
+ ret = 0;
+out:
+ if (ret && op_errstr && !*op_errstr)
+ gf_asprintf (op_errstr, "Disabling quota on volume %s has been "
+ "unsuccessful", volinfo->volname);
+ return ret;
+}
+
- ret = glusterd_volinfo_get (volinfo, VKEY_FEATURES_LIMIT_USAGE,
- &quota_limits);
+static int
+glusterd_set_quota_limit (char *volname, char *path, char *hard_limit,
+ char *soft_limit, char **op_errstr)
+{
+ int ret = -1;
+ xlator_t *this = NULL;
+ char abspath[PATH_MAX] = {0,};
+ glusterd_conf_t *priv = NULL;
+ double soft_lim = 0;
+
+ typedef struct quota_limits {
+ int64_t hl;
+ int64_t sl;
+ } __attribute__ ((__packed__)) quota_limits_t;
+
+ quota_limits_t existing_limit = {0,};
+ quota_limits_t new_limit = {0,};
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ GLUSTERD_GET_QUOTA_AUX_MOUNT_PATH (abspath, volname, path);
+ ret = gf_lstat_dir (abspath, NULL);
if (ret) {
- gf_log ("", GF_LOG_WARNING, "failed to get the quota limits");
+ gf_asprintf (op_errstr, "Failed to find the directory %s. "
+ "Reason : %s", abspath, strerror (errno));
+ goto out;
+ }
+
+ if (!soft_limit) {
+ ret = sys_lgetxattr (abspath,
+ "trusted.glusterfs.quota.limit-set",
+ (void *)&existing_limit,
+ sizeof (existing_limit));
+ if (ret < 0) {
+ switch (errno) {
+ case ENOATTR:
+ existing_limit.sl = -1;
+ break;
+ default:
+ gf_asprintf (op_errstr, "Failed to get the xattr "
+ "'trusted.glusterfs.quota.limit-set' from "
+ "%s. Reason : %s", abspath,
+ strerror (errno));
+ goto out;
+ }
+ } else {
+ existing_limit.hl = ntoh64 (existing_limit.hl);
+ existing_limit.sl = ntoh64 (existing_limit.sl);
+ }
+ new_limit.sl = existing_limit.sl;
+
} else {
- GF_FREE (quota_limits);
+ ret = gf_string2percent (soft_limit, &soft_lim);
+ if (ret)
+ goto out;
+ new_limit.sl = soft_lim;
}
- dict_del (volinfo->dict, VKEY_FEATURES_LIMIT_USAGE);
+ new_limit.sl = hton64 (new_limit.sl);
+
+ ret = gf_string2bytesize_uint64 (hard_limit, (uint64_t*)&new_limit.hl);
+ if (ret)
+ goto out;
+
+ new_limit.hl = hton64 (new_limit.hl);
+
+ ret = sys_lsetxattr (abspath, "trusted.glusterfs.quota.limit-set",
+ (char *)(void *)&new_limit, sizeof (new_limit), 0);
+ if (ret) {
+ gf_asprintf (op_errstr, "setxattr of "
+ "'trusted.glusterfs.quota.limit-set' failed on %s."
+ " Reason : %s", abspath, strerror (errno));
+ goto out;
+ }
+ ret = 0;
out:
return ret;
}
-int32_t
-glusterd_quota_limit_usage (glusterd_volinfo_t *volinfo, dict_t *dict, char **op_errstr)
+static int
+glusterd_update_quota_conf_version (glusterd_volinfo_t *volinfo)
+{
+ volinfo->quota_conf_version++;
+ return 0;
+}
+
+/*The function glusterd_find_gfid_match () does the following:
+ * Given a buffer of gfids, the number of bytes read and the key gfid that needs
+ * to be found, the function compares 16 bytes at a time from @buf against
+ * @gfid.
+ *
+ * What happens when the match is found:
+ * i. If the function was called as part of 'limit-usage' operation, the call
+ * returns with write_byte_count = bytes_read
+ *ii. If the function as called as part of 'quota remove' operation, @buf
+ * is modified in memory such that the match is deleted from the buffer, and
+ * also @write_byte_count is set to original buf size minus the sixteen bytes
+ * that was deleted as part of 'remove'.
+ *
+ * What happens when the match is not found in the current buffer:
+ * The function returns with write_byte_count = bytes_read, which means to say
+ * that the caller of this function must write the entire buffer to the tmp file
+ * and continue the search.
+ */
+static gf_boolean_t
+glusterd_find_gfid_match (uuid_t gfid, unsigned char *buf, size_t bytes_read,
+ int opcode, size_t *write_byte_count)
{
- int32_t ret = -1;
- char *path = NULL;
- char *limit = NULL;
- char *value = NULL;
- char msg [1024] = {0,};
- char *quota_limits = NULL;
+ int gfid_index = 0;
+ int shift_count = 0;
+ unsigned char tmp_buf[17] = {0,};
+
+ while (gfid_index != bytes_read) {
+ memcpy ((void *)tmp_buf, (void *)&buf[gfid_index], 16);
+ if (!uuid_compare (gfid, tmp_buf)) {
+ if (opcode == GF_QUOTA_OPTION_TYPE_REMOVE) {
+ shift_count = bytes_read - (gfid_index + 16);
+ memmove ((void *)&buf[gfid_index],
+ (void *)&buf[gfid_index+16],
+ shift_count);
+ *write_byte_count = bytes_read - 16;
+ } else {
+ *write_byte_count = bytes_read;
+ }
+ return _gf_true;
+ } else {
+ gfid_index+=16;
+ }
+ }
+ if (gfid_index == bytes_read)
+ *write_byte_count = bytes_read;
- GF_VALIDATE_OR_GOTO ("glusterd", dict, out);
- GF_VALIDATE_OR_GOTO ("glusterd", volinfo, out);
- GF_VALIDATE_OR_GOTO ("glusterd", op_errstr, out);
+ return _gf_false;
+}
- ret = glusterd_check_if_quota_trans_enabled (volinfo);
- if (ret == -1) {
- *op_errstr = gf_strdup ("Quota is disabled, please enable "
- "quota");
+/* The function glusterd_copy_to_tmp_file() reads the "remaining" bytes from
+ * the source fd and writes them to destination fd, at the rate of 128K bytes
+ * of read+write at a time.
+ */
+
+static int
+glusterd_copy_to_tmp_file (int src_fd, int dst_fd)
+{
+ int ret = 0;
+ size_t entry_sz = 131072;
+ ssize_t bytes_read = 0;
+ unsigned char buf[131072] = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ while ((bytes_read = read (src_fd, (void *)&buf, entry_sz)) > 0) {
+ if (bytes_read % 16 != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "quota.conf "
+ "corrupted");
+ ret = -1;
+ goto out;
+ }
+ ret = write (dst_fd, (void *) buf, bytes_read);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "write into quota.conf failed. Reason : %s",
+ strerror (errno));
+ goto out;
+ }
+ }
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int
+glusterd_store_quota_config (glusterd_volinfo_t *volinfo, char *path,
+ char *gfid_str, int opcode, char **op_errstr)
+{
+ int ret = -1;
+ int fd = -1;
+ int conf_fd = -1;
+ size_t entry_sz = 131072;
+ ssize_t bytes_read = 0;
+ size_t bytes_to_write = 0;
+ unsigned char buf[131072] = {0,};
+ uuid_t gfid = {0,};
+ xlator_t *this = NULL;
+ gf_boolean_t found = _gf_false;
+ gf_boolean_t modified = _gf_false;
+ gf_boolean_t is_file_empty = _gf_false;
+ gf_boolean_t is_first_read = _gf_true;
+ glusterd_conf_t *conf = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ glusterd_store_create_quota_conf_sh_on_absence (volinfo);
+
+ fd = gf_store_mkstemp (volinfo->quota_conf_shandle);
+ if (fd < 0) {
+ ret = -1;
goto out;
}
- ret = glusterd_volinfo_get (volinfo, VKEY_FEATURES_LIMIT_USAGE,
- &quota_limits);
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "failed to get the quota limits");
- *op_errstr = gf_strdup ("failed to set limit");
+ conf_fd = open (volinfo->quota_conf_shandle->path, O_RDONLY);
+ if (conf_fd == -1) {
+ ret = -1;
goto out;
}
- ret = dict_get_str (dict, "path", &path);
+ ret = glusterd_store_quota_conf_skip_header (this, conf_fd);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to fetch quota limits" );
- *op_errstr = gf_strdup ("failed to set limit");
goto out;
}
- ret = dict_get_str (dict, "limit", &limit);
+ ret = glusterd_store_quota_conf_stamp_header (this, fd);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to fetch quota limits" );
- *op_errstr = gf_strdup ("failed to set limit");
+ gf_log (this->name, GF_LOG_ERROR, "Failed to add header to tmp "
+ "file.");
goto out;
}
- if (quota_limits) {
- ret = _glusterd_quota_remove_limits (&quota_limits, path, NULL);
- if (ret == -1) {
- gf_log ("", GF_LOG_ERROR, "Unable to allocate memory");
- *op_errstr = gf_strdup ("failed to set limit");
+ /* Just create empty quota.conf file if create */
+ if (GF_QUOTA_OPTION_TYPE_ENABLE == opcode) {
+ modified = _gf_true;
+ goto out;
+ }
+
+ /* Check if gfid_str is given for opts other than ENABLE */
+ if (!gfid_str) {
+ ret = -1;
+ goto out;
+ }
+ uuid_parse (gfid_str, gfid);
+
+ for (;;) {
+ bytes_read = read (conf_fd, (void*)&buf, entry_sz);
+ if (bytes_read <= 0) {
+ /*The flag @is_first_read is TRUE when the loop is
+ * entered, and is set to false if the first read
+ * reads non-zero bytes of data. The flag is used to
+ * detect if quota.conf is an empty file, but for the
+ * header. This is done to log appropriate error message
+ * when 'quota remove' is attempted when there are no
+ * limits set on the given volume.
+ */
+ if (is_first_read)
+ is_file_empty = _gf_true;
+ break;
+ }
+ if ((bytes_read % 16) != 0) {
+ gf_log (this->name, GF_LOG_ERROR, "quota.conf "
+ "corrupted");
+ ret = -1;
goto out;
}
- }
+ found = glusterd_find_gfid_match (gfid, buf, bytes_read, opcode,
+ &bytes_to_write);
- if (quota_limits == NULL) {
- ret = gf_asprintf (&value, "%s:%s", path, limit);
+ ret = write (fd, (void *) buf, bytes_to_write);
if (ret == -1) {
- gf_log ("", GF_LOG_ERROR, "Unable to allocate memory");
- *op_errstr = gf_strdup ("failed to set limit");
+ gf_log (this->name, GF_LOG_ERROR,
+ "write into quota.conf failed. Reason : %s",
+ strerror (errno));
goto out;
}
- } else {
- ret = gf_asprintf (&value, "%s,%s:%s",
- quota_limits, path, limit);
- if (ret == -1) {
- gf_log ("", GF_LOG_ERROR, "Unable to allocate memory");
- *op_errstr = gf_strdup ("failed to set limit");
+
+ /*If the match is found in this iteration, copy the rest of
+ * quota.conf into quota.conf.tmp and break.
+ * Else continue with the search.
+ */
+ if (found) {
+ ret = glusterd_copy_to_tmp_file (conf_fd, fd);
+ if (ret)
+ goto out;
+ break;
+ }
+ is_first_read = _gf_false;
+ }
+
+ switch (opcode) {
+ case GF_QUOTA_OPTION_TYPE_LIMIT_USAGE:
+ if (!found) {
+ ret = write (fd, gfid, 16);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "write into quota.conf failed. "
+ "Reason : %s",
+ strerror (errno));
+ goto out;
+ }
+ modified = _gf_true;
+ }
+ break;
+
+ case GF_QUOTA_OPTION_TYPE_REMOVE:
+ if (is_file_empty) {
+ gf_asprintf (op_errstr, "Cannot remove limit on"
+ " %s. The quota configuration file"
+ " for volume %s is empty.", path,
+ volinfo->volname);
+ ret = -1;
+ goto out;
+ } else {
+ if (!found) {
+ gf_asprintf (op_errstr, "Error. gfid %s"
+ " for path %s not found in"
+ " store", gfid_str, path);
+ ret = -1;
+ goto out;
+ } else {
+ modified = _gf_true;
+ }
+ }
+ break;
+
+ default:
+ ret = 0;
+ break;
+ }
+
+ if (modified)
+ glusterd_update_quota_conf_version (volinfo);
+
+ ret = 0;
+out:
+ if (conf_fd != -1) {
+ close (conf_fd);
+ }
+
+ if (fd != -1) {
+ close (fd);
+ }
+
+ if (ret && (fd > 0)) {
+ gf_store_unlink_tmppath (volinfo->quota_conf_shandle);
+ } else if (!ret) {
+ ret = gf_store_rename_tmppath (volinfo->quota_conf_shandle);
+ if (modified) {
+ ret = glusterd_compute_cksum (volinfo, _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "compute cksum for quota conf file");
+ goto out;
+ }
+
+ ret = glusterd_store_save_quota_version_and_cksum
+ (volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "store quota version and cksum");
+ goto out;
+ }
+ }
+ }
+
+ return ret;
+}
+
+int32_t
+glusterd_quota_limit_usage (glusterd_volinfo_t *volinfo, dict_t *dict,
+ int opcode, char **op_errstr)
+{
+ int32_t ret = -1;
+ char *path = NULL;
+ char *hard_limit = NULL;
+ char *soft_limit = NULL;
+ char *gfid_str = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ GF_VALIDATE_OR_GOTO (this->name, dict, out);
+ GF_VALIDATE_OR_GOTO (this->name, volinfo, out);
+ GF_VALIDATE_OR_GOTO (this->name, op_errstr, out);
+
+ ret = glusterd_check_if_quota_trans_enabled (volinfo);
+ if (ret == -1) {
+ *op_errstr = gf_strdup ("Quota is disabled, please enable "
+ "quota");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "path", &path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to fetch path");
+ goto out;
+ }
+ ret = gf_canonicalize_path (path);
+ if (ret)
+ goto out;
+
+ ret = dict_get_str (dict, "hard-limit", &hard_limit);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to fetch hard limit");
+ goto out;
+ }
+
+ if (dict_get (dict, "soft-limit")) {
+ ret = dict_get_str (dict, "soft-limit", &soft_limit);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to fetch "
+ "soft limit");
goto out;
}
+ }
- GF_FREE (quota_limits);
+ if (is_origin_glusterd (dict)) {
+ ret = glusterd_set_quota_limit (volinfo->volname, path,
+ hard_limit, soft_limit,
+ op_errstr);
+ if (ret)
+ goto out;
}
- quota_limits = value;
+ ret = dict_get_str (dict, "gfid", &gfid_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get gfid of path "
+ "%s", path);
+ goto out;
+ }
+
+ ret = glusterd_store_quota_config (volinfo, path, gfid_str, opcode,
+ op_errstr);
+ if (ret)
+ goto out;
+
+ ret = 0;
+out:
+
+ if (ret && op_errstr && !*op_errstr)
+ gf_asprintf (op_errstr, "Failed to set hard limit on path %s "
+ "for volume %s", path, volinfo->volname);
+ return ret;
+}
- ret = dict_set_str (volinfo->dict, VKEY_FEATURES_LIMIT_USAGE,
- quota_limits);
+static int
+glusterd_remove_quota_limit (char *volname, char *path, char **op_errstr)
+{
+ int ret = -1;
+ xlator_t *this = NULL;
+ char abspath[PATH_MAX] = {0,};
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ GLUSTERD_GET_QUOTA_AUX_MOUNT_PATH (abspath, volname, path);
+ ret = gf_lstat_dir (abspath, NULL);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to set quota limits" );
- *op_errstr = gf_strdup ("failed to set limit");
+ gf_asprintf (op_errstr, "Failed to find the directory %s. "
+ "Reason : %s", abspath, strerror (errno));
goto out;
}
- snprintf (msg, 1024, "limit set on %s", path);
- *op_errstr = gf_strdup (msg);
+ ret = sys_lremovexattr (abspath, "trusted.glusterfs.quota.limit-set");
+ if (ret) {
+ gf_asprintf (op_errstr, "removexattr failed on %s. Reason : %s",
+ abspath, strerror (errno));
+ goto out;
+ }
ret = 0;
+
out:
return ret;
}
int32_t
-glusterd_quota_remove_limits (glusterd_volinfo_t *volinfo, dict_t *dict, char **op_errstr)
+glusterd_quota_remove_limits (glusterd_volinfo_t *volinfo, dict_t *dict,
+ int opcode, char **op_errstr)
{
int32_t ret = -1;
- char str [PATH_MAX + 1024] = {0,};
- char *quota_limits = NULL;
char *path = NULL;
- gf_boolean_t flag = _gf_false;
+ char *gfid_str = NULL;
+ xlator_t *this = NULL;
- GF_VALIDATE_OR_GOTO ("glusterd", dict, out);
- GF_VALIDATE_OR_GOTO ("glusterd", volinfo, out);
- GF_VALIDATE_OR_GOTO ("glusterd", op_errstr, out);
+ this = THIS;
+ GF_ASSERT (this);
+
+ GF_VALIDATE_OR_GOTO (this->name, dict, out);
+ GF_VALIDATE_OR_GOTO (this->name, volinfo, out);
+ GF_VALIDATE_OR_GOTO (this->name, op_errstr, out);
ret = glusterd_check_if_quota_trans_enabled (volinfo);
if (ret == -1) {
- *op_errstr = gf_strdup ("Quota is disabled, please enable quota");
+ *op_errstr = gf_strdup ("Quota is disabled, please enable "
+ "quota");
goto out;
}
- ret = glusterd_volinfo_get (volinfo, VKEY_FEATURES_LIMIT_USAGE,
- &quota_limits);
+ ret = dict_get_str (dict, "path", &path);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "failed to get the quota limits");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to fetch path");
goto out;
}
- ret = dict_get_str (dict, "path", &path);
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to fetch quota limits" );
+ ret = gf_canonicalize_path (path);
+ if (ret)
goto out;
+
+ if (is_origin_glusterd (dict)) {
+ ret = glusterd_remove_quota_limit (volinfo->volname, path,
+ op_errstr);
+ if (ret)
+ goto out;
}
- ret = _glusterd_quota_remove_limits (&quota_limits, path, &flag);
- if (ret == -1) {
- if (flag == _gf_true)
- snprintf (str, sizeof (str), "Removing limit on %s has "
- "been unsuccessful", path);
- else
- snprintf (str, sizeof (str), "%s has no limit set", path);
- *op_errstr = gf_strdup (str);
+ ret = dict_get_str (dict, "gfid", &gfid_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get gfid of path "
+ "%s", path);
goto out;
- } else {
- if (flag == _gf_true)
- snprintf (str, sizeof (str), "Removed quota limit on "
- "%s", path);
- else
- snprintf (str, sizeof (str), "no limit set on %s",
- path);
- *op_errstr = gf_strdup (str);
- }
-
- if (quota_limits) {
- ret = dict_set_str (volinfo->dict, VKEY_FEATURES_LIMIT_USAGE,
- quota_limits);
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to set quota limits" );
- goto out;
- }
- } else {
- dict_del (volinfo->dict, VKEY_FEATURES_LIMIT_USAGE);
}
+ ret = glusterd_store_quota_config (volinfo, path, gfid_str, opcode,
+ op_errstr);
+ if (ret)
+ goto out;
+
+
ret = 0;
out:
return ret;
}
+int
+glusterd_set_quota_option (glusterd_volinfo_t *volinfo, dict_t *dict,
+ char *key, char **op_errstr)
+{
+ int ret = 0;
+ char *value = NULL;
+ xlator_t *this = NULL;
+ char *option = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ ret = glusterd_check_if_quota_trans_enabled (volinfo);
+ if (ret == -1) {
+ gf_asprintf (op_errstr, "Cannot set %s. Quota on volume %s is "
+ "disabled", key, volinfo->volname);
+ return -1;
+ }
+
+ ret = dict_get_str (dict, "value", &value);
+ if(ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Option value absent.");
+ return -1;
+ }
+
+ option = gf_strdup (value);
+ ret = dict_set_dynstr (volinfo->dict, key, option);
+ if(ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set option %s",
+ key);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+glusterd_quotad_op (int opcode)
+{
+ int ret = -1;
+
+ switch (opcode) {
+ case GF_QUOTA_OPTION_TYPE_ENABLE:
+ case GF_QUOTA_OPTION_TYPE_DISABLE:
+
+ if (glusterd_all_volumes_with_quota_stopped ())
+ ret = glusterd_quotad_stop ();
+ else
+ ret = glusterd_check_generate_start_quotad ();
+ break;
+
+ default:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
int
-glusterd_op_quota (dict_t *dict, char **op_errstr)
+glusterd_op_quota (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
{
glusterd_volinfo_t *volinfo = NULL;
int32_t ret = -1;
char *volname = NULL;
- dict_t *ctx = NULL;
int type = -1;
gf_boolean_t start_crawl = _gf_false;
glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
GF_ASSERT (dict);
GF_ASSERT (op_errstr);
- priv = THIS->private;
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name " );
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to allocate memory");
+ gf_asprintf (op_errstr, FMTSTR_CHECK_VOL_EXISTS, volname);
goto out;
}
ret = dict_get_int32 (dict, "type", &type);
- if (type == GF_QUOTA_OPTION_TYPE_ENABLE) {
- ret = glusterd_quota_enable (volinfo, op_errstr, &start_crawl);
- if (ret < 0)
- goto out;
-
- goto create_vol;
+ if ((priv->op_version == GD_OP_VERSION_MIN) &&
+ (type > GF_QUOTA_OPTION_TYPE_VERSION)) {
+ gf_asprintf (op_errstr, "Volume quota failed. The cluster is "
+ "operating at version %d. Quota command"
+ " %s is unavailable in this version.",
+ priv->op_version,
+ gd_quota_op_list[type]);
+ ret = -1;
+ goto out;
}
- if (type == GF_QUOTA_OPTION_TYPE_DISABLE) {
- ret = glusterd_quota_disable (volinfo, op_errstr);
- if (ret < 0)
- goto out;
+ switch (type) {
+ case GF_QUOTA_OPTION_TYPE_ENABLE:
+ ret = glusterd_quota_enable (volinfo, op_errstr,
+ &start_crawl);
+ if (ret < 0)
+ goto out;
+ break;
- goto create_vol;
- }
+ case GF_QUOTA_OPTION_TYPE_DISABLE:
+ ret = glusterd_quota_disable (volinfo, op_errstr,
+ &start_crawl);
+ if (ret < 0)
+ goto out;
+
+ break;
- if (type == GF_QUOTA_OPTION_TYPE_LIMIT_USAGE) {
- ret = glusterd_quota_limit_usage (volinfo, dict, op_errstr);
- if (ret < 0)
+ case GF_QUOTA_OPTION_TYPE_LIMIT_USAGE:
+ ret = glusterd_quota_limit_usage (volinfo, dict, type,
+ op_errstr);
goto out;
- goto create_vol;
- }
+ case GF_QUOTA_OPTION_TYPE_REMOVE:
+ ret = glusterd_quota_remove_limits (volinfo, dict, type,
+ op_errstr);
+ goto out;
- if (type == GF_QUOTA_OPTION_TYPE_REMOVE) {
- ret = glusterd_quota_remove_limits (volinfo, dict, op_errstr);
- if (ret < 0)
+ case GF_QUOTA_OPTION_TYPE_LIST:
+ ret = glusterd_check_if_quota_trans_enabled (volinfo);
+ if (ret == -1) {
+ *op_errstr = gf_strdup ("Cannot list limits, "
+ "quota is disabled");
+ goto out;
+ }
+ ret = glusterd_quota_get_default_soft_limit (volinfo,
+ rsp_dict);
goto out;
- goto create_vol;
- }
+ case GF_QUOTA_OPTION_TYPE_SOFT_TIMEOUT:
+ ret = glusterd_set_quota_option (volinfo, dict,
+ "features.soft-timeout",
+ op_errstr);
+ if (ret)
+ goto out;
+ break;
- if (type == GF_QUOTA_OPTION_TYPE_LIST) {
- ret = glusterd_check_if_quota_trans_enabled (volinfo);
- if (ret == -1) {
- *op_errstr = gf_strdup ("cannot list the limits, "
- "quota is disabled");
- goto out;
- }
+ case GF_QUOTA_OPTION_TYPE_HARD_TIMEOUT:
+ ret = glusterd_set_quota_option (volinfo, dict,
+ "features.hard-timeout",
+ op_errstr);
+ if (ret)
+ goto out;
+ break;
+
+ case GF_QUOTA_OPTION_TYPE_ALERT_TIME:
+ ret = glusterd_set_quota_option (volinfo, dict,
+ "features.alert-time",
+ op_errstr);
+ if (ret)
+ goto out;
+ break;
- ret = glusterd_quota_get_limit_usages (priv, volinfo, volname,
- dict, op_errstr);
+ case GF_QUOTA_OPTION_TYPE_DEFAULT_SOFT_LIMIT:
+ ret = glusterd_set_quota_option (volinfo, dict,
+ "features.default-soft-limit",
+ op_errstr);
+ if (ret)
+ goto out;
+ break;
- goto out;
+ default:
+ gf_asprintf (op_errstr, "Quota command failed. Invalid "
+ "opcode");
+ ret = -1;
+ goto out;
}
-create_vol:
+
+ if (priv->op_version > GD_OP_VERSION_MIN) {
+ ret = glusterd_quotad_op (type);
+ if (ret)
+ goto out;
+ }
+
ret = glusterd_create_volfiles_and_notify_services (volinfo);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to re-create volfile for"
- " 'quota'");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to re-create "
+ "volfiles");
ret = -1;
goto out;
}
@@ -748,82 +1141,311 @@ create_vol:
if (ret)
goto out;
- if (GLUSTERD_STATUS_STARTED == volinfo->status)
- ret = glusterd_check_generate_start_nfs ();
+ if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+ if (priv->op_version == GD_OP_VERSION_MIN)
+ ret = glusterd_check_generate_start_nfs ();
+ }
- ret = 0;
+ if (rsp_dict && start_crawl == _gf_true)
+ glusterd_quota_initiate_fs_crawl (priv, volname, type);
+ ret = 0;
out:
- ctx = glusterd_op_get_ctx ();
- if (ctx && start_crawl == _gf_true)
- glusterd_quota_initiate_fs_crawl (priv, volname);
+ return ret;
+}
+
+/*
+ * glusterd_get_gfid_from_brick() fetches the 'trusted.gfid' attribute of @path
+ * from each brick in the backend and places the same in the rsp_dict with the
+ * keys being gfid0, gfid1, gfid2 and so on. The absence of @path in the backend
+ * is not treated as error.
+ */
+static int
+glusterd_get_gfid_from_brick (dict_t *dict, glusterd_volinfo_t *volinfo,
+ dict_t *rsp_dict, char **op_errstr)
+{
+ int ret = -1;
+ int count = 0;
+ char *path = NULL;
+ char backend_path[PATH_MAX] = {0,};
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ char key[256] = {0,};
+ char *gfid_str = NULL;
+ uuid_t gfid;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = dict_get_str (dict, "path", &path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get path");
+ goto out;
+ }
- if (ctx && *op_errstr) {
- ret = dict_set_dynstr (ctx, "errstr", *op_errstr);
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ ret = glusterd_resolve_brick (brickinfo);
if (ret) {
- GF_FREE (*op_errstr);
- gf_log ("", GF_LOG_DEBUG,
- "failed to set error message in ctx");
+ gf_log (this->name, GF_LOG_ERROR, FMTSTR_RESOLVE_BRICK,
+ brickinfo->hostname, brickinfo->path);
+ goto out;
}
- *op_errstr = NULL;
+
+ if (uuid_compare (brickinfo->uuid, MY_UUID))
+ continue;
+
+ if (brickinfo->vg[0])
+ continue;
+
+ snprintf (backend_path, sizeof (backend_path), "%s%s",
+ brickinfo->path, path);
+
+ ret = gf_lstat_dir (backend_path, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_INFO, "Failed to find "
+ "directory %s. Reason : %s", backend_path,
+ strerror (errno));
+ ret = 0;
+ continue;
+ }
+ ret = sys_lgetxattr (backend_path, GFID_XATTR_KEY, gfid, 16);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_INFO, "Failed to get "
+ "extended attribute %s for directory %s. "
+ "Reason : %s", GFID_XATTR_KEY, backend_path,
+ strerror (errno));
+ ret = 0;
+ continue;
+ }
+ snprintf (key, sizeof (key), "gfid%d", count);
+
+ gfid_str = gf_strdup (uuid_utoa (gfid));
+ if (!gfid_str) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_dynstr (rsp_dict, key, gfid_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to place "
+ "gfid of %s in dict", backend_path);
+ GF_FREE (gfid_str);
+ goto out;
+ }
+ count++;
}
+ ret = dict_set_int32 (rsp_dict, "count", count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set count");
+ goto out;
+ }
+
+ ret = 0;
+out:
return ret;
}
+static int
+_glusterd_validate_quota_opts (dict_t *dict, int type, char **errstr)
+{
+ int ret = -1;
+ xlator_t *this = THIS;
+ void *quota_xl = NULL;
+ volume_opt_list_t opt_list = {{0},};
+ volume_option_t *opt = NULL;
+ char *key = NULL;
+ char *value = NULL;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (this);
+
+ ret = xlator_volopt_dynload ("features/quota", &quota_xl, &opt_list);
+ if (ret)
+ goto out;
+
+ switch (type) {
+ case GF_QUOTA_OPTION_TYPE_SOFT_TIMEOUT:
+ case GF_QUOTA_OPTION_TYPE_HARD_TIMEOUT:
+ case GF_QUOTA_OPTION_TYPE_ALERT_TIME:
+ case GF_QUOTA_OPTION_TYPE_DEFAULT_SOFT_LIMIT:
+ key = (char *)gd_quota_op_list[type];
+ break;
+ default:
+ ret = -1;
+ goto out;
+ }
+
+ opt = xlator_volume_option_get_list (&opt_list, key);
+ if (!opt) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "Unknown option: %s", key);
+ goto out;
+ }
+ ret = dict_get_str (dict, "value", &value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Value not found for key %s",
+ key);
+ goto out;
+ }
+
+ ret = xlator_option_validate (this, key, value, opt, errstr);
+
+out:
+ if (quota_xl) {
+ dlclose (quota_xl);
+ quota_xl = NULL;
+ }
+ return ret;
+}
int
-glusterd_op_stage_quota (dict_t *dict, char **op_errstr)
+glusterd_op_stage_quota (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
{
- int ret = 0;
- char *volname = NULL;
- gf_boolean_t exists = _gf_false;
- int type = 0;
- dict_t *ctx = NULL;
+ int ret = 0;
+ char *volname = NULL;
+ gf_boolean_t exists = _gf_false;
+ int type = 0;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ char *hard_limit_str = NULL;
+ uint64_t hard_limit = 0;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (dict);
GF_ASSERT (op_errstr);
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
exists = glusterd_check_volume_exists (volname);
if (!exists) {
- gf_log ("", GF_LOG_ERROR, "Volume with name: %s "
- "does not exist",
- volname);
- *op_errstr = gf_strdup ("Invalid volume name");
+ gf_asprintf (op_errstr, FMTSTR_CHECK_VOL_EXISTS, volname);
+ ret = -1;
+ goto out;
+ }
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ gf_asprintf (op_errstr, FMTSTR_CHECK_VOL_EXISTS, volname);
+ goto out;
+ }
+
+ if (!glusterd_is_volume_started (volinfo)) {
+ *op_errstr = gf_strdup ("Volume is stopped, start volume "
+ "before executing quota command.");
ret = -1;
goto out;
}
ret = dict_get_int32 (dict, "type", &type);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get 'type' for quota op");
- *op_errstr = gf_strdup ("Volume quota failed, internal error "
- ", unable to get type of operation");
+ *op_errstr = gf_strdup ("Volume quota failed, internal error, "
+ "unable to get type of operation");
+ goto out;
+ }
+
+ if ((!glusterd_is_volume_quota_enabled (volinfo)) &&
+ (type != GF_QUOTA_OPTION_TYPE_ENABLE)) {
+ *op_errstr = gf_strdup ("Quota is disabled, please enable "
+ "quota");
+ ret = -1;
goto out;
}
+ if ((priv->op_version == GD_OP_VERSION_MIN) &&
+ (type > GF_QUOTA_OPTION_TYPE_VERSION)) {
+ gf_asprintf (op_errstr, "Volume quota failed. The cluster is "
+ "operating at version %d. Quota command"
+ " %s is unavailable in this version.",
+ priv->op_version,
+ gd_quota_op_list[type]);
+ ret = -1;
+ goto out;
+ }
+
+ if ((GF_QUOTA_OPTION_TYPE_ENABLE != type) &&
+ (glusterd_check_if_quota_trans_enabled (volinfo) != 0)) {
+ ret = -1;
+ gf_asprintf (op_errstr, "Quota is not enabled on volume %s",
+ volname);
+ goto out;
+ }
- ctx = glusterd_op_get_ctx();
- if (ctx && (type == GF_QUOTA_OPTION_TYPE_ENABLE
- || type == GF_QUOTA_OPTION_TYPE_LIST)) {
+ switch (type) {
+ case GF_QUOTA_OPTION_TYPE_ENABLE:
+ case GF_QUOTA_OPTION_TYPE_LIST:
/* Fuse mount req. only for enable & list-usage options*/
- if (!glusterd_is_fuse_available ()) {
- gf_log ("glusterd", GF_LOG_ERROR, "Unable to open /dev/"
- "fuse (%s), quota command failed",
- strerror (errno));
+ if (is_origin_glusterd (dict) &&
+ !glusterd_is_fuse_available ()) {
*op_errstr = gf_strdup ("Fuse unavailable");
ret = -1;
goto out;
}
+ break;
+
+ case GF_QUOTA_OPTION_TYPE_LIMIT_USAGE:
+ ret = dict_get_str (dict, "hard-limit", &hard_limit_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Faild to get hard-limit from dict");
+ goto out;
+ }
+ ret = gf_string2bytesize_uint64 (hard_limit_str, &hard_limit);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to convert hard-limit string to value");
+ goto out;
+ }
+ if (hard_limit > UINT64_MAX) {
+ ret = -1;
+ ret = gf_asprintf (op_errstr, "Hard-limit %s is greater"
+ " than %"PRId64"bytes. Please set a "
+ "smaller limit.", hard_limit_str,
+ INT64_MAX);
+ gf_log (this->name, GF_LOG_ERROR, "hard-limit %s "
+ "greater than INT64_MAX", hard_limit_str);
+ goto out;
+ }
+ /*The break statement is missing here to allow intentional fall
+ * through of code execution to the next switch case
+ */
+
+ case GF_QUOTA_OPTION_TYPE_REMOVE:
+ ret = glusterd_get_gfid_from_brick (dict, volinfo, rsp_dict,
+ op_errstr);
+ if (ret)
+ goto out;
+ break;
+
+ case GF_QUOTA_OPTION_TYPE_SOFT_TIMEOUT:
+ case GF_QUOTA_OPTION_TYPE_HARD_TIMEOUT:
+ case GF_QUOTA_OPTION_TYPE_ALERT_TIME:
+ case GF_QUOTA_OPTION_TYPE_DEFAULT_SOFT_LIMIT:
+ ret = _glusterd_validate_quota_opts (dict, type, op_errstr);
+ if (ret)
+ goto out;
+ break;
+
+ default:
+ break;
}
-out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ ret = 0;
- return ret;
+ out:
+ if (ret && op_errstr && *op_errstr)
+ gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+
+ return ret;
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-rebalance.c b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
index fa1af7d29..bdedf4c04 100644
--- a/xlators/mgmt/glusterd/src/glusterd-rebalance.c
+++ b/xlators/mgmt/glusterd/src/glusterd-rebalance.c
@@ -42,12 +42,27 @@ glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe);
int
glusterd_defrag_start_validate (glusterd_volinfo_t *volinfo, char *op_errstr,
- size_t len)
+ size_t len, glusterd_op_t op)
{
- int ret = -1;
+ int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ /* Check only if operation is not remove-brick */
+ if ((GD_OP_REMOVE_BRICK != op) &&
+ !gd_is_remove_brick_committed (volinfo)) {
+ gf_log (this->name, GF_LOG_DEBUG, "A remove-brick task on "
+ "volume %s is not yet committed", volinfo->volname);
+ snprintf (op_errstr, len, "A remove-brick task on volume %s is"
+ " not yet committed. Either commit or stop the "
+ "remove-brick task.", volinfo->volname);
+ goto out;
+ }
if (glusterd_is_defrag_on (volinfo)) {
- gf_log ("glusterd", GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"rebalance on volume %s already started",
volinfo->volname);
snprintf (op_errstr, len, "Rebalance on %s is already started",
@@ -57,7 +72,7 @@ glusterd_defrag_start_validate (glusterd_volinfo_t *volinfo, char *op_errstr,
if (glusterd_is_rb_started (volinfo) ||
glusterd_is_rb_paused (volinfo)) {
- gf_log ("glusterd", GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"Rebalance failed as replace brick is in progress on volume %s",
volinfo->volname);
snprintf (op_errstr, len, "Rebalance failed as replace brick is in progress on "
@@ -66,13 +81,14 @@ glusterd_defrag_start_validate (glusterd_volinfo_t *volinfo, char *op_errstr,
}
ret = 0;
out:
- gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
+
int32_t
-glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata,
- rpc_clnt_event_t event, void *data)
+__glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata,
+ rpc_clnt_event_t event, void *data)
{
glusterd_volinfo_t *volinfo = NULL;
glusterd_defrag_info_t *defrag = NULL;
@@ -88,12 +104,12 @@ glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata,
if (!volinfo)
return 0;
- defrag = volinfo->defrag;
+ defrag = volinfo->rebal.defrag;
if (!defrag)
return 0;
if ((event == RPC_CLNT_DISCONNECT) && defrag->connected)
- volinfo->defrag = NULL;
+ volinfo->rebal.defrag = NULL;
GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv);
@@ -110,7 +126,7 @@ glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata,
UNLOCK (&defrag->lock);
gf_log ("", GF_LOG_DEBUG, "%s got RPC_CLNT_CONNECT",
- rpc->conn.trans->name);
+ rpc->conn.name);
break;
}
@@ -125,30 +141,32 @@ glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata,
}
UNLOCK (&defrag->lock);
- if (!glusterd_is_service_running (pidfile, NULL)) {
- if (volinfo->defrag_status ==
- GF_DEFRAG_STATUS_STARTED) {
- volinfo->defrag_status =
- GF_DEFRAG_STATUS_FAILED;
- } else {
- volinfo->defrag_cmd = 0;
+ if (!gf_is_service_running (pidfile, NULL)) {
+ if (volinfo->rebal.defrag_status ==
+ GF_DEFRAG_STATUS_STARTED) {
+ volinfo->rebal.defrag_status =
+ GF_DEFRAG_STATUS_FAILED;
}
}
glusterd_store_perform_node_state_store (volinfo);
if (defrag->rpc) {
- rpc_clnt_unref (defrag->rpc);
+ glusterd_rpc_clnt_unref (priv, defrag->rpc);
defrag->rpc = NULL;
}
if (defrag->cbk_fn)
- defrag->cbk_fn (volinfo, volinfo->defrag_status);
+ defrag->cbk_fn (volinfo,
+ volinfo->rebal.defrag_status);
GF_FREE (defrag);
gf_log ("", GF_LOG_DEBUG, "%s got RPC_CLNT_DISCONNECT",
- rpc->conn.trans->name);
+ rpc->conn.name);
break;
}
+ case RPC_CLNT_DESTROY:
+ glusterd_volinfo_unref (volinfo);
+ break;
default:
gf_log ("", GF_LOG_TRACE,
"got some other RPC event %d", event);
@@ -159,9 +177,18 @@ glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata,
return ret;
}
+int32_t
+glusterd_defrag_notify (struct rpc_clnt *rpc, void *mydata,
+ rpc_clnt_event_t event, void *data)
+{
+ return glusterd_big_locked_notify (rpc, mydata, event,
+ data, __glusterd_defrag_notify);
+}
+
int
glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
- size_t len, int cmd, defrag_cbk_fn_t cbk)
+ size_t len, int cmd, defrag_cbk_fn_t cbk,
+ glusterd_op_t op)
{
int ret = -1;
glusterd_defrag_info_t *defrag = NULL;
@@ -171,34 +198,35 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
char sockfile[PATH_MAX] = {0,};
char pidfile[PATH_MAX] = {0,};
char logfile[PATH_MAX] = {0,};
- dict_t *options = NULL;
-#ifdef DEBUG
char valgrind_logfile[PATH_MAX] = {0,};
-#endif
+
priv = THIS->private;
GF_ASSERT (volinfo);
GF_ASSERT (op_errstr);
- ret = glusterd_defrag_start_validate (volinfo, op_errstr, len);
+ ret = glusterd_defrag_start_validate (volinfo, op_errstr, len, op);
if (ret)
goto out;
- if (!volinfo->defrag)
- volinfo->defrag = GF_CALLOC (1, sizeof (glusterd_defrag_info_t),
- gf_gld_mt_defrag_info);
- if (!volinfo->defrag)
+ if (!volinfo->rebal.defrag)
+ volinfo->rebal.defrag =
+ GF_CALLOC (1, sizeof (*volinfo->rebal.defrag),
+ gf_gld_mt_defrag_info);
+ if (!volinfo->rebal.defrag)
goto out;
- defrag = volinfo->defrag;
+ defrag = volinfo->rebal.defrag;
defrag->cmd = cmd;
+ volinfo->rebal.defrag_cmd = cmd;
+ volinfo->rebal.op = op;
+
LOCK_INIT (&defrag->lock);
- volinfo->defrag_status = GF_DEFRAG_STATUS_STARTED;
+ volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_STARTED;
glusterd_volinfo_reset_defrag_stats (volinfo);
- volinfo->defrag_cmd = cmd;
glusterd_store_perform_node_state_store (volinfo);
GLUSTERD_GET_DEFRAG_DIR (defrag_path, volinfo, priv);
@@ -209,12 +237,12 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
goto out;
}
- GLUSTERD_GET_DEFRAG_SOCK_FILE (sockfile, volinfo, priv);
+ GLUSTERD_GET_DEFRAG_SOCK_FILE (sockfile, volinfo);
GLUSTERD_GET_DEFRAG_PID_FILE (pidfile, volinfo, priv);
snprintf (logfile, PATH_MAX, "%s/%s-rebalance.log",
DEFAULT_LOG_FILE_DIRECTORY, volinfo->volname);
runinit (&runner);
-#ifdef DEBUG
+
if (priv->valgrind) {
snprintf (valgrind_logfile, PATH_MAX,
"%s/valgrind-%s-rebalance.log",
@@ -222,10 +250,10 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
volinfo->volname);
runner_add_args (&runner, "valgrind", "--leak-check=full",
- "--trace-children=yes", NULL);
+ "--trace-children=yes", "--track-origins=yes",
+ NULL);
runner_argprintf (&runner, "--log-file=%s", valgrind_logfile);
}
-#endif
runner_add_args (&runner, SBIN_DIR"/glusterfs",
"-s", "localhost", "--volfile-id", volinfo->volname,
@@ -236,6 +264,8 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
"--xlator-option",
"*replicate*.metadata-self-heal=off",
"--xlator-option", "*replicate*.entry-self-heal=off",
+ "--xlator-option", "*replicate*.readdir-failover=off",
+ "--xlator-option", "*dht.readdir-optimize=on",
NULL);
runner_add_arg (&runner, "--xlator-option");
runner_argprintf ( &runner, "*dht.rebalance-cmd=%d",cmd);
@@ -258,24 +288,10 @@ glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
sleep (5);
- /* Setting frame-timeout to 10mins (600seconds).
- * Unix domain sockets ensures that the connection is reliable. The
- * default timeout of 30mins used for unreliable network connections is
- * too long for unix domain socket connections.
- */
- ret = rpc_clnt_transport_unix_options_build (&options, sockfile, 600);
- if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Unix options build failed");
- goto out;
- }
-
- ret = glusterd_rpc_create (&defrag->rpc, options,
- glusterd_defrag_notify, volinfo);
- if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "RPC create failed");
- goto out;
- }
+ ret = glusterd_rebalance_rpc_create (volinfo, _gf_false);
+ //FIXME: this cbk is passed as NULL in all occurrences. May be
+ //we never needed it.
if (cbk)
defrag->cbk_fn = cbk;
@@ -287,40 +303,71 @@ out:
int
glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo,
- glusterd_conf_t *priv, int cmd)
+ gf_boolean_t reconnect)
{
dict_t *options = NULL;
char sockfile[PATH_MAX] = {0,};
int ret = -1;
- glusterd_defrag_info_t *defrag = NULL;
-
- if (!volinfo->defrag)
- volinfo->defrag = GF_CALLOC (1, sizeof (glusterd_defrag_info_t),
- gf_gld_mt_defrag_info);
- if (!volinfo->defrag)
- goto out;
-
- defrag = volinfo->defrag;
+ glusterd_defrag_info_t *defrag = volinfo->rebal.defrag;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+ struct stat buf = {0,};
- defrag->cmd = cmd;
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
- LOCK_INIT (&defrag->lock);
+ //rebalance process is not started
+ if (!defrag)
+ goto out;
- GLUSTERD_GET_DEFRAG_SOCK_FILE (sockfile, volinfo, priv);
+ //rpc obj for rebalance process already in place.
+ if (defrag->rpc) {
+ ret = 0;
+ goto out;
+ }
+ GLUSTERD_GET_DEFRAG_SOCK_FILE (sockfile, volinfo);
+ /* If reconnecting check if defrag sockfile exists in the new location
+ * in /var/run/ , if it does not try the old location
+ */
+ if (reconnect) {
+ ret = sys_stat (sockfile, &buf);
+ /* TODO: Remove this once we don't need backward compatability
+ * with the older path
+ */
+ if (ret && (errno == ENOENT)) {
+ gf_log (this->name, GF_LOG_WARNING, "Rebalance sockfile "
+ "%s does not exist. Trying old path.",
+ sockfile);
+ GLUSTERD_GET_DEFRAG_SOCK_FILE_OLD (sockfile, volinfo,
+ priv);
+ ret =sys_stat (sockfile, &buf);
+ if (ret && (ENOENT == errno)) {
+ gf_log (this->name, GF_LOG_ERROR, "Rebalance "
+ "sockfile %s does not exist.",
+ sockfile);
+ goto out;
+ }
+ }
+ }
/* Setting frame-timeout to 10mins (600seconds).
* Unix domain sockets ensures that the connection is reliable. The
* default timeout of 30mins used for unreliable network connections is
* too long for unix domain socket connections.
*/
- ret = rpc_clnt_transport_unix_options_build (&options, sockfile, 600);
+ ret = rpc_transport_unix_options_build (&options, sockfile, 600);
if (ret) {
gf_log (THIS->name, GF_LOG_ERROR, "Unix options build failed");
goto out;
}
+ glusterd_volinfo_ref (volinfo);
+ synclock_unlock (&priv->big_lock);
ret = glusterd_rpc_create (&defrag->rpc, options,
glusterd_defrag_notify, volinfo);
+ synclock_lock (&priv->big_lock);
if (ret) {
gf_log (THIS->name, GF_LOG_ERROR, "RPC create failed");
goto out;
@@ -369,26 +416,32 @@ out:
}
int
-glusterd_handle_defrag_volume (rpcsvc_request_t *req)
+__glusterd_handle_defrag_volume (rpcsvc_request_t *req)
{
- int32_t ret = -1;
- gf_cli_req cli_req = {{0,}};
- glusterd_conf_t *priv = NULL;
- dict_t *dict = NULL;
- char *volname = NULL;
- gf_cli_defrag_type cmd = 0;
+ int32_t ret = -1;
+ gf_cli_req cli_req = {{0,}};
+ glusterd_conf_t *priv = NULL;
+ dict_t *dict = NULL;
+ char *volname = NULL;
+ gf_cli_defrag_type cmd = 0;
+ char msg[2048] = {0,};
+ xlator_t *this = NULL;
GF_ASSERT (req);
+ this = THIS;
+ GF_ASSERT (this);
- priv = THIS->private;
+ priv = this->private;
+ GF_ASSERT (priv);
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
}
- if (cli_req.dict.dict_len) {
+
+ if (cli_req.dict.dict_len) {
/* Unserialize the dictionary */
dict = dict_new ();
@@ -396,24 +449,25 @@ glusterd_handle_defrag_volume (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR,
- "failed to "
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
"unserialize req-buffer to dictionary");
+ snprintf (msg, sizeof (msg), "Unable to decode the "
+ "command");
goto out;
}
}
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
- "Failed to get volname");
+ snprintf (msg, sizeof (msg), "Failed to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
ret = dict_get_int32 (dict, "rebalance-command", (int32_t*)&cmd);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
- "Failed to get command");
+ snprintf (msg, sizeof (msg), "Failed to get command");
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
@@ -424,9 +478,10 @@ glusterd_handle_defrag_volume (rpcsvc_request_t *req)
if ((cmd == GF_DEFRAG_CMD_STATUS) ||
(cmd == GF_DEFRAG_CMD_STOP)) {
ret = glusterd_op_begin (req, GD_OP_DEFRAG_BRICK_VOLUME,
- dict);
+ dict, msg, sizeof (msg));
} else
- ret = glusterd_op_begin (req, GD_OP_REBALANCE, dict);
+ ret = glusterd_op_begin (req, GD_OP_REBALANCE, dict,
+ msg, sizeof (msg));
out:
@@ -434,10 +489,11 @@ out:
glusterd_op_sm ();
if (ret) {
- ret = glusterd_op_send_cli_response (GD_OP_REBALANCE, ret, 0, req,
- dict, "operation failed");
- if (dict)
- dict_unref (dict);
+ if (msg[0] == '\0')
+ snprintf (msg, sizeof (msg), "Operation failed");
+ ret = glusterd_op_send_cli_response (GD_OP_REBALANCE, ret, 0,
+ req, dict, msg);
+
}
free (cli_req.dict.dict_val);//malloced by xdr
@@ -445,42 +501,82 @@ out:
return 0;
}
+int
+glusterd_handle_defrag_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, __glusterd_handle_defrag_volume);
+}
+
int
glusterd_op_stage_rebalance (dict_t *dict, char **op_errstr)
{
- char *volname = NULL;
- int ret = 0;
- int32_t cmd = 0;
- char msg[2048] = {0};
- glusterd_volinfo_t *volinfo = NULL;
+ char *volname = NULL;
+ int ret = 0;
+ int32_t cmd = 0;
+ char msg[2048] = {0};
+ glusterd_volinfo_t *volinfo = NULL;
+ char *task_id_str = NULL;
+ dict_t *op_ctx = NULL;
+ xlator_t *this = 0;
+
+ this = THIS;
+ GF_ASSERT (this);
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_DEBUG, "volname not found");
+ gf_log (this->name, GF_LOG_DEBUG, "volname not found");
goto out;
}
+
ret = dict_get_int32 (dict, "rebalance-command", &cmd);
if (ret) {
- gf_log (THIS->name, GF_LOG_DEBUG, "cmd not found");
+ gf_log (this->name, GF_LOG_DEBUG, "cmd not found");
goto out;
}
ret = glusterd_rebalance_cmd_validate (cmd, volname, &volinfo,
msg, sizeof (msg));
if (ret) {
- gf_log (THIS->name, GF_LOG_DEBUG, "failed to validate");
+ gf_log (this->name, GF_LOG_DEBUG, "failed to validate");
goto out;
}
switch (cmd) {
case GF_DEFRAG_CMD_START:
case GF_DEFRAG_CMD_START_LAYOUT_FIX:
case GF_DEFRAG_CMD_START_FORCE:
- ret = glusterd_defrag_start_validate (volinfo,
- msg, sizeof (msg));
+ if (is_origin_glusterd (dict)) {
+ op_ctx = glusterd_op_get_ctx ();
+ if (!op_ctx) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get op_ctx");
+ goto out;
+ }
+
+ ret = glusterd_generate_and_set_task_id
+ (op_ctx, GF_REBALANCE_TID_KEY);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to generate task-id");
+ goto out;
+ }
+ } else {
+ ret = dict_get_str (dict, GF_REBALANCE_TID_KEY,
+ &task_id_str);
+ if (ret) {
+ snprintf (msg, sizeof (msg),
+ "Missing rebalance-id");
+ gf_log (this->name, GF_LOG_WARNING, "%s", msg);
+ ret = 0;
+ }
+ }
+ ret = glusterd_defrag_start_validate (volinfo, msg,
+ sizeof (msg),
+ GD_OP_REBALANCE);
if (ret) {
- gf_log (THIS->name, GF_LOG_DEBUG,
- "start validate failed");
+ gf_log (this->name, GF_LOG_DEBUG,
+ "start validate failed");
goto out;
}
break;
@@ -503,45 +599,104 @@ out:
int
glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
{
- char *volname = NULL;
- int ret = 0;
- int32_t cmd = 0;
- char msg[2048] = {0};
- glusterd_volinfo_t *volinfo = NULL;
- glusterd_conf_t *priv = NULL;
- glusterd_brickinfo_t *brickinfo = NULL;
- glusterd_brickinfo_t *tmp = NULL;
- gf_boolean_t volfile_update = _gf_false;
-
- priv = THIS->private;
+ char *volname = NULL;
+ int ret = 0;
+ int32_t cmd = 0;
+ char msg[2048] = {0};
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_conf_t *priv = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_brickinfo_t *tmp = NULL;
+ gf_boolean_t volfile_update = _gf_false;
+ char *task_id_str = NULL;
+ dict_t *ctx = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_DEBUG, "volname not given");
+ gf_log (this->name, GF_LOG_DEBUG, "volname not given");
goto out;
}
ret = dict_get_int32 (dict, "rebalance-command", &cmd);
if (ret) {
- gf_log (THIS->name, GF_LOG_DEBUG, "command not given");
+ gf_log (this->name, GF_LOG_DEBUG, "command not given");
goto out;
}
+
ret = glusterd_rebalance_cmd_validate (cmd, volname, &volinfo,
msg, sizeof (msg));
if (ret) {
- gf_log (THIS->name, GF_LOG_DEBUG, "cmd validate failed");
+ gf_log (this->name, GF_LOG_DEBUG, "cmd validate failed");
goto out;
}
+ /* Set task-id, if available, in op_ctx dict for operations other than
+ * start
+ */
+ if (cmd == GF_DEFRAG_CMD_STATUS || cmd == GF_DEFRAG_CMD_STOP) {
+ if (!uuid_is_null (volinfo->rebal.rebalance_id)) {
+ ctx = glusterd_op_get_ctx ();
+ if (!ctx) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get op_ctx");
+ ret = -1;
+ goto out;
+ }
+
+ if (GD_OP_REMOVE_BRICK == volinfo->rebal.op)
+ ret = glusterd_copy_uuid_to_dict
+ (volinfo->rebal.rebalance_id, ctx,
+ GF_REMOVE_BRICK_TID_KEY);
+ else
+ ret = glusterd_copy_uuid_to_dict
+ (volinfo->rebal.rebalance_id, ctx,
+ GF_REBALANCE_TID_KEY);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set task-id");
+ goto out;
+ }
+ }
+ }
+
switch (cmd) {
case GF_DEFRAG_CMD_START:
case GF_DEFRAG_CMD_START_LAYOUT_FIX:
case GF_DEFRAG_CMD_START_FORCE:
+ /* Reset defrag status to 'NOT STARTED' whenever a
+ * remove-brick/rebalance command is issued to remove
+ * stale information from previous run.
+ */
+ volinfo->rebal.defrag_status = GF_DEFRAG_STATUS_NOT_STARTED;
+
+ ret = dict_get_str (dict, GF_REBALANCE_TID_KEY, &task_id_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Missing rebalance "
+ "id");
+ ret = 0;
+ } else {
+ uuid_parse (task_id_str, volinfo->rebal.rebalance_id) ;
+ volinfo->rebal.op = GD_OP_REBALANCE;
+ }
+ if (!gd_should_i_start_rebalance (volinfo))
+ break;
ret = glusterd_handle_defrag_start (volinfo, msg, sizeof (msg),
- cmd, NULL);
- break;
+ cmd, NULL, GD_OP_REBALANCE);
+ break;
case GF_DEFRAG_CMD_STOP:
+ /* Clear task-id only on explicitly stopping rebalance.
+ * Also clear the stored operation, so it doesn't cause trouble
+ * with future rebalance/remove-brick starts
+ */
+ uuid_clear (volinfo->rebal.rebalance_id);
+ volinfo->rebal.op = GD_OP_NONE;
+
/* Fall back to the old volume file in case of decommission*/
list_for_each_entry_safe (brickinfo, tmp, &volinfo->bricks,
brick_list) {
@@ -558,7 +713,7 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
ret = glusterd_create_volfiles_and_notify_services (volinfo);
if (ret) {
- gf_log (THIS->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_WARNING,
"failed to create volfiles");
goto out;
}
@@ -566,7 +721,7 @@ glusterd_op_rebalance (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
ret = glusterd_store_volinfo (volinfo,
GLUSTERD_VOLINFO_VER_AC_INCREMENT);
if (ret) {
- gf_log (THIS->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_WARNING,
"failed to store volinfo");
goto out;
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c
index edc2627c1..ed6d7fd57 100644
--- a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c
+++ b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c
@@ -27,9 +27,11 @@
#include <signal.h>
#define GLUSTERD_GET_RB_MNTPT(path, len, volinfo) \
- snprintf (path, len, "/tmp/%s-"RB_CLIENT_MOUNTPOINT, \
+ snprintf (path, len, \
+ DEFAULT_VAR_RUN_DIRECTORY"/%s-"RB_CLIENT_MOUNTPOINT, \
volinfo->volname);
+extern uuid_t global_txn_id;
int
glusterd_get_replace_op_str (gf1_cli_replace_op op, char *op_str)
@@ -69,7 +71,7 @@ out:
}
int
-glusterd_handle_replace_brick (rpcsvc_request_t *req)
+__glusterd_handle_replace_brick (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
@@ -80,17 +82,21 @@ glusterd_handle_replace_brick (rpcsvc_request_t *req)
char operation[256];
glusterd_op_t cli_op = GD_OP_REPLACE_BRICK;
char *volname = NULL;
+ char msg[2048] = {0,};
+ xlator_t *this = NULL;
GF_ASSERT (req);
+ this = THIS;
+ GF_ASSERT (this);
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
}
- gf_log ("glusterd", GF_LOG_INFO, "Received replace brick req");
+ gf_log (this->name, GF_LOG_INFO, "Received replace brick req");
if (cli_req.dict.dict_len) {
/* Unserialize the dictionary */
@@ -100,48 +106,55 @@ glusterd_handle_replace_brick (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to "
"unserialize req-buffer to dictionary");
+ snprintf (msg, sizeof (msg), "Unable to decode the "
+ "command");
goto out;
}
}
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "could not get volname");
+ snprintf (msg, sizeof (msg), "Could not get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
ret = dict_get_int32 (dict, "operation", &op);
if (ret) {
- gf_log ("", GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"dict_get on operation failed");
+ snprintf (msg, sizeof (msg), "Could not get operation");
goto out;
}
ret = dict_get_str (dict, "src-brick", &src_brick);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get src brick");
+ snprintf (msg, sizeof (msg), "Failed to get src brick");
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
- gf_log ("", GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"src brick=%s", src_brick);
ret = dict_get_str (dict, "dst-brick", &dst_brick);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get dest brick");
+ snprintf (msg, sizeof (msg), "Failed to get dest brick");
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
goto out;
}
(void) glusterd_get_replace_op_str (op, operation);
- gf_log ("", GF_LOG_DEBUG, "dst brick=%s", dst_brick);
- gf_log ("glusterd", GF_LOG_INFO, "Received replace brick %s request",
+ gf_log (this->name, GF_LOG_DEBUG, "dst brick=%s", dst_brick);
+ gf_log (this->name, GF_LOG_INFO, "Received replace brick %s request",
operation);
- ret = glusterd_op_begin (req, GD_OP_REPLACE_BRICK, dict);
+ ret = glusterd_op_begin (req, GD_OP_REPLACE_BRICK, dict,
+ msg, sizeof (msg));
out:
free (cli_req.dict.dict_val);//malloced by xdr
@@ -150,15 +163,22 @@ out:
glusterd_op_sm ();
if (ret) {
+ if (msg[0] == '\0')
+ snprintf (msg, sizeof (msg), "Operation failed");
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
- dict, "operation failed");
- if (dict)
- dict_unref (dict);
+ dict, msg);
}
return ret;
}
+int
+glusterd_handle_replace_brick (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_replace_brick);
+}
+
static int
glusterd_get_rb_dst_brickinfo (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t **brickinfo)
@@ -168,7 +188,7 @@ glusterd_get_rb_dst_brickinfo (glusterd_volinfo_t *volinfo,
if (!volinfo || !brickinfo)
goto out;
- *brickinfo = volinfo->dst_brick;
+ *brickinfo = volinfo->rep_brick.dst_brick;
ret = 0;
@@ -181,6 +201,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
dict_t *rsp_dict)
{
int ret = 0;
+ int32_t port = 0;
char *src_brick = NULL;
char *dst_brick = NULL;
char *volname = NULL;
@@ -197,40 +218,45 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
dict_t *ctx = NULL;
glusterd_conf_t *priv = NULL;
char *savetok = NULL;
- char voldir[PATH_MAX] = {0};
char pidfile[PATH_MAX] = {0};
+ char *task_id_str = NULL;
+ xlator_t *this = NULL;
+ gf_boolean_t is_force = _gf_false;
- priv = THIS->private;
+ this = THIS;
+ GF_ASSERT (this);
+
+ priv = this->private;
GF_ASSERT (priv);
ret = dict_get_str (dict, "src-brick", &src_brick);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get src brick");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get src brick");
goto out;
}
- gf_log ("", GF_LOG_DEBUG, "src brick=%s", src_brick);
+ gf_log (this->name, GF_LOG_DEBUG, "src brick=%s", src_brick);
ret = dict_get_str (dict, "dst-brick", &dst_brick);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get dest brick");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get dest brick");
goto out;
}
- gf_log ("", GF_LOG_DEBUG, "dst brick=%s", dst_brick);
+ gf_log (this->name, GF_LOG_DEBUG, "dst brick=%s", dst_brick);
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
ret = dict_get_int32 (dict, "operation", (int32_t *)&replace_op);
if (ret) {
- gf_log ("", GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"dict get on replace-brick operation failed");
goto out;
}
@@ -255,7 +281,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
!glusterd_is_valid_volfpath (volname, dst_brick)) {
snprintf (msg, sizeof (msg), "brick path %s is too "
"long.", dst_brick);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
*op_errstr = gf_strdup (msg);
ret = -1;
@@ -264,10 +290,10 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
ret = glusterd_check_gsync_running (volinfo, &is_run);
if (ret && (is_run == _gf_false))
- gf_log ("", GF_LOG_WARNING, "Unable to get the status"
+ gf_log (this->name, GF_LOG_WARNING, "Unable to get the status"
" of active "GEOREP" session");
if (is_run) {
- gf_log ("", GF_LOG_WARNING, GEOREP" sessions active"
+ gf_log (this->name, GF_LOG_WARNING, GEOREP" sessions active"
"for the volume %s ", volname);
snprintf (msg, sizeof(msg), GEOREP" sessions are active "
"for the volume %s.\nStop "GEOREP " sessions "
@@ -282,29 +308,61 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
if (glusterd_is_defrag_on(volinfo)) {
snprintf (msg, sizeof(msg), "Volume name %s rebalance is in "
"progress. Please retry after completion", volname);
- gf_log ("glusterd", GF_LOG_ERROR, "%s", msg);
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
*op_errstr = gf_strdup (msg);
ret = -1;
goto out;
}
+ ctx = glusterd_op_get_ctx();
+
switch (replace_op) {
case GF_REPLACE_OP_START:
if (glusterd_is_rb_started (volinfo)) {
- gf_log ("", GF_LOG_ERROR, "Replace brick is already "
- "started for volume ");
+ snprintf (msg, sizeof (msg), "Replace brick is already "
+ "started for volume");
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+ *op_errstr = gf_strdup (msg);
ret = -1;
goto out;
}
+ if (is_origin_glusterd (dict)) {
+ if (!ctx) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get op_ctx");
+ goto out;
+ }
+
+ ret = glusterd_generate_and_set_task_id
+ (ctx, GF_REPLACE_BRICK_TID_KEY);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to generate task-id");
+ goto out;
+ }
+
+ } else {
+ ret = dict_get_str (dict, GF_REPLACE_BRICK_TID_KEY,
+ &task_id_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Missing replace-brick-id");
+ ret = 0;
+ }
+ }
+ is_force = dict_get_str_boolean (dict, "force", _gf_false);
+
break;
+
case GF_REPLACE_OP_PAUSE:
if (glusterd_is_rb_paused (volinfo)) {
- gf_log ("", GF_LOG_ERROR, "Replace brick is already "
- "paused for volume ");
+ gf_log (this->name, GF_LOG_ERROR, "Replace brick is "
+ "already paused for volume ");
ret = -1;
goto out;
} else if (!glusterd_is_rb_started(volinfo)) {
- gf_log ("", GF_LOG_ERROR, "Replace brick is not"
+ gf_log (this->name, GF_LOG_ERROR, "Replace brick is not"
" started for volume ");
ret = -1;
goto out;
@@ -313,7 +371,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
case GF_REPLACE_OP_ABORT:
if (!glusterd_is_rb_ongoing (volinfo)) {
- gf_log ("", GF_LOG_ERROR, "Replace brick is not"
+ gf_log (this->name, GF_LOG_ERROR, "Replace brick is not"
" started or paused for volume ");
ret = -1;
goto out;
@@ -322,14 +380,16 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
case GF_REPLACE_OP_COMMIT:
if (!glusterd_is_rb_ongoing (volinfo)) {
- gf_log ("", GF_LOG_ERROR, "Replace brick is not "
+ gf_log (this->name, GF_LOG_ERROR, "Replace brick is not "
"started for volume ");
ret = -1;
goto out;
}
break;
- case GF_REPLACE_OP_COMMIT_FORCE: break;
+ case GF_REPLACE_OP_COMMIT_FORCE:
+ is_force = _gf_true;
+ break;
case GF_REPLACE_OP_STATUS:
@@ -342,7 +402,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
goto out;
}
- gf_log (THIS->name, GF_LOG_ERROR, "%s", *op_errstr);
+ gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr);
ret = -1;
goto out;
}
@@ -362,10 +422,9 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
goto out;
}
- ctx = glusterd_op_get_ctx();
if (ctx) {
if (!glusterd_is_fuse_available ()) {
- gf_log ("glusterd", GF_LOG_ERROR, "Unable to open /dev/"
+ gf_log (this->name, GF_LOG_ERROR, "Unable to open /dev/"
"fuse (%s), replace-brick command failed",
strerror (errno));
snprintf (msg, sizeof(msg), "Fuse unavailable\n "
@@ -376,8 +435,8 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
}
}
- if (!glusterd_is_local_addr (src_brickinfo->hostname)) {
- gf_log ("", GF_LOG_DEBUG,
+ if (gf_is_local_addr (src_brickinfo->hostname)) {
+ gf_log (this->name, GF_LOG_DEBUG,
"I AM THE SOURCE HOST");
if (src_brickinfo->port && rsp_dict) {
ret = dict_set_int32 (rsp_dict, "src-brick-port",
@@ -389,12 +448,10 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
}
}
- GLUSTERD_GET_VOLUME_DIR (voldir, volinfo, priv);
- GLUSTERD_GET_BRICK_PIDFILE (pidfile, voldir,
- src_brickinfo->hostname,
- src_brickinfo->path);
+ GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, src_brickinfo,
+ priv);
if ((replace_op != GF_REPLACE_OP_COMMIT_FORCE) &&
- !glusterd_is_service_running (pidfile, NULL)) {
+ !gf_is_service_running (pidfile, NULL)) {
snprintf(msg, sizeof(msg), "Source brick %s:%s "
"is not online.", src_brickinfo->hostname,
src_brickinfo->path);
@@ -409,14 +466,14 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
dup_dstbrick = gf_strdup (dst_brick);
if (!dup_dstbrick) {
ret = -1;
- gf_log ("", GF_LOG_ERROR, "Memory allocation failed");
+ gf_log (this->name, GF_LOG_ERROR, "Memory allocation failed");
goto out;
}
host = strtok_r (dup_dstbrick, ":", &savetok);
path = strtok_r (NULL, ":", &savetok);
if (!host || !path) {
- gf_log ("", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"dst brick %s is not of form <HOSTNAME>:<export-dir>",
dst_brick);
ret = -1;
@@ -432,7 +489,7 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
if (ret) {
*op_errstr = gf_strdup (msg);
ret = -1;
- gf_log (THIS->name, GF_LOG_ERROR, *op_errstr);
+ gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr);
goto out;
}
@@ -440,8 +497,8 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
(replace_op == GF_REPLACE_OP_START ||
replace_op == GF_REPLACE_OP_COMMIT_FORCE)) {
- volinfo->src_brick = src_brickinfo;
- volinfo->dst_brick = dst_brickinfo;
+ volinfo->rep_brick.src_brick = src_brickinfo;
+ volinfo->rep_brick.dst_brick = dst_brickinfo;
}
if (glusterd_rb_check_bricks (volinfo, src_brickinfo, dst_brickinfo)) {
@@ -450,20 +507,20 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
*op_errstr = gf_strdup ("Incorrect source or "
"destination brick");
if (*op_errstr)
- gf_log (THIS->name, GF_LOG_ERROR, "%s", *op_errstr);
+ gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr);
goto out;
}
if (!glusterd_is_rb_ongoing (volinfo) &&
- !glusterd_is_local_addr (host)) {
- ret = glusterd_brick_create_path (host, path,
+ gf_is_local_addr (host)) {
+ ret = glusterd_validate_and_create_brickpath (dst_brickinfo,
volinfo->volume_id,
- op_errstr);
+ op_errstr, is_force);
if (ret)
goto out;
}
- if (glusterd_is_local_addr (host)) {
+ if (!gf_is_local_addr (host)) {
ret = glusterd_friend_find (NULL, host, &peerinfo);
if (ret) {
snprintf (msg, sizeof (msg), "%s, is not a friend",
@@ -488,11 +545,33 @@ glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
goto out;
}
}
+
+ if (replace_op == GF_REPLACE_OP_START &&
+ gf_is_local_addr (volinfo->rep_brick.dst_brick->hostname)) {
+ port = pmap_registry_alloc (THIS);
+ if (!port) {
+ gf_log (THIS->name, GF_LOG_CRITICAL,
+ "No free ports available");
+ ret = -1;
+ goto out;
+ }
+
+ ctx = glusterd_op_get_ctx();
+ ret = dict_set_int32 ((ctx)?ctx:rsp_dict, "dst-brick-port",
+ port);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to set dst "
+ "brick port");
+ goto out;
+ }
+ volinfo->rep_brick.dst_brick->port = port;
+ }
+
ret = 0;
out:
GF_FREE (dup_dstbrick);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -557,6 +636,8 @@ rb_regenerate_volfiles (glusterd_volinfo_t *volinfo,
ret = glusterd_create_rb_volfiles (volinfo, brickinfo);
+ dict_del (dict, "enable-pump");
+
out:
return ret;
}
@@ -607,7 +688,7 @@ rb_src_brick_restart (glusterd_volinfo_t *volinfo,
sleep (2);
ret = glusterd_volume_start_glusterfs (volinfo, src_brickinfo,
- _gf_false);
+ _gf_false);
if (ret) {
gf_log ("", GF_LOG_ERROR, "Unable to start "
"glusterfs, ret: %d", ret);
@@ -656,9 +737,7 @@ rb_spawn_dst_brick (glusterd_volinfo_t *volinfo,
priv = THIS->private;
- port = pmap_registry_alloc (THIS);
- brickinfo->port = port;
-
+ port = brickinfo->port;
GF_ASSERT (port);
runinit (&runner);
@@ -672,8 +751,10 @@ rb_spawn_dst_brick (glusterd_volinfo_t *volinfo,
if (volinfo->memory_accounting)
runner_add_arg (&runner, "--mem-accounting");
- ret = runner_run (&runner);
+ ret = runner_run_nowait (&runner);
if (ret) {
+ pmap_registry_remove (THIS, 0, brickinfo->path,
+ GF_PMAP_PORT_BRICKSERVER, NULL);
gf_log ("", GF_LOG_DEBUG,
"Could not start glusterfs");
goto out;
@@ -950,7 +1031,7 @@ rb_mountpoint_rmdir (glusterd_volinfo_t *volinfo,
GLUSTERD_GET_RB_MNTPT (mntpt, sizeof (mntpt), volinfo);
ret = rmdir (mntpt);
if (ret) {
- gf_log ("", GF_LOG_DEBUG, "rmdir failed, due to %s",
+ gf_log ("", GF_LOG_DEBUG, "rmdir failed, reason: %s",
strerror (errno));
goto out;
}
@@ -997,8 +1078,8 @@ rb_destroy_maintenance_client (glusterd_volinfo_t *volinfo,
ret = unlink (volfile);
if (ret) {
- gf_log (this->name, GF_LOG_DEBUG, "unlink of volfile %s "
- "failed", volfile);
+ gf_log ("", GF_LOG_DEBUG, "unlink of %s failed, reason: %s",
+ volfile, strerror (errno));
goto out;
}
@@ -1323,14 +1404,11 @@ rb_update_srcbrick_port (glusterd_brickinfo_t *src_brickinfo, dict_t *rsp_dict,
this = THIS;
- ctx = glusterd_op_get_ctx ();
- if (ctx) {
- dict_ret = dict_get_int32 (req_dict, "src-brick-port", &src_port);
- if (src_port)
- src_brickinfo->port = src_port;
- }
+ dict_ret = dict_get_int32 (req_dict, "src-brick-port", &src_port);
+ if (src_port)
+ src_brickinfo->port = src_port;
- if (!glusterd_is_local_addr (src_brickinfo->hostname)) {
+ if (gf_is_local_addr (src_brickinfo->hostname)) {
gf_log ("", GF_LOG_INFO,
"adding src-brick port no");
@@ -1379,15 +1457,12 @@ rb_update_dstbrick_port (glusterd_brickinfo_t *dst_brickinfo, dict_t *rsp_dict,
int dict_ret = 0;
int dst_port = 0;
- ctx = glusterd_op_get_ctx ();
- if (ctx) {
- dict_ret = dict_get_int32 (req_dict, "dst-brick-port", &dst_port);
- if (dst_port)
- dst_brickinfo->port = dst_port;
+ dict_ret = dict_get_int32 (req_dict, "dst-brick-port", &dst_port);
+ if (!dict_ret)
+ dst_brickinfo->port = dst_port;
- }
- if (!glusterd_is_local_addr (dst_brickinfo->hostname)) {
+ if (gf_is_local_addr (dst_brickinfo->hostname)) {
gf_log ("", GF_LOG_INFO,
"adding dst-brick port no");
@@ -1441,6 +1516,9 @@ glusterd_op_perform_replace_brick (glusterd_volinfo_t *volinfo,
if (ret)
goto out;
+ strncpy (new_brickinfo->brick_id, old_brickinfo->brick_id,
+ sizeof (new_brickinfo->brick_id));
+
list_add_tail (&new_brickinfo->brick_list,
&old_brickinfo->brick_list);
@@ -1480,6 +1558,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
char *dst_brick = NULL;
glusterd_brickinfo_t *src_brickinfo = NULL;
glusterd_brickinfo_t *dst_brickinfo = NULL;
+ char *task_id_str = NULL;
this = THIS;
GF_ASSERT (this);
@@ -1489,26 +1568,23 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
ret = dict_get_str (dict, "src-brick", &src_brick);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get src brick");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get src brick");
goto out;
}
- gf_log (this->name, GF_LOG_DEBUG,
- "src brick=%s", src_brick);
+ gf_log (this->name, GF_LOG_DEBUG, "src brick=%s", src_brick);
ret = dict_get_str (dict, "dst-brick", &dst_brick);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get dst brick");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get dst brick");
goto out;
}
- gf_log (this->name, GF_LOG_DEBUG,
- "dst brick=%s", dst_brick);
+ gf_log (this->name, GF_LOG_DEBUG, "dst brick=%s", dst_brick);
ret = dict_get_str (dict, "volname", &volname);
-
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
@@ -1521,28 +1597,30 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to allocate memory");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to allocate memory");
goto out;
}
ret = glusterd_volume_brickinfo_get_by_brick (src_brick, volinfo,
&src_brickinfo);
if (ret) {
- gf_log ("", GF_LOG_DEBUG, "Unable to get src-brickinfo");
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Unable to get src-brickinfo");
goto out;
}
ret = glusterd_get_rb_dst_brickinfo (volinfo, &dst_brickinfo);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get "
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get "
"replace brick destination brickinfo");
goto out;
}
ret = glusterd_resolve_brick (dst_brickinfo);
if (ret) {
- gf_log ("", GF_LOG_DEBUG, "Unable to resolve dst-brickinfo");
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Unable to resolve dst-brickinfo");
goto out;
}
@@ -1551,45 +1629,82 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
if (ret)
goto out;
+
if ((GF_REPLACE_OP_START != replace_op)) {
- ret = rb_update_dstbrick_port (dst_brickinfo, rsp_dict,
- dict, replace_op);
- if (ret)
- goto out;
+
+ /* Set task-id, if available, in op_ctx dict for operations
+ * other than start
+ */
+ if (is_origin_glusterd (dict)) {
+ ctx = glusterd_op_get_ctx();
+ if (!ctx) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "get op_ctx");
+ ret = -1;
+ goto out;
+ }
+ if (!uuid_is_null (volinfo->rep_brick.rb_id)) {
+ ret = glusterd_copy_uuid_to_dict
+ (volinfo->rep_brick.rb_id, ctx,
+ GF_REPLACE_BRICK_TID_KEY);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set "
+ "replace-brick-id");
+ goto out;
+ }
+ }
+ }
}
+ ret = rb_update_dstbrick_port (dst_brickinfo, rsp_dict,
+ dict, replace_op);
+ if (ret)
+ goto out;
switch (replace_op) {
case GF_REPLACE_OP_START:
{
- if (!glusterd_is_local_addr (dst_brickinfo->hostname)) {
- gf_log ("", GF_LOG_INFO,
+ ret = dict_get_str (dict, GF_REPLACE_BRICK_TID_KEY, &task_id_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Missing replace-brick-id");
+ ret = 0;
+ } else {
+ uuid_parse (task_id_str, volinfo->rep_brick.rb_id);
+ }
+
+ if (gf_is_local_addr (dst_brickinfo->hostname)) {
+ gf_log (this->name, GF_LOG_INFO,
"I AM THE DESTINATION HOST");
if (!glusterd_is_rb_paused (volinfo)) {
- ret = rb_spawn_destination_brick (volinfo, dst_brickinfo);
+ ret = rb_spawn_destination_brick
+ (volinfo, dst_brickinfo);
if (ret) {
- gf_log ("", GF_LOG_DEBUG,
- "Failed to spawn destination brick");
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Failed to spawn destination "
+ "brick");
goto out;
}
} else {
- gf_log ("", GF_LOG_ERROR, "Replace brick is already "
- "started=> no need to restart dst brick ");
+ gf_log (this->name, GF_LOG_ERROR,
+ "Replace brick is already started=> no "
+ "need to restart dst brick ");
}
}
- if (!glusterd_is_local_addr (src_brickinfo->hostname)) {
+ if (gf_is_local_addr (src_brickinfo->hostname)) {
ret = rb_src_brick_restart (volinfo, src_brickinfo,
1);
if (ret) {
- gf_log ("", GF_LOG_DEBUG,
- "Could not restart src-brick");
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Could not restart src-brick");
goto out;
}
}
- if (!glusterd_is_local_addr (dst_brickinfo->hostname)) {
- gf_log ("", GF_LOG_INFO,
+ if (gf_is_local_addr (dst_brickinfo->hostname)) {
+ gf_log (this->name, GF_LOG_INFO,
"adding dst-brick port no");
ret = rb_update_dstbrick_port (dst_brickinfo, rsp_dict,
@@ -1610,7 +1725,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
dst_brickinfo,
GF_REPLACE_OP_COMMIT);
if (ret) {
- gf_log ("", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"Commit operation failed");
goto out;
}
@@ -1619,17 +1734,13 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
/* fall through */
case GF_REPLACE_OP_COMMIT_FORCE:
{
- ret = dict_set_int32 (volinfo->dict, "enable-pump", 0);
- gf_log (THIS->name, GF_LOG_DEBUG,
- "Received commit - will be adding dst brick and "
- "removing src brick");
-
- if (!glusterd_is_local_addr (dst_brickinfo->hostname)) {
- gf_log (THIS->name, GF_LOG_DEBUG,
+ if (gf_is_local_addr (dst_brickinfo->hostname)) {
+ gf_log (this->name, GF_LOG_DEBUG,
"I AM THE DESTINATION HOST");
- ret = rb_kill_destination_brick (volinfo, dst_brickinfo);
+ ret = rb_kill_destination_brick (volinfo,
+ dst_brickinfo);
if (ret) {
- gf_log (THIS->name, GF_LOG_CRITICAL,
+ gf_log (this->name, GF_LOG_CRITICAL,
"Unable to cleanup dst brick");
goto out;
}
@@ -1637,39 +1748,41 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
ret = glusterd_nodesvcs_stop (volinfo);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"Unable to stop nfs server, ret: %d", ret);
}
ret = glusterd_op_perform_replace_brick (volinfo, src_brick,
dst_brick);
if (ret) {
- gf_log (THIS->name, GF_LOG_CRITICAL, "Unable to add "
- "dst-brick: %s to volume: %s",
- dst_brick, volinfo->volname);
+ gf_log (this->name, GF_LOG_CRITICAL, "Unable to add "
+ "dst-brick: %s to volume: %s", dst_brick,
+ volinfo->volname);
(void) glusterd_nodesvcs_handle_graph_change (volinfo);
goto out;
}
- volinfo->defrag_status = 0;
+ volinfo->rebal.defrag_status = 0;
ret = glusterd_nodesvcs_handle_graph_change (volinfo);
if (ret) {
- gf_log (THIS->name, GF_LOG_CRITICAL,
+ gf_log (this->name, GF_LOG_CRITICAL,
"Failed to generate nfs volume file");
}
ret = glusterd_fetchspec_notify (THIS);
glusterd_set_rb_status (volinfo, GF_RB_STATUS_NONE);
- glusterd_brickinfo_delete (volinfo->dst_brick);
- volinfo->src_brick = volinfo->dst_brick = NULL;
+ glusterd_brickinfo_delete (volinfo->rep_brick.dst_brick);
+ volinfo->rep_brick.src_brick = NULL;
+ volinfo->rep_brick.dst_brick = NULL;
+ uuid_clear (volinfo->rep_brick.rb_id);
}
break;
case GF_REPLACE_OP_PAUSE:
{
- gf_log ("", GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"Received pause - doing nothing");
ctx = glusterd_op_get_ctx ();
if (ctx) {
@@ -1677,7 +1790,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
dst_brickinfo,
GF_REPLACE_OP_PAUSE);
if (ret) {
- gf_log ("", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"Pause operation failed");
goto out;
}
@@ -1696,18 +1809,13 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
dst_brickinfo,
GF_REPLACE_OP_ABORT);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"Abort operation failed");
goto out;
}
}
- ret = dict_set_int32 (volinfo->dict, "enable-pump", 0);
- if (ret) {
- gf_log (THIS->name, GF_LOG_CRITICAL, "Unable to disable pump");
- }
-
- if (!glusterd_is_local_addr (src_brickinfo->hostname)) {
+ if (gf_is_local_addr (src_brickinfo->hostname)) {
ret = rb_src_brick_restart (volinfo, src_brickinfo,
0);
if (ret) {
@@ -1718,8 +1826,8 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
}
}
- if (!glusterd_is_local_addr (dst_brickinfo->hostname)) {
- gf_log (THIS->name, GF_LOG_INFO,
+ if (gf_is_local_addr (dst_brickinfo->hostname)) {
+ gf_log (this->name, GF_LOG_INFO,
"I AM THE DESTINATION HOST");
ret = rb_kill_destination_brick (volinfo, dst_brickinfo);
if (ret) {
@@ -1729,14 +1837,15 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
}
}
glusterd_set_rb_status (volinfo, GF_RB_STATUS_NONE);
- glusterd_brickinfo_delete (volinfo->dst_brick);
- volinfo->src_brick = volinfo->dst_brick = NULL;
+ glusterd_brickinfo_delete (volinfo->rep_brick.dst_brick);
+ volinfo->rep_brick.src_brick = NULL;
+ volinfo->rep_brick.dst_brick = NULL;
}
break;
case GF_REPLACE_OP_STATUS:
{
- gf_log ("", GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"received status - doing nothing");
ctx = glusterd_op_get_ctx ();
if (ctx) {
@@ -1744,7 +1853,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
ret = dict_set_str (ctx, "status-reply",
"replace brick has been paused");
if (ret)
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to set pump status"
" in ctx");
goto out;
@@ -1768,7 +1877,7 @@ glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict)
ret = glusterd_store_volinfo (volinfo,
GLUSTERD_VOLINFO_VER_AC_INCREMENT);
if (ret)
- gf_log (THIS->name, GF_LOG_ERROR, "Couldn't store"
+ gf_log (this->name, GF_LOG_ERROR, "Couldn't store"
" replace brick operation's state");
out:
@@ -1789,14 +1898,17 @@ glusterd_do_replace_brick (void *data)
glusterd_brickinfo_t *src_brickinfo = NULL;
glusterd_brickinfo_t *dst_brickinfo = NULL;
glusterd_conf_t *priv = NULL;
+ uuid_t *txn_id = NULL;
int ret = 0;
dict = data;
GF_ASSERT (THIS);
-
priv = THIS->private;
+ GF_ASSERT (priv);
+
+ txn_id = &priv->global_txn_id;
if (priv->timer) {
gf_timer_call_cancel (THIS->ctx, priv->timer);
@@ -1808,6 +1920,10 @@ glusterd_do_replace_brick (void *data)
gf_log ("", GF_LOG_DEBUG,
"Replace brick operation detected");
+ ret = dict_get_bin (dict, "transaction_id", (void **)&txn_id);
+
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id));
+
ret = dict_get_int32 (dict, "operation", &op);
if (ret) {
gf_log ("", GF_LOG_DEBUG,
@@ -1903,12 +2019,15 @@ glusterd_do_replace_brick (void *data)
out:
if (ret)
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT,
+ txn_id, NULL);
else
- ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_ACC, NULL);
+ ret = glusterd_op_sm_inject_event (GD_OP_EVENT_COMMIT_ACC,
+ txn_id, NULL);
- if (dict)
- dict_unref (dict);
-
- glusterd_op_sm ();
+ synclock_lock (&priv->big_lock);
+ {
+ glusterd_op_sm ();
+ }
+ synclock_unlock (&priv->big_lock);
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c
index c18c2b5e1..babd5a3be 100644
--- a/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-rpc-ops.c
@@ -33,6 +33,7 @@
extern glusterd_op_info_t opinfo;
+extern uuid_t global_txn_id;
int32_t
glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret,
@@ -49,10 +50,11 @@ glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret,
int32_t status = 0;
int32_t count = 0;
gf_cli_rsp rsp = {0,};
+ xlator_t *this = NULL;
- GF_ASSERT (THIS);
-
- conf = THIS->private;
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
GF_ASSERT (conf);
@@ -77,12 +79,13 @@ glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret,
if (ctx) {
ret = dict_get_int32 (ctx, "status", &status);
if (ret) {
- gf_log (THIS->name, GF_LOG_TRACE,
+ gf_log (this->name, GF_LOG_TRACE,
"failed to get status");
}
}
break;
}
+ case GD_OP_GSYNC_CREATE:
case GD_OP_GSYNC_SET:
{
if (ctx) {
@@ -94,19 +97,12 @@ glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret,
break;
}
- case GD_OP_QUOTA:
- {
- if (ctx && !op_errstr) {
- ret = dict_get_str (ctx, "errstr", &errstr);
- }
- break;
- }
case GD_OP_PROFILE_VOLUME:
{
if (ctx && dict_get_int32 (ctx, "count", &count)) {
ret = dict_set_int32 (ctx, "count", 0);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to set count in dictionary");
}
}
@@ -115,13 +111,14 @@ glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret,
case GD_OP_START_BRICK:
case GD_OP_STOP_BRICK:
{
- gf_log ("", GF_LOG_DEBUG, "not supported op %d", op);
+ gf_log (this->name, GF_LOG_DEBUG, "op '%s' not supported",
+ gd_op_list[op]);
break;
}
case GD_OP_NONE:
case GD_OP_MAX:
{
- gf_log ("", GF_LOG_ERROR, "invalid operation %d", op);
+ gf_log (this->name, GF_LOG_ERROR, "invalid operation");
break;
}
case GD_OP_CREATE_VOLUME:
@@ -139,10 +136,27 @@ glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret,
case GD_OP_LIST_VOLUME:
case GD_OP_CLEARLOCKS_VOLUME:
case GD_OP_HEAL_VOLUME:
+ case GD_OP_QUOTA:
+ case GD_OP_SNAP:
{
/*nothing specific to be done*/
break;
}
+ case GD_OP_COPY_FILE:
+ {
+ if (ctx)
+ ret = dict_get_str (ctx, "errstr", &errstr);
+ break;
+ }
+ case GD_OP_SYS_EXEC:
+ {
+ if (ctx) {
+ ret = dict_get_str (ctx, "errstr", &errstr);
+ ret = dict_set_str (ctx, "glusterd_workdir",
+ conf->workdir);
+ }
+ break;
+ }
}
rsp.op_ret = op_ret;
@@ -159,7 +173,7 @@ glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret,
ret = dict_allocate_and_serialize (ctx, &rsp.dict.dict_val,
&rsp.dict.dict_len);
if (ret < 0 )
- gf_log (THIS->name, GF_LOG_ERROR, "failed to "
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
"serialize buffer");
else
free_ptr = rsp.dict.dict_val;
@@ -177,12 +191,26 @@ glusterd_op_send_cli_response (glusterd_op_t op, int32_t op_ret,
ret = 0;
GF_FREE (free_ptr);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
int
-glusterd_probe_cbk (struct rpc_req *req, struct iovec *iov,
+glusterd_big_locked_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe, fop_cbk_fn_t fn)
+{
+ glusterd_conf_t *priv = THIS->private;
+ int ret = -1;
+
+ synclock_lock (&priv->big_lock);
+ ret = fn (req, iov, count, myframe);
+ synclock_unlock (&priv->big_lock);
+
+ return ret;
+}
+
+int
+__glusterd_probe_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
gd1_mgmt_probe_rsp rsp = {{0},};
@@ -216,7 +244,8 @@ glusterd_probe_cbk (struct rpc_req *req, struct iovec *iov,
glusterd_xfer_cli_probe_resp (ctx->req, rsp.op_ret,
rsp.op_errno,
rsp.op_errstr,
- ctx->hostname, ctx->port);
+ ctx->hostname, ctx->port,
+ ctx->dict);
}
glusterd_destroy_probe_ctx (ctx);
@@ -244,7 +273,8 @@ glusterd_probe_cbk (struct rpc_req *req, struct iovec *iov,
glusterd_xfer_cli_probe_resp (ctx->req, rsp.op_ret,
rsp.op_errno,
rsp.op_errstr,
- ctx->hostname, ctx->port);
+ ctx->hostname, ctx->port,
+ ctx->dict);
}
glusterd_destroy_probe_ctx (ctx);
@@ -284,7 +314,16 @@ out:
}
int
-glusterd_friend_add_cbk (struct rpc_req * req, struct iovec *iov,
+glusterd_probe_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ __glusterd_probe_cbk);
+}
+
+
+int
+__glusterd_friend_add_cbk (struct rpc_req * req, struct iovec *iov,
int count, void *myframe)
{
gd1_mgmt_friend_rsp rsp = {{0},};
@@ -364,7 +403,7 @@ out:
if (ctx->req)//reverse probe doesn't have req
ret = glusterd_xfer_cli_probe_resp (ctx->req, op_ret, op_errno,
NULL, ctx->hostname,
- ctx->port);
+ ctx->port, ctx->dict);
if (!ret) {
glusterd_friend_sm ();
glusterd_op_sm ();
@@ -377,7 +416,15 @@ out:
}
int
-glusterd_friend_remove_cbk (struct rpc_req * req, struct iovec *iov,
+glusterd_friend_add_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ __glusterd_friend_add_cbk);
+}
+
+int
+__glusterd_friend_remove_cbk (struct rpc_req * req, struct iovec *iov,
int count, void *myframe)
{
gd1_mgmt_friend_rsp rsp = {{0},};
@@ -454,7 +501,7 @@ inject:
respond:
ret = glusterd_xfer_cli_deprobe_resp (ctx->req, op_ret, op_errno, NULL,
- ctx->hostname);
+ ctx->hostname, ctx->dict);
if (!ret && move_sm_now) {
glusterd_friend_sm ();
glusterd_op_sm ();
@@ -470,8 +517,16 @@ respond:
return ret;
}
+int
+glusterd_friend_remove_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ __glusterd_friend_remove_cbk);
+}
+
int32_t
-glusterd_friend_update_cbk (struct rpc_req *req, struct iovec *iov,
+__glusterd_friend_update_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
int ret = -1;
@@ -503,8 +558,16 @@ out:
return ret;
}
+int
+glusterd_friend_update_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ __glusterd_friend_update_cbk);
+}
+
int32_t
-glusterd_cluster_lock_cbk (struct rpc_req *req, struct iovec *iov,
+__glusterd_cluster_lock_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
gd1_mgmt_cluster_lock_rsp rsp = {{0},};
@@ -512,9 +575,18 @@ glusterd_cluster_lock_cbk (struct rpc_req *req, struct iovec *iov,
int32_t op_ret = -1;
glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ glusterd_conf_t *priv = NULL;
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
+ txn_id = &priv->global_txn_id;
+
if (-1 == req->rpc_status) {
rsp.op_ret = -1;
rsp.op_errno = EINVAL;
@@ -523,7 +595,8 @@ glusterd_cluster_lock_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_cluster_lock_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (this->name, GF_LOG_ERROR, "Failed to decode lock "
+ "response received from peer");
rsp.op_ret = -1;
rsp.op_errno = EINVAL;
goto out;
@@ -532,25 +605,28 @@ glusterd_cluster_lock_cbk (struct rpc_req *req, struct iovec *iov,
out:
op_ret = rsp.op_ret;
- gf_log ("glusterd", GF_LOG_INFO,
- "Received %s from uuid: %s",
- (op_ret)?"RJT":"ACC", uuid_utoa (rsp.uuid));
+ gf_log (this->name, (op_ret) ? GF_LOG_ERROR : GF_LOG_DEBUG,
+ "Received lock %s from uuid: %s", (op_ret) ? "RJT" : "ACC",
+ uuid_utoa (rsp.uuid));
ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo);
if (ret) {
- gf_log ("", GF_LOG_CRITICAL, "Lock response received from "
- "unknown peer: %s", uuid_utoa (rsp.uuid));
+ gf_log (this->name, GF_LOG_CRITICAL, "Lock response received "
+ "from unknown peer: %s", uuid_utoa (rsp.uuid));
}
if (op_ret) {
event_type = GD_OP_EVENT_RCVD_RJT;
opinfo.op_ret = op_ret;
+ opinfo.op_errstr = gf_strdup ("Another transaction could be in "
+ "progress. Please try again after"
+ " sometime.");
} else {
event_type = GD_OP_EVENT_RCVD_ACC;
}
- ret = glusterd_op_sm_inject_event (event_type, NULL);
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL);
if (!ret) {
glusterd_friend_sm ();
@@ -562,18 +638,196 @@ out:
}
int32_t
-glusterd_cluster_unlock_cbk (struct rpc_req *req, struct iovec *iov,
+glusterd_cluster_lock_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ __glusterd_cluster_lock_cbk);
+}
+
+static int32_t
+glusterd_mgmt_v3_lock_peers_cbk_fn (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ gd1_mgmt_v3_lock_rsp rsp = {{0},};
+ int ret = -1;
+ int32_t op_ret = -1;
+ glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ if (-1 == req->rpc_status) {
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to decode mgmt_v3 lock "
+ "response received from peer");
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ op_ret = rsp.op_ret;
+
+ txn_id = &rsp.txn_id;
+
+ gf_log (this->name, (op_ret) ? GF_LOG_ERROR : GF_LOG_DEBUG,
+ "Received mgmt_v3 lock %s from uuid: %s",
+ (op_ret) ? "RJT" : "ACC", uuid_utoa (rsp.uuid));
+
+ ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "mgmt_v3 lock response received "
+ "from unknown peer: %s. Ignoring response",
+ uuid_utoa (rsp.uuid));
+ goto out;
+ }
+
+ if (op_ret) {
+ event_type = GD_OP_EVENT_RCVD_RJT;
+ opinfo.op_ret = op_ret;
+ opinfo.op_errstr = gf_strdup ("Another transaction could be in "
+ "progress. Please try again after"
+ " sometime.");
+ } else {
+ event_type = GD_OP_EVENT_RCVD_ACC;
+ }
+
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL);
+
+ if (!ret) {
+ glusterd_friend_sm ();
+ glusterd_op_sm ();
+ }
+
+out:
+ GLUSTERD_STACK_DESTROY (((call_frame_t *)myframe));
+ return ret;
+}
+
+int32_t
+glusterd_mgmt_v3_lock_peers_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- gd1_mgmt_cluster_lock_rsp rsp = {{0},};
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ glusterd_mgmt_v3_lock_peers_cbk_fn);
+}
+
+static int32_t
+glusterd_mgmt_v3_unlock_peers_cbk_fn (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ gd1_mgmt_v3_unlock_rsp rsp = {{0},};
int ret = -1;
int32_t op_ret = -1;
glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+
+ if (-1 == req->rpc_status) {
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to decode mgmt_v3 unlock "
+ "response received from peer");
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ op_ret = rsp.op_ret;
+
+ txn_id = &rsp.txn_id;
+
+ gf_log (this->name, (op_ret) ? GF_LOG_ERROR : GF_LOG_DEBUG,
+ "Received mgmt_v3 unlock %s from uuid: %s",
+ (op_ret) ? "RJT" : "ACC",
+ uuid_utoa (rsp.uuid));
+
+ ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "mgmt_v3 unlock response received "
+ "from unknown peer: %s. Ignoring response",
+ uuid_utoa (rsp.uuid));
+ goto out;
+ }
+ if (op_ret) {
+ event_type = GD_OP_EVENT_RCVD_RJT;
+ opinfo.op_ret = op_ret;
+ opinfo.op_errstr = gf_strdup ("Another transaction could be in "
+ "progress. Please try again after"
+ " sometime.");
+ } else {
+ event_type = GD_OP_EVENT_RCVD_ACC;
+ }
+
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL);
+
+ if (!ret) {
+ glusterd_friend_sm ();
+ glusterd_op_sm ();
+ }
+
+out:
+ GLUSTERD_STACK_DESTROY (((call_frame_t *)myframe));
+ return ret;
+}
+
+int32_t
+glusterd_mgmt_v3_unlock_peers_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ glusterd_mgmt_v3_unlock_peers_cbk_fn);
+}
+
+int32_t
+__glusterd_cluster_unlock_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ gd1_mgmt_cluster_lock_rsp rsp = {{0},};
+ int ret = -1;
+ int32_t op_ret = -1;
+ glusterd_op_sm_event_type_t event_type = GD_OP_EVENT_NONE;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ glusterd_conf_t *priv = NULL;
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
+ txn_id = &priv->global_txn_id;
+
if (-1 == req->rpc_status) {
rsp.op_ret = -1;
rsp.op_errno = EINVAL;
@@ -582,7 +836,8 @@ glusterd_cluster_unlock_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_cluster_unlock_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (this->name, GF_LOG_ERROR, "Failed to decode unlock "
+ "response received from peer");
rsp.op_ret = -1;
rsp.op_errno = EINVAL;
goto out;
@@ -591,15 +846,15 @@ glusterd_cluster_unlock_cbk (struct rpc_req *req, struct iovec *iov,
out:
op_ret = rsp.op_ret;
- gf_log ("glusterd", GF_LOG_INFO,
- "Received %s from uuid: %s",
+ gf_log (this->name, (op_ret) ? GF_LOG_ERROR : GF_LOG_DEBUG,
+ "Received unlock %s from uuid: %s",
(op_ret)?"RJT":"ACC", uuid_utoa (rsp.uuid));
ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo);
if (ret) {
- gf_log ("", GF_LOG_CRITICAL, "Unlock response received from "
- "unknown peer %s", uuid_utoa (rsp.uuid));
+ gf_log (this->name, GF_LOG_CRITICAL, "Unlock response received "
+ "from unknown peer %s", uuid_utoa (rsp.uuid));
}
if (op_ret) {
@@ -609,7 +864,7 @@ out:
event_type = GD_OP_EVENT_RCVD_ACC;
}
- ret = glusterd_op_sm_inject_event (event_type, NULL);
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL);
if (!ret) {
glusterd_friend_sm ();
@@ -621,7 +876,15 @@ out:
}
int32_t
-glusterd_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
+glusterd_cluster_unlock_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ __glusterd_cluster_unlock_cbk);
+}
+
+int32_t
+__glusterd_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
gd1_mgmt_stage_op_rsp rsp = {{0},};
@@ -632,7 +895,11 @@ glusterd_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
dict_t *dict = NULL;
char err_str[2048] = {0};
char *peer_str = NULL;
+ xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (req);
if (-1 == req->rpc_status) {
@@ -646,12 +913,14 @@ glusterd_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_stage_op_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (this->name, GF_LOG_ERROR, "Failed to decode stage "
+ "response received from peer");
rsp.op_ret = -1;
rsp.op_errno = EINVAL;
/* use standard allocation because to keep uniformity
in freeing it */
- rsp.op_errstr = strdup ("xdr decoding failed");
+ rsp.op_errstr = strdup ("Failed to decode stage response "
+ "received from peer.");
goto out;
}
@@ -663,7 +932,7 @@ glusterd_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
rsp.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to "
"unserialize rsp-buffer to dictionary");
event_type = GD_OP_EVENT_RCVD_RJT;
@@ -676,15 +945,21 @@ glusterd_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
out:
op_ret = rsp.op_ret;
- gf_log ("glusterd", GF_LOG_INFO,
- "Received %s from uuid: %s",
- (op_ret)?"RJT":"ACC", uuid_utoa (rsp.uuid));
+ gf_log (this->name, (op_ret) ? GF_LOG_ERROR : GF_LOG_DEBUG,
+ "Received stage %s from uuid: %s",
+ (op_ret) ? "RJT" : "ACC", uuid_utoa (rsp.uuid));
+
+ ret = dict_get_bin (dict, "transaction_id", (void **)&txn_id);
+
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id));
ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo);
if (ret) {
- gf_log ("", GF_LOG_CRITICAL, "Stage response received from "
- "unknown peer: %s", uuid_utoa (rsp.uuid));
+ gf_log (this->name, GF_LOG_CRITICAL, "Stage response received "
+ "from unknown peer: %s. Ignoring response.",
+ uuid_utoa (rsp.uuid));
+ goto out;
}
if (op_ret) {
@@ -697,12 +972,11 @@ out:
peer_str = peerinfo->hostname;
else
peer_str = uuid_utoa (rsp.uuid);
- snprintf (err_str, sizeof (err_str), "Operation failed "
- "on %s", peer_str);
+ snprintf (err_str, sizeof (err_str),
+ OPERRSTR_STAGE_FAIL, peer_str);
opinfo.op_errstr = gf_strdup (err_str);
}
if (!opinfo.op_errstr) {
- gf_log ("", GF_LOG_ERROR, "memory allocation failed");
ret = -1;
goto out;
}
@@ -716,7 +990,7 @@ out:
break;
}
- ret = glusterd_op_sm_inject_event (event_type, NULL);
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL);
if (!ret) {
glusterd_friend_sm ();
@@ -736,7 +1010,15 @@ out:
}
int32_t
-glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
+glusterd_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ __glusterd_stage_op_cbk);
+}
+
+int32_t
+__glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
gd1_mgmt_commit_op_rsp rsp = {{0},};
@@ -747,8 +1029,12 @@ glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
dict_t *dict = NULL;
char err_str[2048] = {0};
char *peer_str = NULL;
+ xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (req);
if (-1 == req->rpc_status) {
@@ -763,12 +1049,14 @@ glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_commit_op_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "xdr decoding error");
+ gf_log (this->name, GF_LOG_ERROR, "Failed to decode commit "
+ "response received from peer");
rsp.op_ret = -1;
rsp.op_errno = EINVAL;
/* use standard allocation because to keep uniformity
in freeing it */
- rsp.op_errstr = strdup ("xdr decoding error");
+ rsp.op_errstr = strdup ("Failed to decode commit response "
+ "received from peer.");
event_type = GD_OP_EVENT_RCVD_RJT;
goto out;
}
@@ -781,7 +1069,7 @@ glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
rsp.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to "
"unserialize rsp-buffer to dictionary");
event_type = GD_OP_EVENT_RCVD_RJT;
@@ -793,15 +1081,20 @@ glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
op_ret = rsp.op_ret;
- gf_log ("glusterd", GF_LOG_INFO,
- "Received %s from uuid: %s",
+ gf_log (this->name, (op_ret) ? GF_LOG_ERROR : GF_LOG_DEBUG,
+ "Received commit %s from uuid: %s",
(op_ret)?"RJT":"ACC", uuid_utoa (rsp.uuid));
+ ret = dict_get_bin (dict, "transaction_id", (void **)&txn_id);
+
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id));
+
ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo);
if (ret) {
- gf_log ("", GF_LOG_CRITICAL, "Commit response received from "
- "unknown peer: %s", uuid_utoa (rsp.uuid));
+ gf_log (this->name, GF_LOG_CRITICAL, "Commit response for "
+ "'Volume %s' received from unknown peer: %s",
+ gd_op_list[opinfo.op], uuid_utoa (rsp.uuid));
}
if (op_ret) {
@@ -814,12 +1107,11 @@ glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
peer_str = peerinfo->hostname;
else
peer_str = uuid_utoa (rsp.uuid);
- snprintf (err_str, sizeof (err_str), "Operation failed "
- "on %s", peer_str);
+ snprintf (err_str, sizeof (err_str),
+ OPERRSTR_COMMIT_FAIL, peer_str);
opinfo.op_errstr = gf_strdup (err_str);
}
if (!opinfo.op_errstr) {
- gf_log ("", GF_LOG_ERROR, "memory allocation failed");
ret = -1;
goto out;
}
@@ -876,7 +1168,7 @@ glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
}
out:
- ret = glusterd_op_sm_inject_event (event_type, NULL);
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, NULL);
if (!ret) {
glusterd_friend_sm ();
@@ -890,7 +1182,13 @@ out:
return ret;
}
-
+int32_t
+glusterd_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ __glusterd_commit_op_cbk);
+}
int32_t
glusterd_rpc_probe (call_frame_t *frame, xlator_t *this,
@@ -944,12 +1242,12 @@ int32_t
glusterd_rpc_friend_add (call_frame_t *frame, xlator_t *this,
void *data)
{
- gd1_mgmt_friend_req req = {{0},};
- int ret = 0;
- glusterd_peerinfo_t *peerinfo = NULL;
- glusterd_conf_t *priv = NULL;
- glusterd_friend_sm_event_t *event = NULL;
- dict_t *vols = NULL;
+ gd1_mgmt_friend_req req = {{0},};
+ int ret = 0;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_conf_t *priv = NULL;
+ glusterd_friend_sm_event_t *event = NULL;
+ dict_t *peer_data = NULL;
if (!frame || !this || !data) {
@@ -964,15 +1262,37 @@ glusterd_rpc_friend_add (call_frame_t *frame, xlator_t *this,
peerinfo = event->peerinfo;
- ret = glusterd_build_volume_dict (&vols);
- if (ret)
+ ret = glusterd_add_volumes_to_export_dict (&peer_data);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to add list of volumes "
+ "in the peer_data dict for handshake");
goto out;
+ }
+
+ if (priv->op_version >= GD_OP_VERSION_4) {
+ ret = glusterd_add_missed_snaps_to_export_dict (peer_data);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to add list of missed snapshots "
+ "in the peer_data dict for handshake");
+ goto out;
+ }
+
+ ret = glusterd_add_snapshots_to_export_dict (peer_data);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to add list of snapshots "
+ "in the peer_data dict for handshake");
+ goto out;
+ }
+ }
uuid_copy (req.uuid, MY_UUID);
req.hostname = peerinfo->hostname;
req.port = peerinfo->port;
- ret = dict_allocate_and_serialize (vols, &req.vols.vols_val,
+ ret = dict_allocate_and_serialize (peer_data, &req.vols.vols_val,
&req.vols.vols_len);
if (ret)
goto out;
@@ -986,8 +1306,8 @@ glusterd_rpc_friend_add (call_frame_t *frame, xlator_t *this,
out:
GF_FREE (req.vols.vols_val);
- if (vols)
- dict_unref (vols);
+ if (peer_data)
+ dict_unref (peer_data);
gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
@@ -1102,7 +1422,140 @@ glusterd_cluster_lock (call_frame_t *frame, xlator_t *this,
this, glusterd_cluster_lock_cbk,
(xdrproc_t)xdr_gd1_mgmt_cluster_lock_req);
out:
- gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_mgmt_v3_lock_peers (call_frame_t *frame, xlator_t *this,
+ void *data)
+{
+ gd1_mgmt_v3_lock_req req = {{0},};
+ int ret = -1;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_conf_t *priv = NULL;
+ call_frame_t *dummy_frame = NULL;
+ dict_t *dict = NULL;
+ uuid_t *txn_id = NULL;
+
+ if (!this)
+ goto out;
+
+ dict = data;
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = dict_get_ptr (dict, "peerinfo", VOID (&peerinfo));
+ if (ret)
+ goto out;
+
+ //peerinfo should not be in payload
+ dict_del (dict, "peerinfo");
+
+ glusterd_get_uuid (&req.uuid);
+
+ ret = dict_allocate_and_serialize (dict, &req.dict.dict_val,
+ &req.dict.dict_len);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to serialize dict "
+ "to request buffer");
+ goto out;
+ }
+
+ /* Sending valid transaction ID to peers */
+ ret = dict_get_bin (dict, "transaction_id",
+ (void **)&txn_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get transaction id.");
+ goto out;
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Transaction_id = %s", uuid_utoa (*txn_id));
+ uuid_copy (req.txn_id, *txn_id);
+ }
+
+ dummy_frame = create_frame (this, this->ctx->pool);
+ if (!dummy_frame) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_submit_request (peerinfo->rpc, &req, dummy_frame,
+ peerinfo->mgmt_v3,
+ GLUSTERD_MGMT_V3_LOCK, NULL,
+ this, glusterd_mgmt_v3_lock_peers_cbk,
+ (xdrproc_t)xdr_gd1_mgmt_v3_lock_req);
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_mgmt_v3_unlock_peers (call_frame_t *frame, xlator_t *this,
+ void *data)
+{
+ gd1_mgmt_v3_unlock_req req = {{0},};
+ int ret = -1;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_conf_t *priv = NULL;
+ call_frame_t *dummy_frame = NULL;
+ dict_t *dict = NULL;
+ uuid_t *txn_id = NULL;
+
+ if (!this)
+ goto out;
+
+ dict = data;
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = dict_get_ptr (dict, "peerinfo", VOID (&peerinfo));
+ if (ret)
+ goto out;
+
+ //peerinfo should not be in payload
+ dict_del (dict, "peerinfo");
+
+ glusterd_get_uuid (&req.uuid);
+
+ ret = dict_allocate_and_serialize (dict, &req.dict.dict_val,
+ &req.dict.dict_len);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to serialize dict "
+ "to request buffer");
+ goto out;
+ }
+
+ /* Sending valid transaction ID to peers */
+ ret = dict_get_bin (dict, "transaction_id",
+ (void **)&txn_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get transaction id.");
+ goto out;
+ } else {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Transaction_id = %s", uuid_utoa (*txn_id));
+ uuid_copy (req.txn_id, *txn_id);
+ }
+
+ dummy_frame = create_frame (this, this->ctx->pool);
+ if (!dummy_frame) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_submit_request (peerinfo->rpc, &req, dummy_frame,
+ peerinfo->mgmt_v3,
+ GLUSTERD_MGMT_V3_UNLOCK, NULL,
+ this, glusterd_mgmt_v3_unlock_peers_cbk,
+ (xdrproc_t)
+ xdr_gd1_mgmt_v3_unlock_req);
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -1136,7 +1589,7 @@ glusterd_cluster_unlock (call_frame_t *frame, xlator_t *this,
this, glusterd_cluster_unlock_cbk,
(xdrproc_t)xdr_gd1_mgmt_cluster_unlock_req);
out:
- gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -1173,8 +1626,11 @@ glusterd_stage_op (call_frame_t *frame, xlator_t *this,
ret = dict_allocate_and_serialize (dict, &req.buf.buf_val,
&req.buf.buf_len);
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to serialize dict "
+ "to request buffer");
goto out;
+ }
dummy_frame = create_frame (this, this->ctx->pool);
@@ -1191,7 +1647,7 @@ out:
if ((_gf_true == is_alloc) && req.buf.buf_val)
GF_FREE (req.buf.buf_val);
- gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -1227,8 +1683,11 @@ glusterd_commit_op (call_frame_t *frame, xlator_t *this,
ret = dict_allocate_and_serialize (dict, &req.buf.buf_val,
&req.buf.buf_len);
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to serialize dict to "
+ "request buffer");
goto out;
+ }
dummy_frame = create_frame (this, this->ctx->pool);
if (!dummy_frame)
@@ -1244,12 +1703,12 @@ out:
if ((_gf_true == is_alloc) && req.buf.buf_val)
GF_FREE (req.buf.buf_val);
- gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
int32_t
-glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
+__glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
gd1_mgmt_brick_op_rsp rsp = {0};
@@ -1262,8 +1721,17 @@ glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
int index = 0;
glusterd_req_ctx_t *req_ctx = NULL;
glusterd_pending_node_t *node = NULL;
+ xlator_t *this = NULL;
+ uuid_t *txn_id = NULL;
+ glusterd_conf_t *priv = NULL;
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
+
+ txn_id = &priv->global_txn_id;
frame = myframe;
req_ctx = frame->local;
@@ -1279,7 +1747,8 @@ glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gd1_mgmt_brick_op_rsp);
if (ret < 0) {
- gf_log ("", GF_LOG_ERROR, "error");
+ gf_log (this->name, GF_LOG_ERROR, "Failed to decode brick op "
+ "response received");
rsp.op_ret = -1;
rsp.op_errno = EINVAL;
rsp.op_errstr = strdup ("Unable to decode brick op response");
@@ -1295,8 +1764,7 @@ glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
rsp.output.output_len,
&dict);
if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR,
- "failed to "
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
"unserialize rsp-buffer to dictionary");
event_type = GD_OP_EVENT_RCVD_RJT;
goto out;
@@ -1313,7 +1781,7 @@ glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
index = node->index;
ret = dict_set_int32 (dict, "index", index);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"Error setting index on brick status rsp dict");
rsp.op_ret = -1;
event_type = GD_OP_EVENT_RCVD_RJT;
@@ -1321,6 +1789,11 @@ glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
}
}
out:
+
+ ret = dict_get_bin (req_ctx->dict, "transaction_id", (void **)&txn_id);
+
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id));
+
ev_ctx = GF_CALLOC (1, sizeof (*ev_ctx), gf_gld_mt_brick_rsp_ctx_t);
GF_ASSERT (ev_ctx);
if (op_ret) {
@@ -1333,7 +1806,7 @@ out:
ev_ctx->pending_node = frame->cookie;
ev_ctx->rsp_dict = dict;
ev_ctx->commit_ctx = frame->local;
- ret = glusterd_op_sm_inject_event (event_type, ev_ctx);
+ ret = glusterd_op_sm_inject_event (event_type, txn_id, ev_ctx);
if (!ret) {
glusterd_friend_sm ();
glusterd_op_sm ();
@@ -1347,9 +1820,18 @@ out:
}
int32_t
+glusterd_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ __glusterd_brick_op_cbk);
+}
+
+int32_t
glusterd_brick_op (call_frame_t *frame, xlator_t *this,
- void *data)
+ void *data)
{
+
gd1_mgmt_brick_op_req *req = NULL;
int ret = 0;
glusterd_conf_t *priv = NULL;
@@ -1360,6 +1842,7 @@ glusterd_brick_op (call_frame_t *frame, xlator_t *this,
glusterd_req_ctx_t *req_ctx = NULL;
struct rpc_clnt *rpc = NULL;
dict_t *op_ctx = NULL;
+ uuid_t *txn_id = NULL;
if (!this) {
ret = -1;
@@ -1368,37 +1851,53 @@ glusterd_brick_op (call_frame_t *frame, xlator_t *this,
priv = this->private;
GF_ASSERT (priv);
+ txn_id = &priv->global_txn_id;
+
req_ctx = data;
GF_ASSERT (req_ctx);
INIT_LIST_HEAD (&opinfo.pending_bricks);
- ret = glusterd_op_bricks_select (req_ctx->op, req_ctx->dict, &op_errstr);
+ ret = glusterd_op_bricks_select (req_ctx->op, req_ctx->dict, &op_errstr,
+ &opinfo.pending_bricks, NULL);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Brick Op failed");
+ gf_log (this->name, GF_LOG_ERROR, "Failed to select bricks "
+ "while performing brick op during 'Volume %s'",
+ gd_op_list[opinfo.op]);
opinfo.op_errstr = op_errstr;
goto out;
}
+
+ ret = dict_get_bin (req_ctx->dict, "transaction_id", (void **)&txn_id);
+
+ gf_log ("", GF_LOG_DEBUG, "transaction ID = %s", uuid_utoa (*txn_id));
+
list_for_each_entry (pending_node, &opinfo.pending_bricks, list) {
dummy_frame = create_frame (this, this->ctx->pool);
if (!dummy_frame)
continue;
if ((pending_node->type == GD_NODE_NFS) ||
+ (pending_node->type == GD_NODE_QUOTAD) ||
((pending_node->type == GD_NODE_SHD) &&
- (req_ctx->op == GD_OP_STATUS_VOLUME)))
+ (req_ctx->op == GD_OP_STATUS_VOLUME)))
ret = glusterd_node_op_build_payload
(req_ctx->op,
(gd1_mgmt_brick_op_req **)&req,
req_ctx->dict);
- else
+ else {
ret = glusterd_brick_op_build_payload
(req_ctx->op, pending_node->node,
(gd1_mgmt_brick_op_req **)&req,
req_ctx->dict);
- if (ret)
- goto out;
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "build brick op payload during "
+ "'Volume %s'", gd_op_list[req_ctx->op]);
+ goto out;
+ }
+ }
dummy_frame->local = data;
dummy_frame->cookie = pending_node;
@@ -1444,16 +1943,18 @@ glusterd_brick_op (call_frame_t *frame, xlator_t *this,
pending_bricks++;
}
- gf_log ("glusterd", GF_LOG_DEBUG, "Sent op req to %d bricks",
+ gf_log (this->name, GF_LOG_DEBUG, "Sent brick op req for operation "
+ "'Volume %s' to %d bricks", gd_op_list[req_ctx->op],
pending_bricks);
opinfo.brick_pending_count = pending_bricks;
out:
if (ret) {
- glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT, data);
+ glusterd_op_sm_inject_event (GD_OP_EVENT_RCVD_RJT,
+ txn_id, data);
opinfo.op_ret = ret;
}
- gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -1478,6 +1979,12 @@ struct rpc_clnt_procedure gd_mgmt_actors[GLUSTERD_MGMT_MAXVALUE] = {
[GLUSTERD_MGMT_COMMIT_OP] = {"COMMIT_OP", glusterd_commit_op},
};
+struct rpc_clnt_procedure gd_mgmt_v3_actors[GLUSTERD_MGMT_V3_MAXVALUE] = {
+ [GLUSTERD_MGMT_V3_NULL] = {"NULL", NULL },
+ [GLUSTERD_MGMT_V3_LOCK] = {"MGMT_V3_LOCK", glusterd_mgmt_v3_lock_peers},
+ [GLUSTERD_MGMT_V3_UNLOCK] = {"MGMT_V3_UNLOCK", glusterd_mgmt_v3_unlock_peers},
+};
+
struct rpc_clnt_program gd_mgmt_prog = {
.progname = "glusterd mgmt",
.prognum = GD_MGMT_PROGRAM,
@@ -1502,3 +2009,10 @@ struct rpc_clnt_program gd_peer_prog = {
.numproc = GLUSTERD_FRIEND_MAXVALUE,
};
+struct rpc_clnt_program gd_mgmt_v3_prog = {
+ .progname = "glusterd mgmt v3",
+ .prognum = GD_MGMT_PROGRAM,
+ .progver = GD_MGMT_V3_VERSION,
+ .proctable = gd_mgmt_v3_actors,
+ .numproc = GLUSTERD_MGMT_V3_MAXVALUE,
+};
diff --git a/xlators/mgmt/glusterd/src/glusterd-sm.c b/xlators/mgmt/glusterd/src/glusterd-sm.c
index a60eb53e5..7a8b2c94f 100644
--- a/xlators/mgmt/glusterd/src/glusterd-sm.c
+++ b/xlators/mgmt/glusterd/src/glusterd-sm.c
@@ -34,6 +34,7 @@
#include "glusterd-op-sm.h"
#include "glusterd-utils.h"
#include "glusterd-store.h"
+#include "glusterd-etcd.h"
static struct list_head gd_friend_sm_queue;
@@ -400,7 +401,8 @@ glusterd_ac_send_friend_remove_req (glusterd_friend_sm_event_t *event,
if (ctx)
ret = glusterd_xfer_cli_deprobe_resp (ctx->req, ret, 0,
NULL,
- ctx->hostname);
+ ctx->hostname,
+ ctx->dict);
glusterd_friend_sm ();
glusterd_op_sm ();
@@ -526,6 +528,9 @@ out:
return ret;
}
+/* Clean up stale volumes on the peer being detached. The volumes which have
+ * bricks on other peers are stale with respect to the detached peer.
+ */
static int
glusterd_peer_detach_cleanup (glusterd_conf_t *priv)
{
@@ -537,6 +542,12 @@ glusterd_peer_detach_cleanup (glusterd_conf_t *priv)
list_for_each_entry_safe (volinfo,tmp_volinfo,
&priv->volumes, vol_list) {
+ /* The peer detach checks make sure that, at this point in the
+ * detach process, there are only volumes contained completely
+ * within or completely outside the detached peer.
+ * The only stale volumes at this point are the ones
+ * completely outside the peer and can be safely deleted.
+ */
if (!glusterd_friend_contains_vol_bricks (volinfo,
MY_UUID)) {
gf_log (THIS->name, GF_LOG_INFO,
@@ -595,6 +606,9 @@ glusterd_ac_handle_friend_remove_req (glusterd_friend_sm_event_t *event,
"Peer detach cleanup was not successful");
ret = 0;
}
+ gf_log (THIS->name, GF_LOG_INFO, "detached, stopping etcd");
+ stop_etcd(priv->etcd_pid);
+ nuke_etcd_dir();
out:
gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
@@ -638,10 +652,14 @@ glusterd_ac_handle_friend_add_req (glusterd_friend_sm_event_t *event, void *ctx)
glusterd_friend_update_ctx_t *new_ev_ctx = NULL;
glusterd_friend_sm_event_t *new_event = NULL;
glusterd_friend_sm_event_type_t event_type = GD_FRIEND_EVENT_NONE;
+ glusterd_conf_t *conf = NULL;
int status = 0;
int32_t op_ret = -1;
int32_t op_errno = 0;
+ xlator_t *this = NULL;
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (ctx);
ev_ctx = ctx;
uuid_copy (uuid, ev_ctx->uuid);
@@ -649,8 +667,12 @@ glusterd_ac_handle_friend_add_req (glusterd_friend_sm_event_t *event, void *ctx)
GF_ASSERT (peerinfo);
uuid_copy (peerinfo->uuid, ev_ctx->uuid);
+ conf = this->private;
+ GF_ASSERT (conf);
+
//Build comparison logic here.
- ret = glusterd_compare_friend_data (ev_ctx->vols, &status);
+ ret = glusterd_compare_friend_data (ev_ctx->vols, &status,
+ peerinfo->hostname);
if (ret)
goto out;
@@ -663,6 +685,31 @@ glusterd_ac_handle_friend_add_req (glusterd_friend_sm_event_t *event, void *ctx)
op_ret = -1;
}
+ /* Compare missed_snapshot list with the peer *
+ * if volume comparison is successful */
+ if ((op_ret == 0) &&
+ (conf->op_version >= GD_OP_VERSION_4)) {
+ ret = glusterd_import_friend_missed_snap_list (ev_ctx->vols);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to import peer's "
+ "missed_snaps_list.");
+ event_type = GD_FRIEND_EVENT_LOCAL_RJT;
+ op_errno = GF_PROBE_MISSED_SNAP_CONFLICT;
+ op_ret = -1;
+ }
+
+ ret = glusterd_compare_friend_snapshots (ev_ctx->vols,
+ peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Conflict in comparing peer's snapshots");
+ event_type = GD_FRIEND_EVENT_LOCAL_RJT;
+ op_errno = GF_PROBE_SNAP_CONFLICT;
+ op_ret = -1;
+ }
+ }
+
ret = glusterd_friend_sm_new_event (event_type, &new_event);
if (ret) {
@@ -687,7 +734,15 @@ glusterd_ac_handle_friend_add_req (glusterd_friend_sm_event_t *event, void *ctx)
glusterd_friend_sm_inject_event (new_event);
ret = glusterd_xfer_friend_add_resp (ev_ctx->req, ev_ctx->hostname,
- ev_ctx->port, op_ret, op_errno);
+ peerinfo->hostname, ev_ctx->port,
+ op_ret, op_errno);
+
+ // apply a deterministic function to decide via whom we should join the cluster
+ if (strcmp(peerinfo->hostname, ev_ctx->hostname) > 0) {
+ stop_etcd(conf->etcd_pid);
+ nuke_etcd_dir();
+ conf->etcd_pid = start_etcd (uuid_utoa(MY_UUID), peerinfo->hostname);
+ }
out:
gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
@@ -959,6 +1014,27 @@ glusterd_destroy_friend_event_context (glusterd_friend_sm_event_t *event)
}
}
+gf_boolean_t
+gd_does_peer_affect_quorum (glusterd_friend_sm_state_t old_state,
+ glusterd_friend_sm_event_type_t event_type,
+ glusterd_peerinfo_t *peerinfo)
+{
+ gf_boolean_t affects = _gf_false;
+
+ //When glusterd comes up with friends in BEFRIENDED state in store,
+ //wait until compare-data happens.
+ if ((old_state == GD_FRIEND_STATE_BEFRIENDED) &&
+ (event_type != GD_FRIEND_EVENT_RCVD_ACC) &&
+ (event_type != GD_FRIEND_EVENT_LOCAL_ACC))
+ goto out;
+ if ((peerinfo->state.state == GD_FRIEND_STATE_BEFRIENDED)
+ && peerinfo->connected) {
+ affects = _gf_true;
+ }
+out:
+ return affects;
+}
+
int
glusterd_friend_sm ()
{
@@ -970,6 +1046,8 @@ glusterd_friend_sm ()
glusterd_peerinfo_t *peerinfo = NULL;
glusterd_friend_sm_event_type_t event_type = 0;
gf_boolean_t is_await_conn = _gf_false;
+ gf_boolean_t quorum_action = _gf_false;
+ glusterd_friend_sm_state_t old_state = GD_FRIEND_STATE_DEFAULT;
while (!list_empty (&gd_friend_sm_queue)) {
list_for_each_entry_safe (event, tmp, &gd_friend_sm_queue, list) {
@@ -989,6 +1067,7 @@ glusterd_friend_sm ()
glusterd_friend_sm_event_name_get (event_type));
+ old_state = peerinfo->state.state;
state = glusterd_friend_state_table[peerinfo->state.state];
GF_ASSERT (state);
@@ -1029,6 +1108,15 @@ glusterd_friend_sm ()
goto out;
}
+ if (gd_does_peer_affect_quorum (old_state, event_type,
+ peerinfo)) {
+ peerinfo->quorum_contrib = QUORUM_UP;
+ if (peerinfo->quorum_action) {
+ peerinfo->quorum_action = _gf_false;
+ quorum_action = _gf_true;
+ }
+ }
+
ret = glusterd_store_peerinfo (peerinfo);
glusterd_destroy_friend_event_context (event);
@@ -1042,6 +1130,25 @@ glusterd_friend_sm ()
ret = 0;
out:
+ if (quorum_action) {
+ /* When glusterd is restarted, it needs to wait until the 'friends' view
+ * of the volumes settle, before it starts any of the internal daemons.
+ *
+ * Every friend that was part of the cluster, would send its
+ * cluster-view, 'our' way. For every friend, who belongs to
+ * a partition which has a different cluster-view from our
+ * partition, we may update our cluster-view. For subsequent
+ * friends from that partition would agree with us, if the first
+ * friend wasn't rejected. For every first friend, whom we agreed with,
+ * we would need to start internal daemons/bricks belonging to the
+ * new volumes.
+ * glusterd_spawn_daemons calls functions that are idempotent. ie,
+ * the functions spawn process(es) only if they are not started yet.
+ *
+ * */
+ glusterd_spawn_daemons (NULL);
+ glusterd_do_quorum_action ();
+ }
return ret;
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-sm.h b/xlators/mgmt/glusterd/src/glusterd-sm.h
index 011578da3..b9bedbe69 100644
--- a/xlators/mgmt/glusterd/src/glusterd-sm.h
+++ b/xlators/mgmt/glusterd/src/glusterd-sm.h
@@ -27,14 +27,21 @@
#include "byte-order.h"
//#include "glusterd.h"
#include "rpcsvc.h"
-
-struct glusterd_store_handle_ {
- char *path;
- int fd;
- FILE *read;
-};
-
-typedef struct glusterd_store_handle_ glusterd_store_handle_t;
+#include "store.h"
+
+typedef enum gd_quorum_contribution_ {
+ QUORUM_NONE,
+ QUORUM_WAITING,
+ QUORUM_DOWN,
+ QUORUM_UP
+} gd_quorum_contrib_t;
+
+typedef enum gd_quorum_status_ {
+ QUORUM_UNKNOWN,
+ QUORUM_NOT_APPLICABLE,
+ QUORUM_MEETS,
+ QUORUM_DOES_NOT_MEET
+} gd_quorum_status_t;
typedef enum glusterd_friend_sm_state_ {
GD_FRIEND_STATE_DEFAULT = 0,
@@ -79,7 +86,9 @@ typedef struct glusterd_sm_tr_log_ {
struct glusterd_peerinfo_ {
uuid_t uuid;
- char uuid_str[50];
+ char uuid_str[50]; /* Retrieve this using
+ * gd_peer_uuid_str ()
+ */
glusterd_peer_state_info_t state;
char *hostname;
int port;
@@ -88,9 +97,13 @@ struct glusterd_peerinfo_ {
struct rpc_clnt *rpc;
rpc_clnt_prog_t *mgmt;
rpc_clnt_prog_t *peer;
+ rpc_clnt_prog_t *mgmt_v3;
int connected;
- glusterd_store_handle_t *shandle;
+ gf_store_handle_t *shandle;
glusterd_sm_tr_log_t sm_log;
+ gf_boolean_t quorum_action;
+ gd_quorum_contrib_t quorum_contrib;
+ gf_boolean_t locked;
};
typedef struct glusterd_peerinfo_ glusterd_peerinfo_t;
@@ -104,6 +117,7 @@ typedef enum glusterd_ev_gen_mode_ {
typedef struct glusterd_peer_ctx_args_ {
rpcsvc_request_t *req;
glusterd_ev_gen_mode_t mode;
+ dict_t *dict;
} glusterd_peerctx_args_t;
typedef struct glusterd_peer_ctx_ {
@@ -170,6 +184,7 @@ typedef struct glusterd_probe_ctx_ {
char *hostname;
rpcsvc_request_t *req;
int port;
+ dict_t *dict;
} glusterd_probe_ctx_t;
int
glusterd_friend_sm_new_event (glusterd_friend_sm_event_type_t event_type,
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot.c b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
new file mode 100644
index 000000000..0e824a022
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
@@ -0,0 +1,5787 @@
+/*
+ Copyright (c) 2013-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <inttypes.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/statvfs.h>
+#include <sys/mount.h>
+#include <signal.h>
+
+
+#if !defined(__NetBSD__) && !defined(GF_DARWIN_HOST_OS)
+#include <mntent.h>
+#else
+#include "mntent_compat.h"
+#endif
+
+#include "globals.h"
+#include "compat.h"
+#include "protocol-common.h"
+#include "xlator.h"
+#include "logging.h"
+#include "timer.h"
+#include "glusterd-mem-types.h"
+#include "glusterd.h"
+#include "glusterd-sm.h"
+#include "glusterd-op-sm.h"
+#include "glusterd-utils.h"
+#include "glusterd-store.h"
+#include "run.h"
+#include "glusterd-volgen.h"
+#include "glusterd-mgmt.h"
+#include "glusterd-syncop.h"
+
+#include "syscall.h"
+#include "cli1-xdr.h"
+#include "xdr-generic.h"
+
+#include "lvm-defaults.h"
+
+char snap_mount_folder[PATH_MAX];
+
+/* Look for disconnected peers, for missed snap creates or deletes */
+static int32_t
+glusterd_find_missed_snap (dict_t *rsp_dict, glusterd_volinfo_t *vol,
+ struct list_head *peers, int32_t op)
+{
+ int32_t brick_count = -1;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (peers);
+ GF_ASSERT (vol);
+
+ brick_count = 0;
+ list_for_each_entry (brickinfo, &vol->bricks, brick_list) {
+ if (!uuid_compare (brickinfo->uuid, MY_UUID)) {
+ /* If the brick belongs to the same node */
+ brick_count++;
+ continue;
+ }
+
+ list_for_each_entry (peerinfo, peers, uuid_list) {
+ if (uuid_compare (peerinfo->uuid, brickinfo->uuid)) {
+ /* If the brick doesnt belong to this peer */
+ continue;
+ }
+
+ /* Found peer who owns the brick, *
+ * if peer is not connected or not *
+ * friend add it to missed snap list */
+ if (!(peerinfo->connected) ||
+ (peerinfo->state.state !=
+ GD_FRIEND_STATE_BEFRIENDED)) {
+ ret = glusterd_add_missed_snaps_to_dict
+ (rsp_dict,
+ vol, brickinfo,
+ brick_count + 1,
+ op);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add missed snapshot "
+ "info for %s:%s in the "
+ "rsp_dict", brickinfo->hostname,
+ brickinfo->path);
+ goto out;
+ }
+ }
+ }
+ brick_count++;
+ }
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* This function will restore a snapshot volumes
+ *
+ * @param dict dictionary containing snapshot restore request
+ * @param op_errstr In case of any failure error message will be returned
+ * in this variable
+ * @return Negative value on Failure and 0 in success
+ */
+int
+glusterd_snapshot_restore (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+ int ret = -1;
+ char *snapname = NULL;
+ xlator_t *this = NULL;
+ glusterd_volinfo_t *snap_volinfo = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_snap_t *snap = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (rsp_dict);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = dict_get_str (dict, "snapname", &snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "snap name");
+ goto out;
+ }
+
+ snap = glusterd_find_snap_by_name (snapname);
+ if (NULL == snap) {
+ ret = gf_asprintf (op_errstr, "Snap (%s) not found",
+ snapname);
+ if (ret < 0) {
+ goto out;
+ }
+ gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr);
+ ret = -1;
+ goto out;
+ }
+
+
+ /* TODO : As of now there is only volume in snapshot.
+ * Change this when multiple volume snapshot is introduced
+ */
+ snap_volinfo = list_entry (snap->volumes.next, glusterd_volinfo_t,
+ vol_list);
+
+ ret = glusterd_volinfo_find (snap_volinfo->parent_volname, &volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not get volinfo of "
+ "%s", snap_volinfo->parent_volname);
+ goto out;
+ }
+
+ if (is_origin_glusterd (dict) == _gf_true) {
+ /* From origin glusterd check if *
+ * any peers with snap bricks is down */
+ ret = glusterd_find_missed_snap (rsp_dict, snap_volinfo,
+ &priv->peers,
+ GF_SNAP_OPTION_TYPE_RESTORE);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to find missed snap restores");
+ goto out;
+ }
+ }
+
+ ret = gd_restore_snap_volume (rsp_dict, volinfo, snap_volinfo);
+ if (ret) {
+ /* No need to update op_errstr because it is assumed
+ * that the called function will do that in case of
+ * failure.
+ */
+ gf_log (this->name, GF_LOG_ERROR, "Failed to restore "
+ "snap for %s", snapname);
+ goto out;
+ }
+
+ ret = 0;
+
+ /* TODO: Need to check if we need to delete the snap after the
+ * operation is successful or not. Also need to persist the state
+ * of restore operation in the store.
+ */
+out:
+ return ret;
+}
+
+/* This function is called before actual restore is taken place. This function
+ * will validate whether the snapshot volumes are ready to be restored or not.
+ *
+ * @param dict dictionary containing snapshot restore request
+ * @param op_errstr In case of any failure error message will be returned
+ * in this variable
+ * @param rsp_dict response dictionary
+ * @return Negative value on Failure and 0 in success
+ */
+int32_t
+glusterd_snapshot_restore_prevalidate (dict_t *dict, char **op_errstr,
+ dict_t *rsp_dict)
+{
+ int ret = -1;
+ int32_t i = 0;
+ int32_t volcount = 0;
+ gf_boolean_t snap_restored = _gf_false;
+ char key[PATH_MAX] = {0, };
+ char *volname = NULL;
+ char *snapname = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_snap_t *snap = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (rsp_dict);
+
+ ret = dict_get_str (dict, "snapname", &snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "snap name");
+ goto out;
+ }
+
+ snap = glusterd_find_snap_by_name (snapname);
+ if (NULL == snap) {
+ ret = gf_asprintf (op_errstr, "Snap (%s) not found",
+ snapname);
+ if (ret < 0) {
+ goto out;
+ }
+ gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr);
+ ret = -1;
+ goto out;
+ }
+
+ snap_restored = snap->snap_restored;
+
+ if (snap_restored) {
+ ret = gf_asprintf (op_errstr, "Snap (%s) is already "
+ "restored", snapname);
+ if (ret < 0) {
+ goto out;
+ }
+ gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr);
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_str (rsp_dict, "snapname", snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+ "snap name(%s)", snapname);
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "volcount", &volcount);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get volume count");
+ goto out;
+ }
+
+ /* Snapshot restore will only work if all the volumes,
+ that are part of the snapshot, are stopped. */
+ for (i = 1; i <= volcount; ++i) {
+ snprintf (key, sizeof (key), "volname%d", i);
+ ret = dict_get_str (dict, key, &volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "get volume name");
+ goto out;
+ }
+
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ ret = gf_asprintf (op_errstr, "Volume (%s) not found",
+ volname);
+ if (ret < 0) {
+ goto out;
+ }
+ gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr);
+ ret = -1;
+ goto out;
+ }
+
+ if (glusterd_is_volume_started (volinfo)) {
+ ret = gf_asprintf (op_errstr, "Volume (%s) has been "
+ "started. Volume needs to be stopped before restoring "
+ "a snapshot.", volname);
+ if (ret < 0) {
+ goto out;
+ }
+ gf_log (this->name, GF_LOG_ERROR, "%s", *op_errstr);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+snap_max_hard_limits_validate (dict_t *dict, char *volname,
+ uint64_t value, char **op_errstr)
+{
+ char err_str[PATH_MAX] = "";
+ glusterd_conf_t *conf = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ int ret = -1;
+ uint64_t max_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+ xlator_t *this = NULL;
+
+ this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+
+ conf = this->private;
+
+ GF_ASSERT (conf);
+
+ if (volname) {
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (!ret) {
+ if (volinfo->is_snap_volume) {
+ ret = -1;
+ snprintf (err_str, PATH_MAX,
+ "%s is a snap volume. Configuring "
+ "snap-max-hard-limit for a snap "
+ "volume is prohibited.", volname);
+ goto out;
+ }
+ }
+ }
+
+ if (value) {
+ /* Max limit for the system is GLUSTERD_SNAPS_MAX_HARD_LIMIT
+ * but max limit for a volume is conf->snap_max_hard_limit.
+ */
+ if (volname) {
+ max_limit = conf->snap_max_hard_limit;
+ } else {
+ max_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+ }
+ }
+
+ if ((value < 0) || (value > max_limit)) {
+ ret = -1;
+ snprintf (err_str, PATH_MAX, "Invalid snap-max-hard-limit"
+ "%"PRIu64 ". Expected range 0 - %"PRIu64,
+ value, max_limit);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (ret) {
+ *op_errstr = gf_strdup (err_str);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+ }
+ return ret;
+}
+
+int
+glusterd_snapshot_config_prevalidate (dict_t *dict, char **op_errstr)
+{
+ char *volname = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ xlator_t *this = NULL;
+ int ret = -1;
+ int config_command = 0;
+ char err_str[PATH_MAX] = {0,};
+ glusterd_conf_t *conf = NULL;
+ uint64_t value = 0;
+ uint64_t hard_limit = 0;
+ uint64_t soft_limit = 0;
+ gf_loglevel_t loglevel = GF_LOG_ERROR;
+ uint64_t max_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+
+ this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+
+ conf = this->private;
+
+ GF_ASSERT (conf);
+
+ ret = dict_get_int32 (dict, "config-command", &config_command);
+ if (ret) {
+ snprintf (err_str, sizeof (err_str),
+ "failed to get config-command type");
+ goto out;
+ }
+
+ ret = dict_get_uint64 (dict, "snap-max-hard-limit", &hard_limit);
+
+ ret = dict_get_uint64 (dict, "snap-max-soft-limit", &soft_limit);
+
+ ret = dict_get_str (dict, "volname", &volname);
+
+ if (volname) {
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ snprintf (err_str, sizeof (err_str),
+ "Volume %s does not exist.", volname);
+ goto out;
+ }
+ }
+
+ switch (config_command) {
+ case GF_SNAP_CONFIG_TYPE_SET:
+ if (hard_limit) {
+ /* Validations for snap-max-hard-limits */
+ ret = snap_max_hard_limits_validate (dict, volname,
+ hard_limit, op_errstr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "snap-max-hard-limit validation "
+ "failed.");
+ goto out;
+ }
+ }
+
+ if (soft_limit) {
+ max_limit = GLUSTERD_SNAPS_MAX_SOFT_LIMIT_PERCENT;
+ if ((soft_limit < 0) || (soft_limit > max_limit)) {
+ ret = -1;
+ snprintf (err_str, PATH_MAX, "Invalid "
+ "snap-max-soft-limit ""%"
+ PRIu64 ". Expected range 0 - %"PRIu64,
+ value, max_limit);
+ goto out;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ ret = 0;
+out:
+
+ if (ret && err_str[0] != '\0') {
+ gf_log (this->name, loglevel, "%s", err_str);
+ *op_errstr = gf_strdup (err_str);
+ }
+
+ return ret;
+}
+
+int
+glusterd_snap_create_pre_val_use_rsp_dict (dict_t *dst, dict_t *src)
+{
+ char *snap_brick_dir = NULL;
+ char *snap_device = NULL;
+ char *tmpstr = NULL;
+ char key[PATH_MAX] = "";
+ char snapbrckcnt[PATH_MAX] = "";
+ char snapbrckord[PATH_MAX] = "";
+ int ret = -1;
+ int64_t i = -1;
+ int64_t j = -1;
+ int64_t volume_count = 0;
+ int64_t brick_count = 0;
+ int64_t brick_order = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (dst);
+ GF_ASSERT (src);
+
+ ret = dict_get_int64 (src, "volcount", &volume_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
+ "get the volume count");
+ goto out;
+ }
+
+ for (i = 0; i < volume_count; i++) {
+ memset (snapbrckcnt, '\0', sizeof(snapbrckcnt));
+ ret = snprintf (snapbrckcnt, sizeof(snapbrckcnt) - 1,
+ "vol%"PRId64"_brickcount", i+1);
+ ret = dict_get_int64 (src, snapbrckcnt, &brick_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_TRACE,
+ "No bricks for this volume in this dict");
+ continue;
+ }
+
+ for (j = 0; j < brick_count; j++) {
+ /* Fetching data from source dict */
+ snprintf (key, sizeof(key) - 1,
+ "vol%"PRId64".brickdir%"PRId64, i+1, j);
+
+ ret = dict_get_ptr (src, key,
+ (void **)&snap_brick_dir);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unable to fetch %s", key);
+ continue;
+ }
+
+ snprintf (key, sizeof(key) - 1,
+ "vol%"PRId64".brick_snapdevice%"PRId64, i+1, j);
+
+ ret = dict_get_ptr (src, key,
+ (void **)&snap_device);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to fetch snap_device");
+ goto out;
+ }
+
+ snprintf (snapbrckord, sizeof(snapbrckord) - 1,
+ "vol%"PRId64".brick%"PRId64".order", i+1, j);
+
+ ret = dict_get_int64 (src, snapbrckord, &brick_order);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get brick order");
+ goto out;
+ }
+
+ /* Adding the data in the dst dict */
+ snprintf (key, sizeof(key) - 1,
+ "vol%"PRId64".brickdir%"PRId64, i+1, brick_order);
+
+ tmpstr = gf_strdup (snap_brick_dir);
+ if (!tmpstr) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Out Of Memory");
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_dynstr (dst, key, tmpstr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set %s", key);
+ GF_FREE (tmpstr);
+ goto out;
+ }
+
+ snprintf (key, sizeof(key) - 1,
+ "vol%"PRId64".brick_snapdevice%"PRId64,
+ i+1, brick_order);
+
+ tmpstr = gf_strdup (snap_device);
+ if (!tmpstr) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_dynstr (dst, key, tmpstr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set %s", key);
+ GF_FREE (tmpstr);
+ goto out;
+ }
+ }
+ }
+
+ ret = 0;
+out:
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_snap_pre_validate_use_rsp_dict (dict_t *dst, dict_t *src)
+{
+ int ret = -1;
+ int32_t snap_command = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ if (!dst || !src) {
+ gf_log (this->name, GF_LOG_ERROR, "Source or Destination "
+ "dict is empty.");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dst, "type", &snap_command);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "unable to get the type of "
+ "the snapshot command");
+ goto out;
+ }
+
+ switch (snap_command) {
+ case GF_SNAP_OPTION_TYPE_CREATE:
+ ret = glusterd_snap_create_pre_val_use_rsp_dict (dst, src);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to use "
+ "rsp dict");
+ goto out;
+ }
+ break;
+ default:
+ break;
+ }
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_snapshot_create_prevalidate (dict_t *dict, char **op_errstr,
+ dict_t *rsp_dict)
+{
+ char *volname = NULL;
+ char *snapname = NULL;
+ char *device = NULL;
+ char *tmpstr = NULL;
+ char *brick_dir = NULL;
+ char snap_brick_dir[PATH_MAX] = "";
+ char *mnt_pt = NULL;
+ char key[PATH_MAX] = "";
+ char snap_mount[PATH_MAX] = "";
+ char snap_volname[64] = "";
+ char err_str[PATH_MAX] = "";
+ int ret = -1;
+ int64_t i = 0;
+ int64_t volcount = 0;
+ int64_t brick_count = 0;
+ int64_t brick_order = 0;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ xlator_t *this = NULL;
+ uuid_t *snap_volid = NULL;
+ gf_loglevel_t loglevel = GF_LOG_ERROR;
+ glusterd_conf_t *conf = NULL;
+ int64_t effective_max_limit = 0;
+
+ this = THIS;
+ GF_ASSERT (op_errstr);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ ret = dict_get_int64 (dict, "volcount", &volcount);
+ if (ret) {
+ snprintf (err_str, sizeof (err_str), "Failed to "
+ "get the volume count");
+ goto out;
+ }
+ if (volcount <= 0) {
+ snprintf (err_str, sizeof (err_str), "Invalid volume count %"PRId64
+ " supplied", volcount);
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "snapname", &snapname);
+ if (ret) {
+ snprintf (err_str, sizeof (err_str), "Failed to get snapname");
+ goto out;
+ }
+
+ if (glusterd_find_snap_by_name (snapname)) {
+ ret = -1;
+ snprintf (err_str, sizeof (err_str), "Snap %s already exists",
+ snapname);
+ goto out;
+ }
+
+ for (i = 1; i <= volcount; i++) {
+ snprintf (key, sizeof (key), "volname%"PRId64, i);
+ ret = dict_get_str (dict, key, &volname);
+ if (ret) {
+ snprintf (err_str, sizeof (err_str),
+ "failed to get volume name");
+ goto out;
+ }
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ snprintf (err_str, sizeof (err_str),
+ "Volume (%s) does not exist ", volname);
+ goto out;
+ }
+
+ ret = -1;
+ if (!glusterd_is_volume_started (volinfo)) {
+ snprintf (err_str, sizeof (err_str), "volume %s is "
+ "not started", volinfo->volname);
+ loglevel = GF_LOG_WARNING;
+ goto out;
+ }
+ if (glusterd_is_defrag_on (volinfo)) {
+ snprintf (err_str, sizeof (err_str),
+ "rebalance process is running for the "
+ "volume %s", volname);
+ loglevel = GF_LOG_WARNING;
+ goto out;
+ }
+ /* TODO: Also check whether geo replication is running */
+
+ if (volinfo->is_snap_volume == _gf_true) {
+ snprintf (err_str, sizeof (err_str),
+ "Volume %s is a snap volume", volname);
+ loglevel = GF_LOG_WARNING;
+ goto out;
+ }
+
+ if (volinfo->snap_max_hard_limit < conf->snap_max_hard_limit)
+ effective_max_limit = volinfo->snap_max_hard_limit;
+ else
+ effective_max_limit = conf->snap_max_hard_limit;
+
+ if (volinfo->snap_count >= effective_max_limit) {
+ snprintf (err_str, sizeof (err_str),
+ "The number of existing snaps has reached "
+ "the effective maximum limit of %"PRIu64" ,"
+ "for the volume %s", effective_max_limit,
+ volname);
+ loglevel = GF_LOG_WARNING;
+ goto out;
+ }
+
+ snprintf (key, sizeof(key) - 1, "vol%"PRId64"_volid", i);
+ ret = dict_get_bin (dict, key, (void **)&snap_volid);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to fetch snap_volid");
+ goto out;
+ }
+
+ /* snap volume uuid is used as lvm snapshot name.
+ This will avoid restrictions on snapshot names
+ provided by user */
+ GLUSTERD_GET_UUID_NOHYPHEN (snap_volname, *snap_volid);
+
+ brick_count = 0;
+ brick_order = 0;
+ /* Adding snap bricks mount paths to the dict */
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ if (uuid_compare (brickinfo->uuid, MY_UUID)) {
+ brick_order++;
+ continue;
+ }
+
+ if (!glusterd_is_brick_started (brickinfo)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "brick %s:%s is not started",
+ brickinfo->hostname,
+ brickinfo->path);
+ brick_order++;
+ brick_count++;
+ continue;
+ }
+
+ device = glusterd_get_brick_mount_details (brickinfo);
+ if (!device) {
+ snprintf (err_str, sizeof (err_str),
+ "getting device name for the brick "
+ "%s:%s failed", brickinfo->hostname,
+ brickinfo->path);
+ ret = -1;
+ goto out;
+ }
+
+ device = glusterd_build_snap_device_path (device,
+ snap_volname);
+ if (!device) {
+ snprintf (err_str, sizeof (err_str),
+ "cannot copy the snapshot device "
+ "name (volname: %s, snapname: %s)",
+ volinfo->volname, snapname);
+ loglevel = GF_LOG_WARNING;
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (key, sizeof(key),
+ "vol%"PRId64".brick_snapdevice%"PRId64, i,
+ brick_count);
+ ret = dict_set_dynstr (rsp_dict, key, device);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set %s", key);
+ GF_FREE (device);
+ goto out;
+ }
+
+ ret = glusterd_get_brick_root (brickinfo->path,
+ &mnt_pt);
+ if (ret) {
+ snprintf (err_str, sizeof (err_str),
+ "could not get the root of the brick path %s",
+ brickinfo->path);
+ loglevel = GF_LOG_WARNING;
+ goto out;
+ }
+ if (strncmp (brickinfo->path, mnt_pt, strlen(mnt_pt))) {
+ snprintf (err_str, sizeof (err_str),
+ "brick: %s brick mount: %s",
+ brickinfo->path, mnt_pt);
+ loglevel = GF_LOG_WARNING;
+ goto out;
+ }
+
+ brick_dir = &brickinfo->path[strlen (mnt_pt)];
+ brick_dir++;
+
+ snprintf (snap_brick_dir, sizeof (snap_brick_dir),
+ "/%s", brick_dir);
+
+ tmpstr = gf_strdup (snap_brick_dir);
+ if (!tmpstr) {
+ ret = -1;
+ goto out;
+ }
+ snprintf (key, sizeof(key), "vol%"PRId64".brickdir%"PRId64, i,
+ brick_count);
+ ret = dict_set_dynstr (rsp_dict, key, tmpstr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set %s", snap_mount);
+ goto out;
+ }
+ tmpstr = NULL;
+
+ snprintf (key, sizeof(key) - 1, "vol%"PRId64".brick%"PRId64".order",
+ i, brick_count);
+ ret = dict_set_int64 (rsp_dict, key, brick_order);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set %s", key);
+ goto out;
+ }
+
+ brick_count++;
+ brick_order++;
+ }
+ snprintf (key, sizeof(key) - 1, "vol%"PRId64"_brickcount", i);
+ ret = dict_set_int64 (rsp_dict, key, brick_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set %s",
+ key);
+ goto out;
+ }
+ }
+
+ ret = dict_set_int64 (rsp_dict, "volcount", volcount);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set volcount");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (ret)
+ GF_FREE (tmpstr);
+
+ if (ret && err_str[0] != '\0') {
+ gf_log (this->name, loglevel, "%s", err_str);
+ *op_errstr = gf_strdup (err_str);
+ }
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+glusterd_snap_t*
+glusterd_new_snap_object()
+{
+ glusterd_snap_t *snap = NULL;
+
+ snap = GF_CALLOC (1, sizeof (*snap), gf_gld_mt_snap_t);
+
+ if (snap) {
+ if (LOCK_INIT (&snap->lock)) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed initiating"
+ " snap lock");
+ GF_FREE (snap);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD (&snap->snap_list);
+ INIT_LIST_HEAD (&snap->volumes);
+ snap->snapname[0] = 0;
+ snap->snap_status = GD_SNAP_STATUS_INIT;
+ }
+
+ return snap;
+
+};
+
+/* Function glusterd_list_add_snapvol adds the volinfo object (snapshot volume)
+ to the snapshot object list and to the parent volume list */
+int32_t
+glusterd_list_add_snapvol (glusterd_volinfo_t *origin_vol,
+ glusterd_volinfo_t *snap_vol)
+{
+ int ret = -1;
+ glusterd_snap_t *snap = NULL;
+
+ GF_VALIDATE_OR_GOTO ("glusterd", origin_vol, out);
+ GF_VALIDATE_OR_GOTO ("glusterd", snap_vol, out);
+
+ snap = snap_vol->snapshot;
+ GF_ASSERT (snap);
+
+ list_add_tail (&snap_vol->vol_list, &snap->volumes);
+ LOCK (&origin_vol->lock);
+ {
+ list_add_order (&snap_vol->snapvol_list,
+ &origin_vol->snap_volumes,
+ glusterd_compare_snap_vol_time);
+ origin_vol->snap_count++;
+ }
+ UNLOCK (&origin_vol->lock);
+
+ gf_log (THIS->name, GF_LOG_DEBUG, "Snap %s added to the list",
+ snap->snapname);
+ ret = 0;
+ out:
+ return ret;
+}
+
+glusterd_snap_t*
+glusterd_find_snap_by_name (char *snapname)
+{
+ glusterd_snap_t *snap = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ priv = THIS->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (snapname);
+
+ list_for_each_entry (snap, &priv->snapshots, snap_list) {
+ if (!strcmp (snap->snapname, snapname)) {
+ gf_log (THIS->name, GF_LOG_DEBUG, "Found "
+ "snap %s (%s)", snap->snapname,
+ uuid_utoa (snap->snap_id));
+ goto out;
+ }
+ }
+ snap = NULL;
+out:
+ return snap;
+}
+
+glusterd_snap_t*
+glusterd_find_snap_by_id (uuid_t snap_id)
+{
+ glusterd_snap_t *snap = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ priv = THIS->private;
+ GF_ASSERT (priv);
+
+ if (uuid_is_null(snap_id))
+ goto out;
+
+ list_for_each_entry (snap, &priv->snapshots, snap_list) {
+ if (!uuid_compare (snap->snap_id, snap_id)) {
+ gf_log (THIS->name, GF_LOG_DEBUG, "Found "
+ "snap %s (%s)", snap->snapname,
+ uuid_utoa (snap->snap_id));
+ goto out;
+ }
+ }
+ snap = NULL;
+out:
+ return snap;
+}
+
+int
+glusterd_do_lvm_snapshot_remove (glusterd_volinfo_t *snap_vol,
+ glusterd_brickinfo_t *brickinfo,
+ const char *mount_pt, const char *snap_device)
+{
+ int ret = -1;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ runner_t runner = {0,};
+ char msg[1024] = {0, };
+ char pidfile[PATH_MAX] = {0, };
+ pid_t pid = -1;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (!brickinfo) {
+ gf_log (this->name, GF_LOG_ERROR, "brickinfo NULL");
+ goto out;
+ }
+ GF_ASSERT (snap_vol);
+ GF_ASSERT (mount_pt);
+ GF_ASSERT (snap_device);
+
+ GLUSTERD_GET_BRICK_PIDFILE (pidfile, snap_vol, brickinfo, priv);
+ if (gf_is_service_running (pidfile, &pid)) {
+ ret = kill (pid, SIGKILL);
+ if (ret && errno != ESRCH) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to kill pid "
+ "%d reason : %s", pid, strerror(errno));
+ goto out;
+ }
+ }
+
+ runinit (&runner);
+ snprintf (msg, sizeof (msg), "umount the snapshot mounted path %s",
+ mount_pt);
+ runner_add_args (&runner, "umount", mount_pt, NULL);
+ runner_log (&runner, "", GF_LOG_DEBUG, msg);
+
+ /* We need not do synclock_unlock => runner_run => synclock_lock here.
+ Because it is needed if we are running a glusterfs process in
+ runner_run, so that when the glusterfs process started wants to
+ communicate to glusterd, glusterd wont be able to respond if it
+ has held the big lock. So we do unlock, run glusterfs process
+ (thus communicate to glusterd), lock. But since this is not a
+ glusterfs command that is being run, unlocking and then relocking
+ is not needed.
+ */
+ ret = runner_run (&runner);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "unmounting the "
+ "path %s (brick: %s) failed (%s)", mount_pt,
+ brickinfo->path, strerror (errno));
+ goto out;
+ }
+
+ runinit (&runner);
+ snprintf (msg, sizeof(msg), "remove snapshot of the brick %s:%s, "
+ "device: %s", brickinfo->hostname, brickinfo->path,
+ snap_device);
+ runner_add_args (&runner, LVM_REMOVE, "-f", snap_device, NULL);
+ runner_log (&runner, "", GF_LOG_DEBUG, msg);
+
+ ret = runner_run (&runner);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "removing snapshot of the "
+ "brick (%s:%s) of device %s failed",
+ brickinfo->hostname, brickinfo->path, snap_device);
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+int32_t
+glusterd_lvm_snapshot_remove (dict_t *rsp_dict, glusterd_volinfo_t *snap_vol)
+{
+ char *mnt_pt = NULL;
+ struct mntent *entry = NULL;
+ int32_t brick_count = -1;
+ int32_t ret = -1;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ xlator_t *this = NULL;
+ FILE *mtab = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (snap_vol);
+
+ if ((snap_vol->is_snap_volume == _gf_false) &&
+ (uuid_is_null (snap_vol->restored_from_snap))) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Not a snap volume, or a restored snap volume.");
+ ret = 0;
+ goto out;
+ }
+
+ brick_count = -1;
+ list_for_each_entry (brickinfo, &snap_vol->bricks, brick_list) {
+ brick_count++;
+ if (uuid_compare (brickinfo->uuid, MY_UUID)) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "%s:%s belongs to a different node",
+ brickinfo->hostname, brickinfo->path);
+ continue;
+ }
+
+ if (brickinfo->snap_status == -1) {
+ gf_log (this->name, GF_LOG_INFO,
+ "snapshot was pending. lvm not present "
+ "for brick %s:%s of the snap %s.",
+ brickinfo->hostname, brickinfo->path,
+ snap_vol->snapshot->snapname);
+
+ if (rsp_dict &&
+ (snap_vol->is_snap_volume == _gf_true)) {
+ /* Adding missed delete to the dict */
+ ret = glusterd_add_missed_snaps_to_dict
+ (rsp_dict,
+ snap_vol,
+ brickinfo,
+ brick_count + 1,
+ GF_SNAP_OPTION_TYPE_DELETE);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add missed snapshot "
+ "info for %s:%s in the "
+ "rsp_dict", brickinfo->hostname,
+ brickinfo->path);
+ goto out;
+ }
+ }
+
+ continue;
+ }
+
+ ret = glusterd_get_brick_root (brickinfo->path, &mnt_pt);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "getting the root "
+ "of the brick for volume %s (snap %s) failed ",
+ snap_vol->volname, snap_vol->snapshot->snapname);
+ goto out;
+ }
+
+ entry = glusterd_get_mnt_entry_info (mnt_pt, mtab);
+ if (!entry) {
+ gf_log (this->name, GF_LOG_WARNING, "getting the mount"
+ " entry for the brick %s:%s of the snap %s "
+ "(volume: %s) failed", brickinfo->hostname,
+ brickinfo->path, snap_vol->snapshot->snapname,
+ snap_vol->volname);
+ ret = -1;
+ goto out;
+ }
+ ret = glusterd_do_lvm_snapshot_remove (snap_vol, brickinfo,
+ mnt_pt,
+ entry->mnt_fsname);
+ if (mtab)
+ endmntent (mtab);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
+ "remove the snapshot %s (%s)",
+ brickinfo->path, entry->mnt_fsname);
+ goto out;
+ }
+
+ }
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+
+int32_t
+glusterd_snap_volume_remove (dict_t *rsp_dict,
+ glusterd_volinfo_t *snap_vol,
+ gf_boolean_t remove_lvm,
+ gf_boolean_t force)
+{
+ int ret = -1;
+ int save_ret = 0;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_volinfo_t *origin_vol = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (snap_vol);
+
+ if (!snap_vol) {
+ gf_log(this->name, GF_LOG_WARNING, "snap_vol in NULL");
+ ret = -1;
+ goto out;
+ }
+
+ list_for_each_entry (brickinfo, &snap_vol->bricks, brick_list) {
+ if (uuid_compare (brickinfo->uuid, MY_UUID))
+ continue;
+
+ ret = glusterd_brick_stop (snap_vol, brickinfo, _gf_false);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING, "Failed to stop "
+ "brick for volume %s", snap_vol->volname);
+ save_ret = ret;
+
+ /* Continue to cleaning up the snap in case of error
+ if force flag is enabled */
+ if (!force)
+ goto out;
+ }
+ }
+
+ /* Only remove the backend lvm when required */
+ if (remove_lvm) {
+ ret = glusterd_lvm_snapshot_remove (rsp_dict, snap_vol);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING, "Failed to remove "
+ "lvm snapshot volume %s", snap_vol->volname);
+ save_ret = ret;
+ if (!force)
+ goto out;
+ }
+ }
+
+ ret = glusterd_store_delete_volume (snap_vol);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING, "Failed to remove volume %s "
+ "from store", snap_vol->volname);
+ save_ret = ret;
+ if (!force)
+ goto out;
+ }
+
+ if (!list_empty(&snap_vol->snapvol_list)) {
+ ret = glusterd_volinfo_find (snap_vol->parent_volname,
+ &origin_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "parent volinfo %s for volume %s",
+ snap_vol->parent_volname, snap_vol->volname);
+ save_ret = ret;
+ if (!force)
+ goto out;
+ }
+ origin_vol->snap_count--;
+ }
+
+ ret = glusterd_volinfo_delete (snap_vol);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING, "Failed to remove volinfo "
+ "%s ", snap_vol->volname);
+ save_ret = ret;
+ if (!force)
+ goto out;
+ }
+
+ if (save_ret)
+ ret = save_ret;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_snapobject_delete (glusterd_snap_t *snap)
+{
+ if (snap == NULL) {
+ gf_log(THIS->name, GF_LOG_WARNING, "snap is NULL");
+ return -1;
+ }
+
+ list_del_init (&snap->snap_list);
+ list_del_init (&snap->volumes);
+ if (LOCK_DESTROY(&snap->lock))
+ gf_log (THIS->name, GF_LOG_WARNING, "Failed destroying lock"
+ "of snap %s", snap->snapname);
+
+ GF_FREE (snap->description);
+ GF_FREE (snap);
+
+ return 0;
+}
+
+int32_t
+glusterd_snap_remove (dict_t *rsp_dict,
+ glusterd_snap_t *snap,
+ gf_boolean_t remove_lvm,
+ gf_boolean_t force)
+{
+ int ret = -1;
+ int save_ret = 0;
+ glusterd_volinfo_t *snap_vol = NULL;
+ glusterd_volinfo_t *tmp = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (snap);
+
+ if (!snap) {
+ gf_log(this->name, GF_LOG_WARNING, "snap is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ list_for_each_entry_safe (snap_vol, tmp, &snap->volumes, vol_list) {
+ ret = glusterd_snap_volume_remove (rsp_dict, snap_vol,
+ remove_lvm, force);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING, "Failed to remove "
+ "volinfo %s for snap %s", snap_vol->volname,
+ snap->snapname);
+ save_ret = ret;
+
+ /* Continue to cleaning up the snap in case of error
+ if force flag is enabled */
+ if (!force)
+ goto out;
+ }
+ }
+
+ ret = glusterd_store_delete_snap (snap);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING, "Failed to remove snap %s "
+ "from store", snap->snapname);
+ save_ret = ret;
+ if (!force)
+ goto out;
+ }
+
+ ret = glusterd_snapobject_delete (snap);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "Failed to delete "
+ "snap object %s", snap->snapname);
+
+ if (save_ret)
+ ret = save_ret;
+out:
+ gf_log (THIS->name, GF_LOG_TRACE, "returning %d", ret);
+ return ret;
+}
+
+static int
+glusterd_snapshot_get_snapvol_detail (dict_t *dict,
+ glusterd_volinfo_t *snap_vol,
+ char *keyprefix, int detail)
+{
+ int ret = -1;
+ int snap_limit = 0;
+ char key[PATH_MAX] = {0,};
+ char *value = NULL;
+ glusterd_volinfo_t *origin_vol = NULL;
+ glusterd_conf_t *conf = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ GF_ASSERT (dict);
+ GF_ASSERT (snap_vol);
+ GF_ASSERT (keyprefix);
+
+ /* Volume Name */
+ value = gf_strdup (snap_vol->volname);
+ if (!value)
+ goto out;
+
+ snprintf (key, sizeof (key), "%s.volname", keyprefix);
+ ret = dict_set_dynstr (dict, key, value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+ "volume name in dictionary: %s", key);
+ goto out;
+ }
+
+ /* Volume ID */
+ value = gf_strdup (uuid_utoa (snap_vol->volume_id));
+ if (NULL == value) {
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "%s.vol-id", keyprefix);
+ ret = dict_set_dynstr (dict, key, value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+ "volume id in dictionary: %s", key);
+ goto out;
+ }
+ value = NULL;
+
+ /* volume status */
+ snprintf (key, sizeof (key), "%s.vol-status", keyprefix);
+ switch (snap_vol->status) {
+ case GLUSTERD_STATUS_STARTED:
+ ret = dict_set_str (dict, key, "Started");
+ break;
+ case GLUSTERD_STATUS_STOPPED:
+ ret = dict_set_str (dict, key, "Stopped");
+ break;
+ case GD_SNAP_STATUS_NONE:
+ ret = dict_set_str (dict, key, "None");
+ break;
+ default:
+ gf_log (this->name, GF_LOG_ERROR, "Invalid volume status");
+ ret = -1;
+ goto out;
+ }
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set volume status"
+ " in dictionary: %s", key);
+ goto out;
+ }
+
+
+ ret = glusterd_volinfo_find (snap_vol->parent_volname, &origin_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the parent "
+ "volinfo for the volume %s", snap_vol->volname);
+ goto out;
+ }
+
+ /* Snaps available */
+ if (conf->snap_max_hard_limit < origin_vol->snap_max_hard_limit) {
+ snap_limit = conf->snap_max_hard_limit;
+ gf_log(this->name, GF_LOG_DEBUG, "system snap-max-hard-limit is"
+ " lesser than volume snap-max-hard-limit, "
+ "snap-max-hard-limit value is set to %d", snap_limit);
+ } else {
+ snap_limit = origin_vol->snap_max_hard_limit;
+ gf_log(this->name, GF_LOG_DEBUG, "volume snap-max-hard-limit is"
+ " lesser than system snap-max-hard-limit, "
+ "snap-max-hard-limit value is set to %d", snap_limit);
+ }
+
+ snprintf (key, sizeof (key), "%s.snaps-available", keyprefix);
+ if (snap_limit > origin_vol->snap_count)
+ ret = dict_set_int32 (dict, key,
+ snap_limit - origin_vol->snap_count);
+ else
+ ret = dict_set_int32 (dict, key, 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set available snaps");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "%s.snapcount", keyprefix);
+ ret = dict_set_int32 (dict, key, origin_vol->snap_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not save snapcount");
+ goto out;
+ }
+
+ if (!detail)
+ goto out;
+
+ /* Parent volume name */
+ value = gf_strdup (snap_vol->parent_volname);
+ if (!value)
+ goto out;
+
+ snprintf (key, sizeof (key), "%s.origin-volname", keyprefix);
+ ret = dict_set_dynstr (dict, key, value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set parent "
+ "volume name in dictionary: %s", key);
+ goto out;
+ }
+ value = NULL;
+
+ ret = 0;
+out:
+ if (value)
+ GF_FREE (value);
+
+ return ret;
+}
+
+static int
+glusterd_snapshot_get_snap_detail (dict_t *dict, glusterd_snap_t *snap,
+ char *keyprefix, glusterd_volinfo_t *volinfo)
+{
+ int ret = -1;
+ int volcount = 0;
+ char key[PATH_MAX] = {0,};
+ char *value = NULL;
+ char *timestr = NULL;
+ struct tm *tmptr = NULL;
+ glusterd_volinfo_t *snap_vol = NULL;
+ glusterd_volinfo_t *tmp_vol = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (snap);
+ GF_ASSERT (keyprefix);
+
+ /* Snap Name */
+ value = gf_strdup (snap->snapname);
+ if (!value)
+ goto out;
+
+ snprintf (key, sizeof (key), "%s.snapname", keyprefix);
+ ret = dict_set_dynstr (dict, key, value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+ "snap name in dictionary");
+ goto out;
+ }
+
+ /* Snap ID */
+ value = gf_strdup (uuid_utoa (snap->snap_id));
+ if (NULL == value) {
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "%s.snap-id", keyprefix);
+ ret = dict_set_dynstr (dict, key, value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+ "snap id in dictionary");
+ goto out;
+ }
+ value = NULL;
+
+ tmptr = localtime (&(snap->time_stamp));
+ if (NULL == tmptr) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to convert "
+ "time_t to *tm");
+ ret = -1;
+ goto out;
+ }
+
+ timestr = GF_CALLOC (1, PATH_MAX, gf_gld_mt_char);
+ if (NULL == timestr) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = strftime (timestr, PATH_MAX, "%Y-%m-%d %H:%M:%S", tmptr);
+ if (0 == ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to convert time_t "
+ "to string");
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "%s.snap-time", keyprefix);
+ ret = dict_set_dynstr (dict, key, timestr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+ "snap time stamp in dictionary");
+ goto out;
+ }
+ timestr = NULL;
+
+ /* If snap description is provided then add that into dictionary */
+ if (NULL != snap->description) {
+ value = gf_strdup (snap->description);
+ if (NULL == value) {
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "%s.snap-desc", keyprefix);
+ ret = dict_set_dynstr (dict, key, value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+ "snap description in dictionary");
+ goto out;
+ }
+ value = NULL;
+ }
+
+ snprintf (key, sizeof (key), "%s.snap-status", keyprefix);
+ switch (snap->snap_status) {
+ case GD_SNAP_STATUS_INIT:
+ ret = dict_set_str (dict, key, "Init");
+ break;
+ case GD_SNAP_STATUS_IN_USE:
+ ret = dict_set_str (dict, key, "In-use");
+ break;
+ case GD_SNAP_STATUS_DECOMMISSION:
+ ret = dict_set_str (dict, key, "Decommisioned");
+ break;
+ case GD_SNAP_STATUS_RESTORED:
+ ret = dict_set_str (dict, key, "Restored");
+ break;
+ case GD_SNAP_STATUS_NONE:
+ ret = dict_set_str (dict, key, "None");
+ break;
+ default:
+ gf_log (this->name, GF_LOG_ERROR, "Invalid snap status");
+ ret = -1;
+ goto out;
+ }
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set snap status "
+ "in dictionary");
+ goto out;
+ }
+
+ if (volinfo) {
+ volcount = 1;
+ snprintf (key, sizeof (key), "%s.vol%d", keyprefix, volcount);
+ ret = glusterd_snapshot_get_snapvol_detail (dict,
+ volinfo, key, 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "get volume detail %s for snap %s",
+ snap_vol->volname, snap->snapname);
+ goto out;
+ }
+ goto done;
+ }
+
+ list_for_each_entry_safe (snap_vol, tmp_vol, &snap->volumes, vol_list) {
+ volcount++;
+ snprintf (key, sizeof (key), "%s.vol%d", keyprefix, volcount);
+ ret = glusterd_snapshot_get_snapvol_detail (dict,
+ snap_vol, key, 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "get volume detail %s for snap %s",
+ snap_vol->volname, snap->snapname);
+ goto out;
+ }
+ }
+
+done:
+ snprintf (key, sizeof (key), "%s.vol-count", keyprefix);
+ ret = dict_set_int32 (dict, key, volcount);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set %s",
+ key);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (value)
+ GF_FREE (value);
+
+ if (timestr)
+ GF_FREE(timestr);
+
+ return ret;
+}
+
+static int
+glusterd_snapshot_get_all_snap_info (dict_t *dict)
+{
+ int ret = -1;
+ int snapcount = 0;
+ char key[PATH_MAX] = {0,};
+ glusterd_snap_t *snap = NULL;
+ glusterd_snap_t *tmp_snap = NULL;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ /* General parameter validation */
+ GF_ASSERT (dict);
+
+ list_for_each_entry_safe (snap, tmp_snap, &priv->snapshots, snap_list) {
+ snapcount++;
+ snprintf (key, sizeof (key), "snap%d", snapcount);
+ ret = glusterd_snapshot_get_snap_detail (dict, snap, key, NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "snapdetail for snap %s", snap->snapname);
+ goto out;
+ }
+ }
+
+ ret = dict_set_int32 (dict, "snap-count", snapcount);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set snapcount");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+glusterd_snapshot_get_info_by_volume (dict_t *dict, char *volname,
+ char *err_str, size_t len)
+{
+ int ret = -1;
+ int snapcount = 0;
+ int snap_limit = 0;
+ char *value = NULL;
+ char key[PATH_MAX] = "";
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_volinfo_t *snap_vol = NULL;
+ glusterd_volinfo_t *tmp_vol = NULL;
+ glusterd_conf_t *conf = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ GF_ASSERT (dict);
+ GF_ASSERT (volname);
+
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ snprintf (err_str, len, "Volume (%s) does not exist", volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+ goto out;
+ }
+
+ /* Snaps available */
+ if (conf->snap_max_hard_limit < volinfo->snap_max_hard_limit) {
+ snap_limit = conf->snap_max_hard_limit;
+ gf_log(this->name, GF_LOG_DEBUG, "system snap-max-hard-limit is"
+ " lesser than volume snap-max-hard-limit, "
+ "snap-max-hard-limit value is set to %d", snap_limit);
+ } else {
+ snap_limit = volinfo->snap_max_hard_limit;
+ gf_log(this->name, GF_LOG_DEBUG, "volume snap-max-hard-limit is"
+ " lesser than system snap-max-hard-limit, "
+ "snap-max-hard-limit value is set to %d", snap_limit);
+ }
+
+ if (snap_limit > volinfo->snap_count)
+ ret = dict_set_int32 (dict, "snaps-available",
+ snap_limit - volinfo->snap_count);
+ else
+ ret = dict_set_int32 (dict, "snaps-available", 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set available snaps");
+ goto out;
+ }
+
+ /* Origin volume name */
+ value = gf_strdup (volinfo->volname);
+ if (!value)
+ goto out;
+
+ ret = dict_set_dynstr (dict, "origin-volname", value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set parent "
+ "volume name in dictionary: %s", key);
+ goto out;
+ }
+ value = NULL;
+
+ list_for_each_entry_safe (snap_vol, tmp_vol, &volinfo->snap_volumes,
+ snapvol_list) {
+ snapcount++;
+ snprintf (key, sizeof (key), "snap%d", snapcount);
+ ret = glusterd_snapshot_get_snap_detail (dict,
+ snap_vol->snapshot,
+ key, snap_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "snapdetail for snap %s",
+ snap_vol->snapshot->snapname);
+ goto out;
+ }
+ }
+ ret = dict_set_int32 (dict, "snap-count", snapcount);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set snapcount");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (value)
+ GF_FREE (value);
+
+ return ret;
+}
+
+/* This function will be called from RPC handler routine.
+ * This function is responsible for getting the requested
+ * snapshot info into the dictionary.
+ *
+ * @param req RPC request object. Required for sending a response back.
+ * @param op glusterd operation. Required for sending a response back.
+ * @param dict pointer to dictionary which will contain both
+ * request and response key-pair values.
+ * @return -1 on error and 0 on success
+ */
+int
+glusterd_handle_snapshot_info (rpcsvc_request_t *req, glusterd_op_t op,
+ dict_t *dict, char *err_str, size_t len)
+{
+ int ret = -1;
+ int8_t snap_driven = 1;
+ char *volname = NULL;
+ char *snapname = NULL;
+ glusterd_snap_t *snap = NULL;
+ xlator_t *this = NULL;
+ int32_t cmd = GF_SNAP_INFO_TYPE_ALL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ GF_VALIDATE_OR_GOTO (this->name, req, out);
+ GF_VALIDATE_OR_GOTO (this->name, dict, out);
+
+
+ ret = dict_get_int32 (dict, "cmd", &cmd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get type "
+ "of snapshot info");
+ goto out;
+ }
+
+ switch (cmd) {
+ case GF_SNAP_INFO_TYPE_ALL:
+ {
+ ret = glusterd_snapshot_get_all_snap_info (dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get info of all snaps");
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_INFO_TYPE_SNAP:
+ {
+ ret = dict_get_str (dict, "snapname", &snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get snap name");
+ goto out;
+ }
+
+ ret = dict_set_int32 (dict, "snap-count", 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set snapcount");
+ goto out;
+ }
+
+ snap = glusterd_find_snap_by_name (snapname);
+ if (!snap) {
+ snprintf (err_str, len,
+ "Snap (%s) does not exist", snapname);
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s", err_str);
+ ret = -1;
+ goto out;
+ }
+ ret = glusterd_snapshot_get_snap_detail (dict, snap,
+ "snap1", NULL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get snap detail of snap "
+ "%s", snap->snapname);
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_INFO_TYPE_VOL:
+ {
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get volname");
+ goto out;
+ }
+ ret = glusterd_snapshot_get_info_by_volume (dict,
+ volname, err_str, len);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get volume info of volume "
+ "%s", volname);
+ goto out;
+ }
+ snap_driven = 0;
+ break;
+ }
+ }
+
+ ret = dict_set_int8 (dict, "snap-driven", snap_driven);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set snap-driven");
+ goto out;
+ }
+
+ /* If everything is successful then send the response back to cli.
+ * In case of failure the caller of this function will take care
+ of the response */
+ ret = glusterd_op_send_cli_response (op, 0, 0, req, dict, err_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to send cli "
+ "response");
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/* This function sets all the snapshot names in the dictionary */
+int
+glusterd_snapshot_get_all_snapnames (dict_t *dict)
+{
+ int ret = -1;
+ int snapcount = 0;
+ char *snapname = NULL;
+ char key[PATH_MAX] = {0,};
+ glusterd_snap_t *snap = NULL;
+ glusterd_snap_t *tmp_snap = NULL;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (dict);
+
+ list_for_each_entry_safe (snap, tmp_snap, &priv->snapshots, snap_list) {
+ snapcount++;
+ snapname = gf_strdup (snap->snapname);
+ if (!snapname) {
+ gf_log (this->name, GF_LOG_ERROR, "strdup failed");
+ ret = -1;
+ goto out;
+ }
+ snprintf (key, sizeof (key), "snapname%d", snapcount);
+ ret = dict_set_dynstr (dict, key, snapname);
+ if (ret) {
+ GF_FREE (snapname);
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set %s",
+ key);
+ goto out;
+ }
+ }
+
+ ret = dict_set_int32 (dict, "snap-count", snapcount);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set snapcount");
+ goto out;
+ }
+
+ ret = 0;
+out:
+
+ return ret;
+}
+
+/* This function sets all the snapshot names
+ under a given volume in the dictionary */
+int
+glusterd_snapshot_get_vol_snapnames (dict_t *dict, glusterd_volinfo_t *volinfo)
+{
+ int ret = -1;
+ int snapcount = 0;
+ char *snapname = NULL;
+ char key[PATH_MAX] = {0,};
+ glusterd_volinfo_t *snap_vol = NULL;
+ glusterd_volinfo_t *tmp_vol = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (dict);
+ GF_ASSERT (volinfo);
+
+ list_for_each_entry_safe (snap_vol, tmp_vol,
+ &volinfo->snap_volumes, snapvol_list) {
+ snapcount++;
+ snapname = gf_strdup (snap_vol->snapshot->snapname);
+ if (!snapname) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "strdup failed");
+ ret = -1;
+ goto out;
+ }
+ snprintf (key, sizeof (key), "snapname%d", snapcount);
+ ret = dict_set_dynstr (dict, key, snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "set %s", key);
+ GF_FREE (snapname);
+ goto out;
+ }
+ }
+
+ ret = dict_set_int32 (dict, "snap-count", snapcount);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set snapcount");
+ goto out;
+ }
+
+ ret = 0;
+out:
+
+ return ret;
+}
+
+int
+glusterd_handle_snapshot_list (rpcsvc_request_t *req, glusterd_op_t op,
+ dict_t *dict, char *err_str, size_t len)
+{
+ int ret = -1;
+ char *volname = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+
+ GF_VALIDATE_OR_GOTO (this->name, req, out);
+ GF_VALIDATE_OR_GOTO (this->name, dict, out);
+
+ /* Ignore error for getting volname as it is optional */
+ ret = dict_get_str (dict, "volname", &volname);
+
+ if (NULL == volname) {
+ ret = glusterd_snapshot_get_all_snapnames (dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get snapshot list");
+ goto out;
+ }
+ } else {
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ snprintf (err_str, len,
+ "Volume (%s) does not exist", volname);
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s", err_str);
+ goto out;
+ }
+
+ ret = glusterd_snapshot_get_vol_snapnames (dict, volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get snapshot list for volume %s",
+ volname);
+ goto out;
+ }
+ }
+
+ /* If everything is successful then send the response back to cli.
+ In case of failure the caller of this function will take of response.*/
+ ret = glusterd_op_send_cli_response (op, 0, 0, req, dict, err_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to send cli "
+ "response");
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/* This is a snapshot create handler function. This function will be
+ * executed in the originator node. This function is responsible for
+ * calling mgmt_v3 framework to do the actual snap creation on all the bricks
+ *
+ * @param req RPC request object
+ * @param op gluster operation
+ * @param dict dictionary containing snapshot restore request
+ * @param err_str In case of an err this string should be populated
+ * @param len length of err_str buffer
+ *
+ * @return Negative value on Failure and 0 in success
+ */
+int
+glusterd_handle_snapshot_create (rpcsvc_request_t *req, glusterd_op_t op,
+ dict_t *dict, char *err_str, size_t len)
+{
+ int ret = -1;
+ char *volname = NULL;
+ char *snapname = NULL;
+ int64_t volcount = 0;
+ xlator_t *this = NULL;
+ char key[PATH_MAX] = "";
+ char *username = NULL;
+ char *password = NULL;
+ uuid_t *uuid_ptr = NULL;
+ uuid_t tmp_uuid = {0};
+ int i = 0;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (req);
+ GF_ASSERT (dict);
+ GF_ASSERT (err_str);
+
+ ret = dict_get_int64 (dict, "volcount", &volcount);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
+ "get the volume count");
+ goto out;
+ }
+ if (volcount <= 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Invalid volume count %"PRId64
+ " supplied", volcount);
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "snapname", &snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to get the snapname");
+ goto out;
+ }
+
+ if (strlen(snapname) >= GLUSTERD_MAX_SNAP_NAME) {
+ snprintf (err_str, len, "snapname cannot exceed 255 "
+ "characters");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+ ret = -1;
+ goto out;
+ }
+
+ uuid_ptr = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t);
+ if (!uuid_ptr) {
+ gf_log (this->name, GF_LOG_ERROR, "Out Of Memory");
+ ret = -1;
+ goto out;
+ }
+
+ uuid_generate (*uuid_ptr);
+ ret = dict_set_bin (dict, "snap-id", uuid_ptr, sizeof(uuid_t));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to set snap-id");
+ GF_FREE (uuid_ptr);
+ goto out;
+ }
+ uuid_ptr = NULL;
+
+ ret = dict_set_int64 (dict, "snap-time", (int64_t)time(NULL));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to set snap-time");
+ goto out;
+ }
+
+ for (i = 1; i <= volcount; i++) {
+ snprintf (key, sizeof (key), "volname%d", i);
+ ret = dict_get_str (dict, key, &volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get volume name");
+ goto out;
+ }
+
+ /* generate internal username and password for the snap*/
+ uuid_generate (tmp_uuid);
+ username = gf_strdup (uuid_utoa (tmp_uuid));
+ snprintf (key, sizeof(key), "volume%d_username", i);
+ ret = dict_set_dynstr (dict, key, username);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set snap "
+ "username for volume %s", volname);
+ GF_FREE (username);
+ goto out;
+ }
+
+ uuid_generate (tmp_uuid);
+ password = gf_strdup (uuid_utoa (tmp_uuid));
+ snprintf (key, sizeof(key), "volume%d_password", i);
+ ret = dict_set_dynstr (dict, key, password);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set snap "
+ "password for volume %s", volname);
+ GF_FREE (password);
+ goto out;
+ }
+
+ uuid_ptr = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t);
+ if (!uuid_ptr) {
+ gf_log (this->name, GF_LOG_ERROR, "Out Of Memory");
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (key, sizeof(key) - 1, "vol%d_volid", i);
+ uuid_generate (*uuid_ptr);
+ ret = dict_set_bin (dict, key, uuid_ptr, sizeof(uuid_t));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set snap_volid");
+ GF_FREE (uuid_ptr);
+ goto out;
+ }
+ }
+
+ ret = glusterd_mgmt_v3_initiate_snap_phases (req, op, dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to initiate snap "
+ "phases");
+ }
+
+out:
+ return ret;
+}
+
+/* This is a snapshot status handler function. This function will be
+ * executed in a originator node. This function is responsible for
+ * calling mgmt v3 framework to get the actual snapshot status from
+ * all the bricks
+ *
+ * @param req RPC request object
+ * @param op gluster operation
+ * @param dict dictionary containing snapshot status request
+ * @param err_str In case of an err this string should be populated
+ * @param len length of err_str buffer
+ *
+ * return : 0 in case of success.
+ * -1 in case of failure.
+ *
+ */
+int
+glusterd_handle_snapshot_status (rpcsvc_request_t *req, glusterd_op_t op,
+ dict_t *dict, char *err_str, size_t len)
+{
+ int ret = -1;
+ char *volname = NULL;
+ char *snapname = NULL;
+ char *buf = NULL;
+ glusterd_conf_t *conf = NULL;
+ xlator_t *this = NULL;
+ int32_t cmd = -1;
+ int i = 0;
+ dict_t *voldict = NULL;
+ char key[PATH_MAX] = "";
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_snap_t *snap = NULL;
+ glusterd_volinfo_t *snap_volinfo = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+
+ GF_ASSERT (conf);
+ GF_ASSERT (req);
+ GF_ASSERT (dict);
+ GF_ASSERT (err_str);
+
+ ret = dict_get_int32 (dict, "cmd", &cmd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not get status type");
+ goto out;
+ }
+ switch (cmd) {
+ case GF_SNAP_STATUS_TYPE_ALL:
+ {
+ /* IF we give "gluster snapshot status"
+ * then lock is held on all snaps.
+ * This is the place where necessary information
+ * (snapname and snapcount)is populated in dictionary
+ * for locking.
+ */
+ ++i;
+ list_for_each_entry (snap, &conf->snapshots, snap_list)
+ {
+ snprintf (key, sizeof (key), "snapname%d", i);
+ buf = gf_strdup (snap->snapname);
+ if (!buf) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_dynstr (dict, key, buf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not save snapname (%s) "
+ "in the dictionary",
+ snap->snapname);
+ GF_FREE (buf);
+ goto out;
+ }
+
+ buf = NULL;
+ i++;
+ }
+
+ ret = dict_set_int32 (dict, "snapcount", i - 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not "
+ "save snapcount in the dictionary");
+ goto out;
+ }
+ break;
+ }
+
+ case GF_SNAP_STATUS_TYPE_SNAP:
+ {
+ /* IF we give "gluster snapshot status <snapname>"
+ * then lock is held on single snap.
+ * This is the place where necessary information
+ * (snapname)is populated in dictionary
+ * for locking.
+ */
+ ret = dict_get_str (dict, "snapname", &snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to fetch snap name");
+ goto out;
+ }
+
+ snap = glusterd_find_snap_by_name (snapname);
+ if (!snap) {
+ snprintf (err_str, len, "Snap (%s)"
+ "does not exist", snapname);
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s", err_str);
+ ret = -1;
+ goto out;
+ }
+ break;
+ }
+ case GF_SNAP_STATUS_TYPE_VOL:
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to fetch volname");
+ goto out;
+ }
+
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ snprintf (err_str, len, "Volume (%s) "
+ "does not exist", volname);
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s", err_str);
+ goto out;
+ }
+
+ i = 1;
+ list_for_each_entry (snap_volinfo,
+ &volinfo->snap_volumes, snapvol_list) {
+ snprintf (key, sizeof (key), "snapname%d", i);
+
+ buf = gf_strdup
+ (snap_volinfo->snapshot->snapname);
+ if (!buf) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_dynstr (dict, key, buf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not save snapname");
+ GF_FREE (buf);
+ goto out;
+ }
+
+ buf = NULL;
+ i++;
+ }
+
+ ret = dict_set_int32 (dict, "snapcount", i-1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not save snapcount");
+ goto out;
+ }
+ break;
+ default:
+ {
+ gf_log (this->name, GF_LOG_ERROR, "Unknown type");
+ ret = -1;
+ goto out;
+ }
+ }
+
+ /* Volume lock is not necessary for snapshot status, hence
+ * turning it off
+ */
+ ret = dict_set_int8 (dict, "hold_vol_locks", 0);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Setting volume lock "
+ "flag failed");
+ goto out;
+ }
+
+ ret = glusterd_mgmt_v3_initiate_snap_phases (req, op, dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to initiate "
+ "snap phases");
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+ if (voldict) {
+ dict_unref (voldict);
+ }
+ return ret;
+}
+
+
+/* This is a snapshot restore handler function. This function will be
+ * executed in the originator node. This function is responsible for
+ * calling mgmt_v3 framework to do the actual restore on all the bricks
+ *
+ * @param req RPC request object
+ * @param op gluster operation
+ * @param dict dictionary containing snapshot restore request
+ * @param err_str In case of an err this string should be populated
+ * @param len length of err_str buffer
+ *
+ * @return Negative value on Failure and 0 in success
+ */
+int
+glusterd_handle_snapshot_restore (rpcsvc_request_t *req, glusterd_op_t op,
+ dict_t *dict, char *err_str, size_t len)
+{
+ int ret = -1;
+ char *snapname = NULL;
+ char *buf = NULL;
+ glusterd_conf_t *conf = NULL;
+ xlator_t *this = NULL;
+ glusterd_snap_t *snap = NULL;
+ glusterd_volinfo_t *snap_volinfo = NULL;
+ int32_t i = 0;
+ char key[PATH_MAX] = "";
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+
+ GF_ASSERT (conf);
+ GF_ASSERT (req);
+ GF_ASSERT (dict);
+ GF_ASSERT (err_str);
+
+ ret = dict_get_str (dict, "snapname", &snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "get snapname");
+ goto out;
+ }
+
+ snap = glusterd_find_snap_by_name (snapname);
+ if (!snap) {
+ snprintf (err_str, len, "Snap (%s) does not exist", snapname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+ ret = -1;
+ goto out;
+ }
+
+ list_for_each_entry (snap_volinfo, &snap->volumes, vol_list) {
+ i++;
+ snprintf (key, sizeof (key), "volname%d", i);
+ buf = gf_strdup (snap_volinfo->parent_volname);
+ if (!buf) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_dynstr (dict, key, buf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not set "
+ "parent volume name %s in the dict",
+ snap_volinfo->parent_volname);
+ GF_FREE (buf);
+ goto out;
+ }
+ buf = NULL;
+ }
+
+ ret = dict_set_int32 (dict, "volcount", i);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not save volume count");
+ goto out;
+ }
+
+ ret = glusterd_mgmt_v3_initiate_snap_phases (req, op, dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to initiate snap "
+ "phases");
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+glusterd_snap_t*
+glusterd_create_snap_object (dict_t *dict, dict_t *rsp_dict)
+{
+ char *snapname = NULL;
+ uuid_t *snap_id = NULL;
+ char *description = NULL;
+ glusterd_snap_t *snap = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ int ret = -1;
+ int64_t time_stamp = 0;
+
+ this = THIS;
+ priv = this->private;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (rsp_dict);
+
+ /* Fetch snapname, description, id and time from dict */
+ ret = dict_get_str (dict, "snapname", &snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to fetch snapname");
+ goto out;
+ }
+
+ /* Ignore ret value for description*/
+ ret = dict_get_str (dict, "description", &description);
+
+ ret = dict_get_bin (dict, "snap-id", (void **)&snap_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to fetch snap_id");
+ goto out;
+ }
+
+ ret = dict_get_int64 (dict, "snap-time", &time_stamp);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to fetch snap-time");
+ goto out;
+ }
+ if (time_stamp <= 0) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "Invalid time-stamp: %"PRId64,
+ time_stamp);
+ goto out;
+ }
+
+ list_for_each_entry (snap, &priv->snapshots, snap_list) {
+ if (!strcmp (snap->snapname, snapname) ||
+ !uuid_compare (snap->snap_id, *snap_id)) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Found duplicate snap %s (%s)",
+ snap->snapname, uuid_utoa (snap->snap_id));
+ ret = -1;
+ break;
+ }
+ }
+ if (ret) {
+ snap = NULL;
+ goto out;
+ }
+
+ snap = glusterd_new_snap_object ();
+ if (!snap) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not create "
+ "the snap object for snap %s", snapname);
+ goto out;
+ }
+
+ strcpy (snap->snapname, snapname);
+ uuid_copy (snap->snap_id, *snap_id);
+ snap->time_stamp = (time_t)time_stamp;
+ /* Set the status as GD_SNAP_STATUS_INIT and once the backend snapshot
+ is taken and snap is really ready to use, set the status to
+ GD_SNAP_STATUS_IN_USE. This helps in identifying the incomplete
+ snapshots and cleaning them up.
+ */
+ snap->snap_status = GD_SNAP_STATUS_INIT;
+ if (description) {
+ snap->description = gf_strdup (description);
+ if (snap->description == NULL) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Saving the Snap Description Failed");
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = glusterd_store_snap (snap);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Could not store snap"
+ "object %s", snap->snapname);
+ goto out;
+ }
+
+ list_add_order (&snap->snap_list, &priv->snapshots,
+ glusterd_compare_snap_time);
+
+ gf_log (this->name, GF_LOG_TRACE, "Snap %s added to the list",
+ snap->snapname);
+
+ ret = 0;
+
+out:
+ if (ret) {
+ if (snap)
+ glusterd_snap_remove (rsp_dict, snap,
+ _gf_true, _gf_true);
+ snap = NULL;
+ }
+
+ return snap;
+}
+
+/* Added missed_snap_entry to rsp_dict */
+int32_t
+glusterd_add_missed_snaps_to_dict (dict_t *rsp_dict,
+ glusterd_volinfo_t *snap_vol,
+ glusterd_brickinfo_t *brickinfo,
+ int32_t brick_number, int32_t op)
+{
+ char *snap_uuid = NULL;
+ char missed_snap_entry[PATH_MAX] = "";
+ char name_buf[PATH_MAX] = "";
+ int32_t missed_snap_count = -1;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (snap_vol);
+ GF_ASSERT (brickinfo);
+
+ snap_uuid = gf_strdup (uuid_utoa (snap_vol->snapshot->snap_id));
+ if (!snap_uuid) {
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (missed_snap_entry, sizeof(missed_snap_entry),
+ "%s:%s=%s:%d:%s:%d:%d", uuid_utoa(brickinfo->uuid),
+ snap_uuid, snap_vol->volname, brick_number, brickinfo->path,
+ op, GD_MISSED_SNAP_PENDING);
+
+ /* Fetch the missed_snap_count from the dict */
+ ret = dict_get_int32 (rsp_dict, "missed_snap_count",
+ &missed_snap_count);
+ if (ret) {
+ /* Initialize the missed_snap_count for the first time */
+ missed_snap_count = 0;
+ }
+
+ /* Setting the missed_snap_entry in the rsp_dict */
+ snprintf (name_buf, sizeof(name_buf), "missed_snaps_%d",
+ missed_snap_count);
+ ret = dict_set_dynstr_with_alloc (rsp_dict, name_buf,
+ missed_snap_entry);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set missed_snap_entry (%s) "
+ "in the rsp_dict.", missed_snap_entry);
+ goto out;
+ }
+ missed_snap_count++;
+
+ /* Setting the new missed_snap_count in the dict */
+ ret = dict_set_int32 (rsp_dict, "missed_snap_count",
+ missed_snap_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set missed_snap_count for %s "
+ "in the rsp_dict.", missed_snap_entry);
+ goto out;
+ }
+
+out:
+ if (snap_uuid)
+ GF_FREE (snap_uuid);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* This function is called to get the device path of the snap lvm. Usually
+ if /dev/mapper/<group-name>-<lvm-name> is the device for the lvm,
+ then the snap device will be /dev/<group-name>/<snapname>.
+ This function takes care of building the path for the snap device.
+*/
+char *
+glusterd_build_snap_device_path (char *device, char *snapname)
+{
+ char snap[PATH_MAX] = "";
+ char msg[1024] = "";
+ char volgroup[PATH_MAX] = "";
+ char *snap_device = NULL;
+ xlator_t *this = NULL;
+ runner_t runner = {0,};
+ char *ptr = NULL;
+ int ret = -1;
+
+ this = THIS;
+ GF_ASSERT (this);
+ if (!device) {
+ gf_log (this->name, GF_LOG_ERROR, "device is NULL");
+ goto out;
+ }
+ if (!snapname) {
+ gf_log (this->name, GF_LOG_ERROR, "snapname is NULL");
+ goto out;
+ }
+
+ runinit (&runner);
+ runner_add_args (&runner, "/sbin/lvs", "--noheadings", "-o", "vg_name",
+ device, NULL);
+ runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+ snprintf (msg, sizeof (msg), "Get volume group for device %s", device);
+ runner_log (&runner, this->name, GF_LOG_DEBUG, msg);
+ ret = runner_start (&runner);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get volume group "
+ "for device %s", device);
+ runner_end (&runner);
+ goto out;
+ }
+ ptr = fgets(volgroup, sizeof(volgroup),
+ runner_chio (&runner, STDOUT_FILENO));
+ if (!ptr || !strlen(volgroup)) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get volume group "
+ "for snap %s", snapname);
+ runner_end (&runner);
+ ret = -1;
+ goto out;
+ }
+ runner_end (&runner);
+
+ snprintf (snap, sizeof(snap), "/dev/%s/%s", gf_trim(volgroup),
+ snapname);
+ snap_device = gf_strdup (snap);
+ if (!snap_device) {
+ gf_log (this->name, GF_LOG_WARNING, "Cannot copy the "
+ "snapshot device name for snapname: %s)", snapname);
+ }
+
+out:
+ return snap_device;
+}
+
+/* This function actually calls the command (or the API) for taking the
+ snapshot of the backend brick filesystem. If this is successful,
+ then call the glusterd_snap_create function to create the snap object
+ for glusterd
+*/
+char *
+glusterd_take_lvm_snapshot (glusterd_volinfo_t *snap_vol,
+ glusterd_brickinfo_t *brickinfo)
+{
+ char msg[NAME_MAX] = "";
+ char buf[PATH_MAX] = "";
+ char *snap_device = NULL;
+ char *ptr = NULL;
+ char *device = NULL;
+ int ret = -1;
+ gf_boolean_t match = _gf_false;
+ runner_t runner = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+
+ if (!brickinfo) {
+ gf_log (this->name, GF_LOG_ERROR, "brickinfo NULL");
+ goto out;
+ }
+
+ device = glusterd_get_brick_mount_details (brickinfo);
+ if (!device) {
+ gf_log (this->name, GF_LOG_ERROR, "getting device name for "
+ "the brick %s:%s failed", brickinfo->hostname,
+ brickinfo->path);
+ goto out;
+ }
+
+ /* Figuring out if setactivationskip flag is supported or not */
+ runinit (&runner);
+ snprintf (msg, sizeof (msg), "running lvcreate help");
+ runner_add_args (&runner, LVM_CREATE, "--help", NULL);
+ runner_log (&runner, "", GF_LOG_DEBUG, msg);
+ runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+ ret = runner_start (&runner);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to run lvcreate help");
+ runner_end (&runner);
+ goto out;
+ }
+
+ /* Looking for setactivationskip in lvcreate --help */
+ do {
+ ptr = fgets(buf, sizeof(buf),
+ runner_chio (&runner, STDOUT_FILENO));
+ if (ptr) {
+ if (strstr(buf, "setactivationskip")) {
+ match = _gf_true;
+ break;
+ }
+ }
+ } while (ptr != NULL);
+ runner_end (&runner);
+
+ /* Takng the actual snapshot */
+ runinit (&runner);
+ snprintf (msg, sizeof (msg), "taking snapshot of the brick %s:%s",
+ brickinfo->hostname, brickinfo->path);
+ if (match == _gf_true)
+ runner_add_args (&runner, LVM_CREATE, "-s", device,
+ "--setactivationskip", "n", "--name",
+ snap_vol->volname, NULL);
+ else
+ runner_add_args (&runner, LVM_CREATE, "-s", device,
+ "--name", snap_vol->volname, NULL);
+ runner_log (&runner, "", GF_LOG_DEBUG, msg);
+ ret = runner_start (&runner);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "taking snapshot of the "
+ "brick (%s:%s) of device %s failed",
+ brickinfo->hostname, brickinfo->path, device);
+ runner_end (&runner);
+ goto out;
+ }
+ runner_end (&runner);
+
+ snap_device = glusterd_build_snap_device_path (device,
+ snap_vol->volname);
+ if (!snap_device) {
+ gf_log (this->name, GF_LOG_WARNING, "Cannot copy the snapshot "
+ "device name for snap %s (volume id: %s)",
+ snap_vol->snapshot->snapname, snap_vol->volname);
+ ret = -1;
+ goto out;
+ }
+
+out:
+ return snap_device;
+}
+
+int32_t
+glusterd_snap_brick_create (char *device, glusterd_volinfo_t *snap_volinfo,
+ glusterd_brickinfo_t *original_brickinfo,
+ int32_t brick_count, char *snap_brick_dir)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ char snap_brick_mount_path[PATH_MAX] = "";
+ char snap_brick_path[PATH_MAX] = "";
+ struct stat statbuf = {0, };
+
+ this = THIS;
+ priv = this->private;
+
+ GF_ASSERT (device);
+ GF_ASSERT (snap_volinfo);
+ GF_ASSERT (original_brickinfo);
+ GF_ASSERT (snap_brick_dir);
+
+ snprintf (snap_brick_mount_path, sizeof (snap_brick_mount_path),
+ "%s/%s/brick%d", snap_mount_folder, snap_volinfo->volname,
+ brick_count+1);
+
+ snprintf (snap_brick_path, sizeof (snap_brick_path), "%s%s",
+ snap_brick_mount_path, snap_brick_dir);
+
+ ret = mkdir_p (snap_brick_mount_path, 0777, _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "creating the brick directory"
+ " %s for the snapshot %s(device: %s) failed",
+ snap_brick_mount_path, snap_volinfo->volname, device);
+ goto out;
+ }
+ /* mount the snap logical device on the directory inside
+ /run/gluster/snaps/<snapname>/@snap_brick_mount_path
+ Way to mount the snap brick via mount api is this.
+ ret = mount (device, snap_brick_mount_path, entry->mnt_type,
+ MS_MGC_VAL, "nouuid");
+ But for now, mounting using runner apis.
+ */
+ ret = glusterd_mount_lvm_snapshot (device, snap_brick_mount_path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to mount lvm snapshot.");
+ goto out;
+ }
+
+ ret = stat (snap_brick_path, &statbuf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "stat of the brick %s"
+ "(brick mount: %s) failed (%s)", snap_brick_path,
+ snap_brick_mount_path, strerror (errno));
+ goto out;
+ }
+ ret = sys_lsetxattr (snap_brick_path,
+ GF_XATTR_VOL_ID_KEY,
+ snap_volinfo->volume_id, 16,
+ XATTR_REPLACE);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+ "extended attribute %s on %s. Reason: "
+ "%s, snap: %s", GF_XATTR_VOL_ID_KEY,
+ snap_brick_path, strerror (errno),
+ snap_volinfo->volname);
+ goto out;
+ }
+
+out:
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "unmounting the snap brick"
+ " mount %s", snap_brick_mount_path);
+#if !defined(GF_DARWIN_HOST_OS)
+ umount (snap_brick_mount_path);
+#else
+ unmount (snap_brick_mount_path, 0);
+#endif
+ }
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+static int32_t
+glusterd_add_bricks_to_snap_volume (dict_t *dict, dict_t *rsp_dict,
+ glusterd_volinfo_t *snap_vol,
+ glusterd_brickinfo_t *original_brickinfo,
+ glusterd_brickinfo_t *snap_brickinfo,
+ char **snap_brick_dir, int64_t volcount,
+ int32_t brick_count)
+{
+ char key[PATH_MAX] = "";
+ char snap_brick_path[PATH_MAX] = "";
+ char *snap_device = NULL;
+ gf_boolean_t add_missed_snap = _gf_false;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (snap_vol);
+ GF_ASSERT (original_brickinfo);
+ GF_ASSERT (snap_brickinfo);
+ GF_ASSERT (snap_brick_dir);
+
+ snprintf (key, sizeof(key) - 1, "vol%"PRId64".brickdir%d", volcount,
+ brick_count);
+ ret = dict_get_ptr (dict, key, (void **)snap_brick_dir);
+ if (ret) {
+ /* Using original brickinfo here because it will be a
+ * pending snapshot and storing the original brickinfo
+ * will help in mapping while recreating the missed snapshot
+ */
+ gf_log (this->name, GF_LOG_WARNING, "Unable to fetch "
+ "snap mount path (%s). Using original brickinfo", key);
+ snap_brickinfo->snap_status = -1;
+ strcpy (snap_brick_path, original_brickinfo->path);
+
+ /* In origiator node add snaps missed
+ * from different nodes to the dict
+ */
+ if (is_origin_glusterd (dict) == _gf_true)
+ add_missed_snap = _gf_true;
+ } else {
+ /* Create brick-path in the format /var/run/gluster/snaps/ *
+ * <snap-uuid>/<original-brick#>/snap-brick-dir *
+ */
+ snprintf (snap_brick_path, sizeof(snap_brick_path),
+ "%s/%s/brick%d%s", snap_mount_folder,
+ snap_vol->volname, brick_count+1,
+ *snap_brick_dir);
+ }
+
+ if ((snap_brickinfo->snap_status != -1) &&
+ (!uuid_compare (original_brickinfo->uuid, MY_UUID)) &&
+ (!glusterd_is_brick_started (original_brickinfo))) {
+ /* In case if the brick goes down after prevalidate. */
+ gf_log (this->name, GF_LOG_WARNING, "brick %s:%s is not"
+ " started (snap: %s)",
+ original_brickinfo->hostname,
+ original_brickinfo->path,
+ snap_vol->snapshot->snapname);
+
+ snap_brickinfo->snap_status = -1;
+ strcpy (snap_brick_path, original_brickinfo->path);
+ add_missed_snap = _gf_true;
+ }
+
+ if (add_missed_snap) {
+ ret = glusterd_add_missed_snaps_to_dict (rsp_dict,
+ snap_vol,
+ original_brickinfo,
+ brick_count + 1,
+ GF_SNAP_OPTION_TYPE_CREATE);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to add missed"
+ " snapshot info for %s:%s in the rsp_dict",
+ original_brickinfo->hostname,
+ original_brickinfo->path);
+ goto out;
+ }
+ }
+
+ snprintf (key, sizeof(key), "vol%"PRId64".brick_snapdevice%d",
+ volcount, brick_count);
+ ret = dict_get_ptr (dict, key, (void **)&snap_device);
+ if (ret) {
+ /* If the device name is empty, so will be the brick path
+ * Hence the missed snap has already been added above
+ */
+ gf_log (this->name, GF_LOG_ERROR, "Unable to fetch "
+ "snap device (%s). Leaving empty", key);
+ } else
+ strcpy (snap_brickinfo->device_path, snap_device);
+
+ ret = gf_canonicalize_path (snap_brick_path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to canonicalize path");
+ goto out;
+ }
+
+ strcpy (snap_brickinfo->hostname, original_brickinfo->hostname);
+ strcpy (snap_brickinfo->path, snap_brick_path);
+ uuid_copy (snap_brickinfo->uuid, original_brickinfo->uuid);
+ /* AFR changelog names are based on brick_id and hence the snap
+ * volume's bricks must retain the same ID */
+ strcpy (snap_brickinfo->brick_id, original_brickinfo->brick_id);
+ list_add_tail (&snap_brickinfo->brick_list, &snap_vol->bricks);
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+static int32_t
+glusterd_take_brick_snapshot (glusterd_volinfo_t *origin_vol,
+ glusterd_volinfo_t *snap_vol, dict_t *rsp_dict,
+ glusterd_brickinfo_t *original_brickinfo,
+ glusterd_brickinfo_t *snap_brickinfo,
+ char *snap_brick_dir, int32_t brick_count)
+{
+ char *device = NULL;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (origin_vol);
+ GF_ASSERT (snap_vol);
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (original_brickinfo);
+ GF_ASSERT (snap_brickinfo);
+ GF_ASSERT (snap_brick_dir);
+
+ device = glusterd_take_lvm_snapshot (snap_vol, original_brickinfo);
+ /* Fail the snapshot even though snapshot on one of
+ the bricks fails. At the end when we check whether
+ the snapshot volume meets quorum or not, then the
+ the snapshot can either be treated as success, or
+ in case of failure we can undo the changes and return
+ failure to cli. */
+ if (!device) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to take snapshot of %s:%s",
+ original_brickinfo->hostname,
+ original_brickinfo->path);
+ goto out;
+ }
+
+ /* create the complete brick here */
+ ret = glusterd_snap_brick_create (device, snap_vol,
+ original_brickinfo,
+ brick_count, snap_brick_dir);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "not able to"
+ " create the brickinfo for the snap %s"
+ ", volume %s", snap_vol->snapshot->snapname,
+ origin_vol->volname);
+ goto out;
+ }
+
+out:
+ if (device)
+ GF_FREE (device);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+glusterd_volinfo_t *
+glusterd_do_snap_vol (glusterd_volinfo_t *origin_vol, glusterd_snap_t *snap,
+ dict_t *dict, dict_t *rsp_dict, int64_t volcount)
+{
+ char key[PATH_MAX] = "";
+ char *snap_brick_dir = NULL;
+ char *username = NULL;
+ char *password = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_conf_t *priv = NULL;
+ glusterd_volinfo_t *snap_vol = NULL;
+ uuid_t *snap_volid = NULL;
+ int32_t ret = -1;
+ int32_t brick_count = 0;
+ glusterd_brickinfo_t *snap_brickinfo = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (dict);
+ GF_ASSERT (origin_vol);
+ GF_ASSERT (rsp_dict);
+
+ /* fetch username, password and vol_id from dict*/
+ snprintf (key, sizeof(key), "volume%"PRId64"_username", volcount);
+ ret = dict_get_str (dict, key, &username);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get %s for "
+ "snap %s", key, snap->snapname);
+ goto out;
+ }
+
+ snprintf (key, sizeof(key), "volume%"PRId64"_password", volcount);
+ ret = dict_get_str (dict, key, &password);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get %s for "
+ "snap %s", key, snap->snapname);
+ goto out;
+ }
+
+ snprintf (key, sizeof(key) - 1, "vol%"PRId64"_volid", volcount);
+ ret = dict_get_bin (dict, key, (void **)&snap_volid);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to fetch snap_volid");
+ goto out;
+ }
+
+ /* We are not setting the username and password here as
+ * we need to set the user name and password passed in
+ * the dictionary
+ */
+ ret = glusterd_volinfo_dup (origin_vol, &snap_vol, _gf_false);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to duplicate volinfo "
+ "for the snapshot %s", snap->snapname);
+ goto out;
+ }
+
+ /* uuid is used as lvm snapshot name.
+ This will avoid restrictions on snapshot names provided by user */
+ GLUSTERD_GET_UUID_NOHYPHEN (snap_vol->volname, *snap_volid);
+ uuid_copy (snap_vol->volume_id, *snap_volid);
+ snap_vol->is_snap_volume = _gf_true;
+ strcpy (snap_vol->parent_volname, origin_vol->volname);
+ snap_vol->snapshot = snap;
+
+ glusterd_auth_set_username (snap_vol, username);
+ glusterd_auth_set_password (snap_vol, password);
+
+ /* Adding snap brickinfos to the snap volinfo */
+ brick_count = 0;
+ list_for_each_entry (brickinfo, &origin_vol->bricks, brick_list) {
+ snap_brickinfo = NULL;
+
+ ret = glusterd_brickinfo_new (&snap_brickinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "initializing the brick for the snap "
+ "volume failed (snapname: %s)", snap->snapname);
+ goto out;
+ }
+
+ ret = glusterd_add_bricks_to_snap_volume (dict, rsp_dict,
+ snap_vol,
+ brickinfo,
+ snap_brickinfo,
+ &snap_brick_dir,
+ volcount,
+ brick_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add the snap brick for "
+ "%s:%s to the snap volume",
+ brickinfo->hostname, brickinfo->path);
+ GF_FREE (snap_brickinfo);
+ goto out;
+ }
+
+ /* Take snapshot of the brick */
+ if ((uuid_compare (brickinfo->uuid, MY_UUID)) ||
+ (snap_brickinfo->snap_status == -1)) {
+ brick_count++;
+ continue;
+ }
+
+ ret = glusterd_take_brick_snapshot (origin_vol, snap_vol,
+ rsp_dict, brickinfo,
+ snap_brickinfo,
+ snap_brick_dir,
+ brick_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to take snapshot for %s:%s",
+ brickinfo->hostname, brickinfo->path);
+ goto out;
+ }
+ brick_count++;
+ }
+
+ /*TODO: the quorum check of the snap volume here */
+
+ ret = glusterd_store_volinfo (snap_vol,
+ GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to store snapshot "
+ "volinfo (%s) for snap %s", snap_vol->volname,
+ snap->snapname);
+ goto out;
+ }
+
+ ret = generate_brick_volfiles (snap_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "generating the brick "
+ "volfiles for the snap %s (volume: %s) failed",
+ snap->snapname, origin_vol->volname);
+ goto out;
+ }
+
+ ret = generate_client_volfiles (snap_vol, GF_CLIENT_TRUSTED);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "generating the trusted "
+ "client volfiles for the snap %s (volume: %s) failed",
+ snap->snapname, origin_vol->volname);
+ goto out;
+ }
+
+ ret = generate_client_volfiles (snap_vol, GF_CLIENT_OTHER);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "generating the client "
+ "volfiles for the snap %s (volume: %s) failed",
+ snap->snapname, origin_vol->volname);
+ goto out;
+ }
+
+ ret = glusterd_list_add_snapvol (origin_vol, snap_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "could not add the snap "
+ "volume %s to the list", snap_vol->volname);
+ goto out;
+ }
+
+ list_for_each_entry (brickinfo, &snap_vol->bricks, brick_list) {
+ if (uuid_compare (brickinfo->uuid, MY_UUID))
+ continue;
+
+ if (brickinfo->snap_status == -1) {
+ gf_log (this->name, GF_LOG_INFO,
+ "not starting snap brick %s:%s for "
+ "for the snap %s (volume: %s)",
+ brickinfo->hostname, brickinfo->path,
+ snap->snapname, origin_vol->volname);
+ continue;
+ }
+
+ ret = glusterd_brick_start (snap_vol, brickinfo, _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "starting the "
+ "brick %s:%s for the snap %s (volume: %s) "
+ "failed", brickinfo->hostname, brickinfo->path,
+ snap->snapname, origin_vol->volname);
+ goto out;
+ }
+ }
+
+ snap_vol->status = GLUSTERD_STATUS_STARTED;
+ ret = glusterd_store_volinfo (snap_vol,
+ GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to store snap volinfo");
+ goto out;
+ }
+
+out:
+ if (ret) {
+ if (snap_vol)
+ glusterd_snap_volume_remove (rsp_dict, snap_vol,
+ _gf_true, _gf_true);
+ snap_vol = NULL;
+ }
+
+ return snap_vol;
+}
+
+/* This is a snapshot remove handler function. This function will be
+ * executed in the originator node. This function is responsible for
+ * calling mgmt v3 framework to do the actual remove on all the bricks
+ *
+ * @param req RPC request object
+ * @param op gluster operation
+ * @param dict dictionary containing snapshot remove request
+ * @param err_str In case of an err this string should be populated
+ * @param len length of err_str buffer
+ *
+ * @return Negative value on Failure and 0 in success
+ */
+int
+glusterd_handle_snapshot_remove (rpcsvc_request_t *req, glusterd_op_t op,
+ dict_t *dict, char *err_str, size_t len)
+{
+ int ret = -1;
+ int64_t volcount = 0;
+ char *snapname = NULL;
+ char *volname = NULL;
+ char key[PATH_MAX] = "";
+ glusterd_snap_t *snap = NULL;
+ glusterd_volinfo_t *snap_vol = NULL;
+ glusterd_volinfo_t *tmp = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+
+ GF_ASSERT (req);
+ GF_ASSERT (dict);
+ GF_ASSERT (err_str);
+
+ ret = dict_get_str (dict, "snapname", &snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get snapname");
+ goto out;
+ }
+
+ snap = glusterd_find_snap_by_name (snapname);
+ if (!snap) {
+ snprintf (err_str, len, "Snap (%s) does not exist", snapname);
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s", err_str);
+ ret = -1;
+ goto out;
+ }
+
+ /* Set volnames in the dict to get mgmt_v3 lock */
+ list_for_each_entry_safe (snap_vol, tmp, &snap->volumes, vol_list) {
+ volcount++;
+ volname = gf_strdup (snap_vol->parent_volname);
+ if (!volname) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "strdup failed");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "volname%"PRId64, volcount);
+ ret = dict_set_dynstr (dict, key, volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+ "volume name in dictionary");
+ GF_FREE (volname);
+ goto out;
+ }
+ volname = NULL;
+ }
+ ret = dict_set_int64 (dict, "volcount", volcount);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set volcount");
+ goto out;
+ }
+
+ ret = glusterd_mgmt_v3_initiate_snap_phases (req, op, dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to initiate snap "
+ "phases");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+glusterd_snapshot_remove_prevalidate (dict_t *dict, char **op_errstr,
+ dict_t *rsp_dict)
+{
+ int32_t ret = -1;
+ char *snapname = NULL;
+ xlator_t *this = NULL;
+ glusterd_snap_t *snap = NULL;
+
+ this = THIS;
+
+ if (!dict || !op_errstr) {
+ gf_log (this->name, GF_LOG_ERROR, "input parameters NULL");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "snapname", &snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Getting the snap name "
+ "failed");
+ goto out;
+ }
+
+ snap = glusterd_find_snap_by_name (snapname);
+ if (!snap) {
+ gf_log (this->name, GF_LOG_ERROR, "Snap %s does not exist",
+ snapname);
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+glusterd_snapshot_status_prevalidate (dict_t *dict, char **op_errstr,
+ dict_t *rsp_dict)
+{
+ int ret = -1;
+ char *snapname = NULL;
+ glusterd_conf_t *conf = NULL;
+ xlator_t *this = NULL;
+ int32_t cmd = -1;
+ glusterd_volinfo_t *volinfo = NULL;
+ char *volname = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+
+ GF_ASSERT (conf);
+ GF_ASSERT (op_errstr);
+ if (!dict) {
+ gf_log (this->name, GF_LOG_ERROR, "Input dict is NULL");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "cmd", &cmd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not fetch status cmd");
+ goto out;
+ }
+
+ switch (cmd) {
+ case GF_SNAP_STATUS_TYPE_ALL:
+ {
+ break;
+ }
+ case GF_SNAP_STATUS_TYPE_SNAP:
+ {
+ ret = dict_get_str (dict, "snapname", &snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not fetch snapname");
+ goto out;
+ }
+
+ if (!glusterd_find_snap_by_name (snapname)) {
+ ret = gf_asprintf (op_errstr, "Snap (%s) "
+ "not found", snapname);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "Snap (%s) "
+ "not found", snapname);
+ goto out;
+ }
+ break;
+ }
+ case GF_SNAP_STATUS_TYPE_VOL:
+ {
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not fetch volname");
+ goto out;
+ }
+
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ ret = gf_asprintf (op_errstr, "Volume (%s)"
+ "not found", volname);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "Volume "
+ "%s not present", volname);
+ goto out;
+ }
+ break;
+
+ }
+ default:
+ {
+ gf_log (this->name, GF_LOG_ERROR, "Invalid command");
+ break;
+ }
+ }
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int32_t
+glusterd_snapshot_remove_commit (dict_t *dict, char **op_errstr,
+ dict_t *rsp_dict)
+{
+ int32_t ret = -1;
+ char *snapname = NULL;
+ char *dup_snapname = NULL;
+ glusterd_snap_t *snap = NULL;
+ glusterd_conf_t *priv = NULL;
+ glusterd_volinfo_t *snap_volinfo = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (op_errstr);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (!dict || !op_errstr) {
+ gf_log (this->name, GF_LOG_ERROR, "input parameters NULL");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "snapname", &snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Getting the snap name "
+ "failed");
+ goto out;
+ }
+
+ snap = glusterd_find_snap_by_name (snapname);
+ if (!snap) {
+ gf_log (this->name, GF_LOG_ERROR, "Snap %s does not exist",
+ snapname);
+ ret = -1;
+ goto out;
+ }
+
+ if (is_origin_glusterd (dict) == _gf_true) {
+ /* TODO : As of now there is only volume in snapshot.
+ * Change this when multiple volume snapshot is introduced
+ */
+ snap_volinfo = list_entry (snap->volumes.next,
+ glusterd_volinfo_t,
+ vol_list);
+ if (!snap_volinfo) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to fetch snap_volinfo");
+ ret = -1;
+ goto out;
+ }
+
+ /* From origin glusterd check if *
+ * any peers with snap bricks is down */
+ ret = glusterd_find_missed_snap (rsp_dict, snap_volinfo,
+ &priv->peers,
+ GF_SNAP_OPTION_TYPE_DELETE);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to find missed snap deletes");
+ goto out;
+ }
+ }
+
+ ret = glusterd_snap_remove (rsp_dict, snap, _gf_true, _gf_false);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to remove snap %s",
+ snapname);
+ goto out;
+ }
+
+ dup_snapname = gf_strdup (snapname);
+ if (!dup_snapname) {
+ gf_log (this->name, GF_LOG_ERROR, "Strdup failed");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_dynstr (rsp_dict, "snapname", dup_snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set the snapname");
+ GF_FREE (dup_snapname);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int32_t
+glusterd_do_snap_cleanup (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+ int32_t ret = -1;
+ char *name = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_snap_t *snap = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ if (!dict || !op_errstr) {
+ gf_log (this->name, GF_LOG_ERROR, "input parameters NULL");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "snapname", &name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "getting the snap "
+ "name failed (volume: %s)", volinfo->volname);
+ goto out;
+ }
+
+ /*
+ If the snapname is not found that means the failure happened at
+ staging, or in commit, before the snap object is created, in which
+ case there is nothing to cleanup. So set ret to 0.
+ */
+ snap = glusterd_find_snap_by_name (name);
+ if (!snap) {
+ gf_log (this->name, GF_LOG_INFO, "snap %s is not found", name);
+ ret = 0;
+ goto out;
+ }
+
+ ret = glusterd_snap_remove (rsp_dict, snap, _gf_true, _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "removing the snap %s failed",
+ name);
+ goto out;
+ }
+
+ name = NULL;
+
+ ret = 0;
+
+out:
+
+ return ret;
+}
+
+/* In case of a successful, delete or create operation, during post_validate *
+ * look for missed snap operations and update the missed snap lists */
+int32_t
+glusterd_snapshot_update_snaps_post_validate (dict_t *dict, char **op_errstr,
+ dict_t *rsp_dict)
+{
+ int32_t ret = -1;
+ int32_t missed_snap_count = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (op_errstr);
+
+ ret = dict_get_int32 (dict, "missed_snap_count",
+ &missed_snap_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "No missed snaps");
+ ret = 0;
+ goto out;
+ }
+
+ ret = glusterd_add_missed_snaps_to_list (dict, missed_snap_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add missed snaps to list");
+ goto out;
+ }
+
+ ret = glusterd_store_update_missed_snaps ();
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to update missed_snaps_list");
+ goto out;
+ }
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_snapshot_create_commit (dict_t *dict, char **op_errstr,
+ dict_t *rsp_dict)
+{
+ int ret = -1;
+ int64_t i = 0;
+ int64_t volcount = 0;
+ char *snapname = NULL;
+ char *volname = NULL;
+ char *tmp_name = NULL;
+ char key[PATH_MAX] = "";
+ xlator_t *this = NULL;
+ glusterd_snap_t *snap = NULL;
+ glusterd_volinfo_t *origin_vol = NULL;
+ glusterd_volinfo_t *snap_vol = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+ GF_ASSERT(dict);
+ GF_ASSERT(op_errstr);
+ GF_ASSERT(rsp_dict);
+ priv = this->private;
+ GF_ASSERT(priv);
+
+ ret = dict_get_int64 (dict, "volcount", &volcount);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
+ "get the volume count");
+ goto out;
+ }
+
+ ret = dict_get_str (dict, "snapname", &snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to fetch snapname");
+ goto out;
+ }
+ tmp_name = gf_strdup (snapname);
+ if (!tmp_name) {
+ gf_log (this->name, GF_LOG_ERROR, "Out of memory");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_dynstr (rsp_dict, "snapname", tmp_name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set snapname in rsp_dict");
+ GF_FREE (tmp_name);
+ goto out;
+ }
+ tmp_name = NULL;
+
+ snap = glusterd_create_snap_object (dict, rsp_dict);
+ if (!snap) {
+ gf_log (this->name, GF_LOG_ERROR, "creating the"
+ "snap object %s failed", snapname);
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 1; i <= volcount; i++) {
+ snprintf (key, sizeof (key), "volname%"PRId64, i);
+ ret = dict_get_str (dict, key, &volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to get volume name");
+ goto out;
+ }
+
+ ret = glusterd_volinfo_find (volname, &origin_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to get the volinfo for "
+ "the volume %s", volname);
+ goto out;
+ }
+
+ /* TODO: Create a stub where the bricks are
+ added parallely by worker threads so that
+ the snap creating happens parallely. */
+ snap_vol = glusterd_do_snap_vol (origin_vol, snap, dict,
+ rsp_dict, i);
+ if (!snap_vol) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_WARNING, "taking the "
+ "snapshot of the volume %s failed", volname);
+ goto out;
+ }
+ }
+
+ snap->snap_status = GD_SNAP_STATUS_IN_USE;
+ ret = glusterd_store_snap (snap);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Could not store snap"
+ "object %s", snap->snapname);
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+ if (ret) {
+ if (snap)
+ glusterd_snap_remove (rsp_dict, snap,
+ _gf_true, _gf_true);
+ snap = NULL;
+ }
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int
+snap_max_hard_limit_set_commit (dict_t *dict, uint64_t value,
+ char *volname, char **op_errstr)
+{
+ char err_str[PATH_MAX] = "";
+ glusterd_conf_t *conf = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (volname);
+ GF_ASSERT (op_errstr);
+
+ conf = this->private;
+
+ GF_ASSERT (conf);
+
+ /* TODO: Initiate auto deletion when there is a limit change */
+ if (!volname) {
+ /* For system limit */
+ conf->snap_max_hard_limit = value;
+
+ ret = glusterd_store_global_info (this);
+ if (ret) {
+ snprintf (err_str, PATH_MAX, "Failed to store "
+ "snap-max-hard-limit for system");
+ goto out;
+ }
+ } else {
+ /* For one volume */
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ snprintf (err_str, PATH_MAX, "Failed to get the"
+ " volinfo for volume %s", volname);
+ goto out;
+ }
+
+ volinfo->snap_max_hard_limit = value;
+
+ ret = glusterd_store_volinfo (volinfo,
+ GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+ if (ret) {
+ snprintf (err_str, PATH_MAX, "Failed to store "
+ "snap-max-hard-limit for volume %s", volname);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ if (ret) {
+ *op_errstr = gf_strdup (err_str);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+ }
+ return ret;
+}
+
+int
+snap_max_limits_display_commit (dict_t *rsp_dict, char *volname,
+ char **op_errstr)
+{
+ char err_str[PATH_MAX] = "";
+ char buf[PATH_MAX] = "";
+ glusterd_conf_t *conf = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ int ret = -1;
+ uint64_t active_hard_limit = 0;
+ uint64_t snap_max_limit = 0;
+ uint64_t soft_limit_value = -1;
+ uint64_t count = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (volname);
+ GF_ASSERT (op_errstr);
+
+ conf = this->private;
+
+ GF_ASSERT (conf);
+
+ if (!volname) {
+ /* For system limit */
+ list_for_each_entry (volinfo, &conf->volumes, vol_list) {
+ if (volinfo->is_snap_volume == _gf_true)
+ continue;
+ snap_max_limit = volinfo->snap_max_hard_limit;
+ if (snap_max_limit > conf->snap_max_hard_limit)
+ active_hard_limit = conf->snap_max_hard_limit;
+ else
+ active_hard_limit = snap_max_limit;
+ soft_limit_value = (active_hard_limit *
+ conf->snap_max_soft_limit) / 100;
+
+ snprintf (buf, sizeof(buf), "volume%"PRIu64"-volname", count);
+ ret = dict_set_str (rsp_dict, buf, volinfo->volname);
+ if (ret) {
+ snprintf (err_str, PATH_MAX,
+ "Failed to set %s", buf);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf),
+ "volume%"PRIu64"-snap-max-hard-limit", count);
+ ret = dict_set_uint64 (rsp_dict, buf, snap_max_limit);
+ if (ret) {
+ snprintf (err_str, PATH_MAX,
+ "Failed to set %s", buf);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf),
+ "volume%"PRIu64"-active-hard-limit", count);
+ ret = dict_set_uint64 (rsp_dict, buf,
+ active_hard_limit);
+ if (ret) {
+ snprintf (err_str, PATH_MAX,
+ "Failed to set %s", buf);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf),
+ "volume%"PRIu64"-snap-max-soft-limit", count);
+ ret = dict_set_uint64 (rsp_dict, buf, soft_limit_value);
+ if (ret) {
+ snprintf (err_str, PATH_MAX,
+ "Failed to set %s", buf);
+ goto out;
+ }
+ count++;
+ }
+
+ ret = dict_set_uint64 (rsp_dict, "voldisplaycount", count);
+ if (ret) {
+ snprintf (err_str, PATH_MAX,
+ "Failed to set voldisplaycount");
+ goto out;
+ }
+ } else {
+ /* For one volume */
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ snprintf (err_str, PATH_MAX, "Failed to get the"
+ " volinfo for volume %s", volname);
+ goto out;
+ }
+
+ snap_max_limit = volinfo->snap_max_hard_limit;
+ if (snap_max_limit > conf->snap_max_hard_limit)
+ active_hard_limit = conf->snap_max_hard_limit;
+ else
+ active_hard_limit = snap_max_limit;
+
+ soft_limit_value = (active_hard_limit *
+ conf->snap_max_soft_limit) / 100;
+
+ snprintf (buf, sizeof(buf), "volume%"PRIu64"-volname", count);
+ ret = dict_set_str (rsp_dict, buf, volinfo->volname);
+ if (ret) {
+ snprintf (err_str, PATH_MAX,
+ "Failed to set %s", buf);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf),
+ "volume%"PRIu64"-snap-max-hard-limit", count);
+ ret = dict_set_uint64 (rsp_dict, buf, snap_max_limit);
+ if (ret) {
+ snprintf (err_str, PATH_MAX,
+ "Failed to set %s", buf);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf),
+ "volume%"PRIu64"-active-hard-limit", count);
+ ret = dict_set_uint64 (rsp_dict, buf, active_hard_limit);
+ if (ret) {
+ snprintf (err_str, PATH_MAX,
+ "Failed to set %s", buf);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf),
+ "volume%"PRIu64"-snap-max-soft-limit", count);
+ ret = dict_set_uint64 (rsp_dict, buf, soft_limit_value);
+ if (ret) {
+ snprintf (err_str, PATH_MAX,
+ "Failed to set %s", buf);
+ goto out;
+ }
+
+ count++;
+
+ ret = dict_set_uint64 (rsp_dict, "voldisplaycount", count);
+ if (ret) {
+ snprintf (err_str, PATH_MAX,
+ "Failed to set voldisplaycount");
+ goto out;
+ }
+
+ }
+
+ ret = dict_set_uint64 (rsp_dict, "snap-max-hard-limit",
+ conf->snap_max_hard_limit);
+ if (ret) {
+ snprintf (err_str, PATH_MAX,
+ "Failed to set sys-snap-max-hard-limit ");
+ goto out;
+ }
+
+ ret = dict_set_uint64 (rsp_dict, "snap-max-soft-limit",
+ conf->snap_max_soft_limit);
+ if (ret) {
+ snprintf (err_str, PATH_MAX,
+ "Failed to set sys-snap-max-hard-limit ");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (ret) {
+ *op_errstr = gf_strdup (err_str);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+ }
+ return ret;
+}
+
+int
+glusterd_snapshot_config_commit (dict_t *dict, char **op_errstr,
+ dict_t *rsp_dict)
+{
+ char *volname = NULL;
+ xlator_t *this = NULL;
+ int ret = -1;
+ char err_str[PATH_MAX] = {0,};
+ glusterd_conf_t *conf = NULL;
+ int config_command = 0;
+ uint64_t hard_limit = 0;
+ uint64_t soft_limit = 0;
+
+ this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+
+ conf = this->private;
+
+ GF_ASSERT (conf);
+
+ ret = dict_get_int32 (dict, "config-command", &config_command);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to get config-command type");
+ goto out;
+ }
+
+ /* Ignore the return value of the following dict_get,
+ * as they are optional
+ */
+ ret = dict_get_str (dict, "volname", &volname);
+
+ ret = dict_get_uint64 (dict, "snap-max-hard-limit", &hard_limit);
+
+ ret = dict_get_uint64 (dict, "snap-max-soft-limit", &soft_limit);
+
+ switch (config_command) {
+ case GF_SNAP_CONFIG_TYPE_SET:
+ if (hard_limit) {
+ /* Commit ops for snap-max-hard-limit */
+ ret = snap_max_hard_limit_set_commit (dict, hard_limit,
+ volname,
+ op_errstr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "snap-max-hard-limit set "
+ "commit failed.");
+ goto out;
+ }
+ }
+
+ if (soft_limit) {
+ /* For system limit */
+ conf->snap_max_soft_limit = soft_limit;
+
+ ret = glusterd_store_global_info (this);
+ if (ret) {
+ snprintf (err_str, PATH_MAX, "Failed to store "
+ "snap-max-soft-limit for system");
+ *op_errstr = gf_strdup (err_str);
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ err_str);
+ goto out;
+ }
+ }
+ break;
+
+ case GF_SNAP_CONFIG_DISPLAY:
+ /* Reading data from local node only */
+ if (!is_origin_glusterd (dict)) {
+ ret = 0;
+ break;
+ }
+
+ ret = snap_max_limits_display_commit (rsp_dict, volname,
+ op_errstr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "snap-max-limit "
+ "display commit failed.");
+ goto out;
+ }
+ break;
+ default:
+ break;
+ }
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_get_brick_lvm_details (dict_t *rsp_dict,
+ glusterd_brickinfo_t *brickinfo, char *volname,
+ char *device, char *key_prefix)
+{
+
+ int ret = -1;
+ glusterd_conf_t *priv = NULL;
+ runner_t runner = {0,};
+ xlator_t *this = NULL;
+ char msg[PATH_MAX] = "";
+ char buf[PATH_MAX] = "";
+ char *ptr = NULL;
+ char *token = NULL;
+ char key[PATH_MAX] = "";
+ char *value = NULL;
+
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (brickinfo);
+ GF_ASSERT (volname);
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ device = glusterd_get_brick_mount_details (brickinfo);
+ if (!device) {
+ gf_log (this->name, GF_LOG_ERROR, "Getting device name for "
+ "the brick %s:%s failed", brickinfo->hostname,
+ brickinfo->path);
+ goto out;
+ }
+ runinit (&runner);
+ snprintf (msg, sizeof (msg), "running lvs command, "
+ "for getting snap status");
+ /* Using lvs command fetch the Volume Group name,
+ * Percentage of data filled and Logical Volume size
+ *
+ * "-o" argument is used to get the desired information,
+ * example : "lvs /dev/VolGroup/thin_vol -o vgname,lv_size",
+ * will get us Volume Group name and Logical Volume size.
+ *
+ * Here separator used is ":",
+ * for the above given command with separator ":",
+ * The output will be "vgname:lvsize"
+ */
+ runner_add_args (&runner, LVS, device, "--noheading", "-o",
+ "vg_name,data_percent,lv_size",
+ "--separator", ":", NULL);
+ runner_redir (&runner, STDOUT_FILENO, RUN_PIPE);
+ runner_log (&runner, "", GF_LOG_DEBUG, msg);
+ ret = runner_start (&runner);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not perform lvs action");
+ goto end;
+ }
+ do {
+ ptr = fgets (buf, sizeof (buf),
+ runner_chio (&runner, STDOUT_FILENO));
+
+ if (ptr == NULL)
+ break;
+ token = strtok (buf, ":");
+ if (token != NULL) {
+ while (token && token[0] == ' ')
+ token++;
+ if (!token) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Invalid vg entry");
+ goto end;
+ }
+ value = gf_strdup (token);
+ if (!value) {
+ ret = -1;
+ goto end;
+ }
+ ret = snprintf (key, sizeof (key), "%s.vgname",
+ key_prefix);
+ if (ret < 0) {
+ goto end;
+ }
+
+ ret = dict_set_dynstr (rsp_dict, key, value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not save vgname ");
+ goto end;
+ }
+ }
+
+ token = strtok (NULL, ":");
+ if (token != NULL) {
+ value = gf_strdup (token);
+ if (!value) {
+ ret = -1;
+ goto end;
+ }
+ ret = snprintf (key, sizeof (key), "%s.data",
+ key_prefix);
+ if (ret < 0) {
+ goto end;
+ }
+
+ ret = dict_set_dynstr (rsp_dict, key, value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not save data percent ");
+ goto end;
+ }
+ }
+ token = strtok (NULL, ":");
+ if (token != NULL) {
+ value = gf_strdup (token);
+ if (!value) {
+ ret = -1;
+ goto end;
+ }
+ ret = snprintf (key, sizeof (key), "%s.lvsize",
+ key_prefix);
+ if (ret < 0) {
+ goto end;
+ }
+
+ ret = dict_set_dynstr (rsp_dict, key, value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not save meta data percent ");
+ goto end;
+ }
+ }
+
+ } while (ptr != NULL);
+
+ ret = 0;
+
+end:
+ runner_end (&runner);
+
+out:
+ if (ret && value) {
+ GF_FREE (value);
+ }
+
+ return ret;
+}
+
+int
+glusterd_get_single_brick_status (char **op_errstr, dict_t *rsp_dict,
+ char *keyprefix, int index,
+ glusterd_volinfo_t *snap_volinfo,
+ glusterd_brickinfo_t *brickinfo)
+{
+ int ret = -1;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ char key[PATH_MAX] = "";
+ char *device = NULL;
+ char *value = NULL;
+ char brick_path[PATH_MAX] = "";
+ char pidfile[PATH_MAX] = "";
+ pid_t pid = -1;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (keyprefix);
+ GF_ASSERT (snap_volinfo);
+ GF_ASSERT (brickinfo);
+
+ ret = snprintf (key, sizeof (key), "%s.brick%d.path", keyprefix,
+ index);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = snprintf (brick_path, sizeof (brick_path),
+ "%s:%s", brickinfo->hostname, brickinfo->path);
+ if (ret < 0) {
+ goto out;
+ }
+
+ value = gf_strdup (brick_path);
+ if (!value) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_dynstr (rsp_dict, key, value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to store "
+ "brick_path %s", brickinfo->path);
+ goto out;
+ }
+
+ if (brickinfo->snap_status == -1) {
+ /* Setting vgname as "Pending Snapshot" */
+ value = gf_strdup ("Pending Snapshot");
+ if (!value) {
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "%s.brick%d.vgname",
+ keyprefix, index);
+ ret = dict_set_dynstr (rsp_dict, key, value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not save vgname ");
+ goto out;
+ }
+
+ ret = 0;
+ goto out;
+ }
+ value = NULL;
+
+ ret = snprintf (key, sizeof (key), "%s.brick%d.status",
+ keyprefix, index);
+ if (ret < 0) {
+ goto out;
+ }
+
+ if (brickinfo->status == GF_BRICK_STOPPED) {
+ value = gf_strdup ("No");
+ if (!value) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_str (rsp_dict, key, value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not save brick status");
+ goto out;
+ }
+ value = NULL;
+ } else {
+ value = gf_strdup ("Yes");
+ if (!value) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_str (rsp_dict, key, value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not save brick status");
+ goto out;
+ }
+ value = NULL;
+
+ GLUSTERD_GET_BRICK_PIDFILE (pidfile, snap_volinfo,
+ brickinfo, priv);
+ ret = gf_is_service_running (pidfile, &pid);
+
+ ret = snprintf (key, sizeof (key), "%s.brick%d.pid",
+ keyprefix, index);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_set_int32 (rsp_dict, key, pid);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not save pid %d", pid);
+ goto out;
+ }
+ }
+
+ ret = snprintf (key, sizeof (key), "%s.brick%d",
+ keyprefix, index);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = glusterd_get_brick_lvm_details (rsp_dict, brickinfo,
+ snap_volinfo->volname,
+ device, key);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "brick LVM details");
+ goto out;
+ }
+out:
+ if (ret && value) {
+ GF_FREE (value);
+ }
+
+ return ret;
+}
+
+int
+glusterd_get_single_snap_status (char **op_errstr, dict_t *rsp_dict,
+ char *keyprefix, glusterd_snap_t *snap)
+{
+ int ret = -1;
+ xlator_t *this = NULL;
+ char key[PATH_MAX] = "";
+ char brickkey[PATH_MAX] = "";
+ glusterd_volinfo_t *snap_volinfo = NULL;
+ glusterd_volinfo_t *tmp_volinfo = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ int volcount = 0;
+ int brickcount = 0;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (keyprefix);
+ GF_ASSERT (snap);
+
+ list_for_each_entry_safe (snap_volinfo, tmp_volinfo, &snap->volumes,
+ vol_list) {
+ ret = snprintf (key, sizeof (key), "%s.vol%d", keyprefix,
+ volcount);
+ if (ret < 0) {
+ goto out;
+ }
+ list_for_each_entry (brickinfo, &snap_volinfo->bricks,
+ brick_list) {
+ if (!glusterd_is_local_brick (this, snap_volinfo,
+ brickinfo)) {
+ brickcount++;
+ continue;
+ }
+
+ ret = glusterd_get_single_brick_status (op_errstr,
+ rsp_dict, key, brickcount,
+ snap_volinfo, brickinfo);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Getting "
+ "single snap status failed");
+ goto out;
+ }
+ brickcount++;
+ }
+ ret = snprintf (brickkey, sizeof (brickkey), "%s.brickcount",
+ key);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_set_int32 (rsp_dict, brickkey, brickcount);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not save brick count");
+ goto out;
+ }
+ volcount++;
+ }
+
+ ret = snprintf (key, sizeof (key), "%s.volcount", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_set_int32 (rsp_dict, key, volcount);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not save volcount");
+ goto out;
+ }
+
+out:
+
+ return ret;
+}
+
+int
+glusterd_get_each_snap_object_status (char **op_errstr, dict_t *rsp_dict,
+ glusterd_snap_t *snap, char *keyprefix)
+{
+ int ret = -1;
+ char key[PATH_MAX] = "";
+ char *temp = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (snap);
+ GF_ASSERT (keyprefix);
+
+ /* TODO : Get all the snap volume info present in snap object,
+ * as of now, There will be only one snapvolinfo per snap object
+ */
+ ret = snprintf (key, sizeof (key), "%s.snapname", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ temp = gf_strdup (snap->snapname);
+ if (temp == NULL) {
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_dynstr (rsp_dict, key, temp);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not save "
+ "snap name");
+ goto out;
+ }
+
+ temp = NULL;
+
+ ret = snprintf (key, sizeof (key), "%s.uuid", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ temp = gf_strdup (uuid_utoa (snap->snap_id));
+ if (temp == NULL) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_dynstr (rsp_dict, key, temp);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not save "
+ "snap UUID");
+ goto out;
+ }
+
+ temp = NULL;
+
+ ret = glusterd_get_single_snap_status (op_errstr, rsp_dict, keyprefix,
+ snap);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not get single snap status");
+ goto out;
+ }
+
+ ret = snprintf (key, sizeof (key), "%s.volcount", keyprefix);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = dict_set_int32 (rsp_dict, key, 1);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not save volcount");
+ goto out;
+ }
+out:
+ if (ret && temp)
+ GF_FREE (temp);
+
+ return ret;
+}
+
+int
+glusterd_get_snap_status_of_volume (char **op_errstr, dict_t *rsp_dict,
+ char *volname, char *keyprefix) {
+ int ret = -1;
+ glusterd_volinfo_t *snap_volinfo = NULL;
+ glusterd_volinfo_t *temp_volinfo = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ char key[PATH_MAX] = "";
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ int i = 0;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (volname);
+ GF_ASSERT (keyprefix);
+
+ ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get volinfo of "
+ "volume %s", volname);
+ goto out;
+ }
+
+ list_for_each_entry_safe (snap_volinfo, temp_volinfo,
+ &volinfo->snap_volumes, snapvol_list) {
+ ret = snprintf (key, sizeof (key), "status.snap%d", i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = glusterd_get_each_snap_object_status (op_errstr,
+ rsp_dict, snap_volinfo->snapshot, key);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Function : "
+ "glusterd_get_single_snap_status failed");
+ goto out;
+ }
+ i++;
+ }
+
+ ret = dict_set_int32 (rsp_dict, "status.snapcount", i);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to save snapcount");
+ ret = -1;
+ goto out;
+ }
+out:
+ return ret;
+}
+
+int
+glusterd_get_all_snapshot_status (dict_t *dict, char **op_errstr,
+ dict_t *rsp_dict)
+{
+ int32_t i = 0;
+ int ret = -1;
+ char key[PATH_MAX] = "";
+ glusterd_conf_t *priv = NULL;
+ glusterd_snap_t *snap = NULL;
+ glusterd_snap_t *tmp_snap = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+
+ list_for_each_entry_safe (snap, tmp_snap,
+ &priv->snapshots, snap_list) {
+ ret = snprintf (key, sizeof (key), "status.snap%d", i);
+ if (ret < 0) {
+ goto out;
+ }
+
+ ret = glusterd_get_each_snap_object_status (op_errstr,
+ rsp_dict, snap, key);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not get "
+ "the details of a snap object: %s",
+ snap->snapname);
+ goto out;
+ }
+ i++;
+ }
+
+ ret = dict_set_int32 (rsp_dict, "status.snapcount", i);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not save snapcount");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+
+int
+glusterd_snapshot_status_commit (dict_t *dict, char **op_errstr,
+ dict_t *rsp_dict)
+{
+ xlator_t *this = NULL;
+ int ret = -1;
+ glusterd_conf_t *conf = NULL;
+ char *get_buffer = NULL;
+ int32_t cmd = -1;
+ char *snapname = NULL;
+ glusterd_snap_t *snap = NULL;
+ char *volname = NULL;
+
+ this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (op_errstr);
+
+ conf = this->private;
+
+ GF_ASSERT (conf);
+ ret = dict_get_int32 (dict, "cmd", &cmd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get status cmd type");
+ goto out;
+ }
+
+ ret = dict_set_int32 (rsp_dict, "cmd", cmd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not save status cmd in rsp dictionary");
+ goto out;
+ }
+ switch (cmd) {
+ case GF_SNAP_STATUS_TYPE_ALL:
+ {
+ ret = glusterd_get_all_snapshot_status (dict, op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to "
+ "get snapshot status");
+ goto out;
+ }
+ break;
+ }
+ case GF_SNAP_STATUS_TYPE_SNAP:
+ {
+
+ ret = dict_get_str (dict, "snapname", &snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to "
+ "get snap name");
+ goto out;
+ }
+
+ snap = glusterd_find_snap_by_name (snapname);
+ if (!snap) {
+ ret = gf_asprintf (op_errstr, "Snap (%s) "
+ "not found", snapname);
+ if (ret < 0) {
+ goto out;
+ }
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "Unable to "
+ "get snap volinfo");
+ goto out;
+ }
+ ret = glusterd_get_each_snap_object_status (op_errstr,
+ rsp_dict, snap, "status.snap0");
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to "
+ "get status of snap %s", get_buffer);
+ goto out;
+ }
+ break;
+ }
+ case GF_SNAP_STATUS_TYPE_VOL:
+ {
+ ret = dict_get_str (dict, "volname", &volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to"
+ " get volume name");
+ goto out;
+ }
+
+ ret = glusterd_get_snap_status_of_volume (op_errstr,
+ rsp_dict, volname, "status.vol0");
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Function :"
+ " glusterd_get_snap_status_of_volume "
+ "failed");
+ goto out;
+ }
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+int32_t
+glusterd_snapshot_create_postvalidate (dict_t *dict, int32_t op_ret,
+ char **op_errstr, dict_t *rsp_dict)
+{
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ int ret = -1;
+
+ this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (rsp_dict);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (op_ret) {
+ ret = glusterd_do_snap_cleanup (dict, op_errstr, rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "cleanup operation "
+ "failed");
+ goto out;
+ }
+ } else {
+ ret = glusterd_snapshot_update_snaps_post_validate (dict,
+ op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "create snapshot");
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int32_t
+glusterd_snapshot (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ int32_t snap_command = 0;
+ int ret = -1;
+
+ this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (rsp_dict);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = dict_get_int32 (dict, "type", &snap_command);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "unable to get the type of "
+ "the snapshot command");
+ goto out;
+ }
+
+ switch (snap_command) {
+ case (GF_SNAP_OPTION_TYPE_CREATE):
+ ret = glusterd_snapshot_create_commit (dict, op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "create snapshot");
+ goto out;
+ }
+ break;
+
+ case GF_SNAP_OPTION_TYPE_CONFIG:
+ ret = glusterd_snapshot_config_commit (dict, op_errstr,
+ rsp_dict);
+ break;
+
+ case GF_SNAP_OPTION_TYPE_DELETE:
+ ret = glusterd_snapshot_remove_commit (dict, op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "delete snapshot");
+ goto out;
+ }
+ break;
+
+ case GF_SNAP_OPTION_TYPE_RESTORE:
+ ret = glusterd_snapshot_restore (dict, op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Failed to "
+ "restore snapshot");
+ goto out;
+ }
+
+ break;
+
+ case GF_SNAP_OPTION_TYPE_STATUS:
+ ret = glusterd_snapshot_status_commit (dict, op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "show snapshot status");
+ goto out;
+ }
+ break;
+
+
+ default:
+ gf_log (this->name, GF_LOG_WARNING, "invalid snap command");
+ goto out;
+ break;
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int
+glusterd_snapshot_brickop (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
+{
+ int ret = -1;
+ int64_t vol_count = 0;
+ int64_t count = 1;
+ char key[1024] = {0,};
+ char *volname = NULL;
+ int32_t snap_command = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (rsp_dict);
+
+ ret = dict_get_int32 (dict, "type", &snap_command);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "unable to get the type of "
+ "the snapshot command");
+ goto out;
+ }
+
+ switch (snap_command) {
+ case GF_SNAP_OPTION_TYPE_CREATE:
+ ret = dict_get_int64 (dict, "volcount", &vol_count);
+ if (ret)
+ goto out;
+ while (count <= vol_count) {
+ snprintf (key, 1024, "volname%"PRId64, count);
+ ret = dict_get_str (dict, key, &volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get volname");
+ goto out;
+ }
+ ret = dict_set_str (dict, "volname", volname);
+ if (ret)
+ goto out;
+
+ ret = gd_brick_op_phase (GD_OP_SNAP, NULL, dict,
+ op_errstr);
+ if (ret)
+ goto out;
+ volname = NULL;
+ count++;
+ }
+
+ dict_del (dict, "volname");
+ ret = 0;
+ break;
+ case GF_SNAP_OPTION_TYPE_DELETE:
+ break;
+ default:
+ break;
+ }
+
+out:
+ return ret;
+}
+
+int
+glusterd_snapshot_prevalidate (dict_t *dict, char **op_errstr,
+ dict_t *rsp_dict)
+{
+ int snap_command = 0;
+ xlator_t *this = NULL;
+ int ret = -1;
+
+ this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (rsp_dict);
+
+ ret = dict_get_int32 (dict, "type", &snap_command);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "unable to get the type of "
+ "the snapshot command");
+ goto out;
+ }
+
+ switch (snap_command) {
+ case (GF_SNAP_OPTION_TYPE_CREATE):
+ ret = glusterd_snapshot_create_prevalidate (dict, op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Snapshot create "
+ "pre-validation failed");
+ goto out;
+ }
+ break;
+
+ case (GF_SNAP_OPTION_TYPE_CONFIG):
+ ret = glusterd_snapshot_config_prevalidate (dict, op_errstr);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Snapshot config "
+ "pre-validation failed");
+ goto out;
+ }
+ break;
+
+ case GF_SNAP_OPTION_TYPE_RESTORE:
+ ret = glusterd_snapshot_restore_prevalidate (dict, op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Snapshot restore "
+ "validation failed");
+ goto out;
+ }
+ break;
+ case GF_SNAP_OPTION_TYPE_DELETE:
+ ret = glusterd_snapshot_remove_prevalidate (dict, op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Snapshot remove "
+ "validation failed");
+ goto out;
+ }
+ break;
+
+ case GF_SNAP_OPTION_TYPE_STATUS:
+ ret = glusterd_snapshot_status_prevalidate (dict, op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Snapshot status "
+ "validation failed");
+ goto out;
+ }
+ break;
+
+ default:
+ gf_log (this->name, GF_LOG_WARNING, "invalid snap command");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+glusterd_snapshot_postvalidate (dict_t *dict, int32_t op_ret, char **op_errstr,
+ dict_t *rsp_dict)
+{
+ int snap_command = 0;
+ xlator_t *this = NULL;
+ int ret = -1;
+
+ this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (dict);
+ GF_ASSERT (rsp_dict);
+
+ ret = dict_get_int32 (dict, "type", &snap_command);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "unable to get the type of "
+ "the snapshot command");
+ goto out;
+ }
+
+ switch (snap_command) {
+ case GF_SNAP_OPTION_TYPE_CREATE:
+ ret = glusterd_snapshot_create_postvalidate (dict, op_ret,
+ op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Snapshot create "
+ "post-validation failed");
+ goto out;
+ }
+ break;
+
+ case GF_SNAP_OPTION_TYPE_DELETE:
+ case GF_SNAP_OPTION_TYPE_RESTORE:
+ ret = glusterd_snapshot_update_snaps_post_validate (dict,
+ op_errstr,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "update missed snaps list");
+ goto out;
+ }
+ break;
+
+ default:
+ gf_log (this->name, GF_LOG_WARNING, "invalid snap command");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+glusterd_handle_snapshot_fn (rpcsvc_request_t *req)
+{
+ int32_t ret = 0;
+ dict_t *dict = NULL;
+ gf_cli_req cli_req = {{0},};
+ glusterd_op_t cli_op = GD_OP_SNAP;
+ int type = 0;
+ glusterd_conf_t *conf = NULL;
+ char *host_uuid = NULL;
+ char err_str[2048] = {0,};
+ xlator_t *this = NULL;
+ char *volname = NULL;
+
+ GF_ASSERT (req);
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ ret = xdr_to_generic (req->msg[0], &cli_req,
+ (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ if (cli_req.dict.dict_len > 0) {
+ dict = dict_new ();
+ if (!dict)
+ goto out;
+
+ ret = dict_unserialize (cli_req.dict.dict_val,
+ cli_req.dict.dict_len,
+ &dict);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "failed to "
+ "unserialize req-buffer to dictionary");
+ snprintf (err_str, sizeof (err_str), "Unable to decode "
+ "the command");
+ goto out;
+ }
+
+ dict->extra_stdfree = cli_req.dict.dict_val;
+
+ host_uuid = gf_strdup (uuid_utoa(MY_UUID));
+ if (host_uuid == NULL) {
+ snprintf (err_str, sizeof (err_str), "Failed to get "
+ "the uuid of local glusterd");
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_dynstr (dict, "host-uuid", host_uuid);
+ if (ret) {
+ GF_FREE (host_uuid);
+ goto out;
+ }
+
+
+ } else {
+ gf_log (this->name, GF_LOG_ERROR, "request dict length is %d",
+ cli_req.dict.dict_len);
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "type", &type);
+ if (ret < 0) {
+ snprintf (err_str, sizeof (err_str), "Command type not found");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+ goto out;
+ }
+
+ switch (type) {
+ case GF_SNAP_OPTION_TYPE_CREATE:
+ ret = glusterd_handle_snapshot_create (req, cli_op, dict,
+ err_str, sizeof (err_str));
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Snapshot create "
+ "failed: %s", err_str);
+ }
+ break;
+ case GF_SNAP_OPTION_TYPE_RESTORE:
+ ret = glusterd_handle_snapshot_restore (req, cli_op, dict,
+ err_str, sizeof (err_str));
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Snapshot restore "
+ "failed: %s", err_str);
+ }
+
+ break;
+ case GF_SNAP_OPTION_TYPE_INFO:
+ ret = glusterd_handle_snapshot_info (req, cli_op, dict,
+ err_str, sizeof (err_str));
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Snapshot info "
+ "failed");
+ }
+ break;
+ case GF_SNAP_OPTION_TYPE_LIST:
+ ret = glusterd_handle_snapshot_list (req, cli_op, dict,
+ err_str, sizeof (err_str));
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Snapshot list "
+ "failed");
+ }
+ break;
+ case GF_SNAP_OPTION_TYPE_CONFIG:
+ /* TODO : Type of lock to be taken when we are setting
+ * limits system wide
+ */
+ ret = dict_get_str (dict, "volname", &volname);
+ if (!volname) {
+ ret = dict_set_int32 (dict, "hold_vol_locks",
+ _gf_false);
+ if (ret) {
+ gf_log ("cli", GF_LOG_ERROR,
+ "Unable to set hold_vol_locks value "
+ "as _gf_false");
+ goto out;
+ }
+
+ }
+ ret = glusterd_mgmt_v3_initiate_all_phases (req, cli_op, dict);
+ break;
+ case GF_SNAP_OPTION_TYPE_DELETE:
+ ret = glusterd_handle_snapshot_remove (req, cli_op, dict,
+ err_str,
+ sizeof (err_str));
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Snapshot delete "
+ "failed: %s", err_str);
+ }
+ break;
+ case GF_SNAP_OPTION_TYPE_START:
+ case GF_SNAP_OPTION_TYPE_STOP:
+ case GF_SNAP_OPTION_TYPE_STATUS:
+ ret = glusterd_handle_snapshot_status (req, cli_op, dict,
+ err_str,
+ sizeof (err_str));
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Snapshot status "
+ "failed: %s", err_str);
+ }
+ break;
+ default:
+ gf_log (this->name, GF_LOG_ERROR, "Unkown snapshot request "
+ "type (%d)", type);
+ ret = -1; /* Failure */
+ }
+
+out:
+ if (ret) {
+ if (err_str[0] == '\0')
+ snprintf (err_str, sizeof (err_str),
+ "Operation failed");
+ ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
+ dict, err_str);
+ }
+
+ return ret;
+}
+
+int
+glusterd_handle_snapshot (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req, glusterd_handle_snapshot_fn);
+}
+
+static inline void
+glusterd_free_snap_op (glusterd_snap_op_t *snap_op)
+{
+ if (snap_op) {
+ if (snap_op->brick_path)
+ GF_FREE (snap_op->brick_path);
+
+ GF_FREE (snap_op);
+ }
+}
+
+static inline void
+glusterd_free_missed_snapinfo (glusterd_missed_snap_info *missed_snapinfo)
+{
+ glusterd_snap_op_t *snap_opinfo = NULL;
+ glusterd_snap_op_t *tmp = NULL;
+
+ if (missed_snapinfo) {
+ list_for_each_entry_safe (snap_opinfo, tmp,
+ &missed_snapinfo->snap_ops,
+ snap_ops_list) {
+ glusterd_free_snap_op (snap_opinfo);
+ snap_opinfo = NULL;
+ }
+
+ if (missed_snapinfo->node_uuid)
+ GF_FREE (missed_snapinfo->node_uuid);
+
+ if (missed_snapinfo->snap_uuid)
+ GF_FREE (missed_snapinfo->snap_uuid);
+
+ GF_FREE (missed_snapinfo);
+ }
+}
+
+/* Look for duplicates and accordingly update the list */
+int32_t
+glusterd_update_missed_snap_entry (glusterd_missed_snap_info *missed_snapinfo,
+ glusterd_snap_op_t *missed_snap_op)
+{
+ int32_t ret = -1;
+ glusterd_snap_op_t *snap_opinfo = NULL;
+ gf_boolean_t match = _gf_false;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+ GF_ASSERT(missed_snapinfo);
+ GF_ASSERT(missed_snap_op);
+
+ list_for_each_entry (snap_opinfo, &missed_snapinfo->snap_ops,
+ snap_ops_list) {
+ /* If the entry is not for the same snap_vol_id
+ * then continue
+ */
+ if (strcmp (snap_opinfo->snap_vol_id,
+ missed_snap_op->snap_vol_id))
+ continue;
+
+ if ((!strcmp (snap_opinfo->brick_path,
+ missed_snap_op->brick_path)) &&
+ (snap_opinfo->op == missed_snap_op->op)) {
+ /* If two entries have conflicting status
+ * GD_MISSED_SNAP_DONE takes precedence
+ */
+ if ((snap_opinfo->status == GD_MISSED_SNAP_PENDING) &&
+ (missed_snap_op->status == GD_MISSED_SNAP_DONE)) {
+ snap_opinfo->status = GD_MISSED_SNAP_DONE;
+ gf_log (this->name, GF_LOG_INFO,
+ "Updating missed snap status "
+ "for %s:%s=%s:%d:%s:%d as DONE",
+ missed_snapinfo->node_uuid,
+ missed_snapinfo->snap_uuid,
+ snap_opinfo->snap_vol_id,
+ snap_opinfo->brick_num,
+ snap_opinfo->brick_path,
+ snap_opinfo->op);
+ ret = 0;
+ glusterd_free_snap_op (missed_snap_op);
+ goto out;
+ }
+ match = _gf_true;
+ break;
+ } else if ((snap_opinfo->brick_num ==
+ missed_snap_op->brick_num) &&
+ (snap_opinfo->op == GF_SNAP_OPTION_TYPE_CREATE) &&
+ ((missed_snap_op->op ==
+ GF_SNAP_OPTION_TYPE_DELETE) ||
+ (missed_snap_op->op ==
+ GF_SNAP_OPTION_TYPE_RESTORE))) {
+ /* Optimizing create and delete entries for the same
+ * brick and same node
+ */
+ gf_log (this->name, GF_LOG_INFO,
+ "Updating missed snap status "
+ "for %s:%s=%s:%d:%s:%d as DONE",
+ missed_snapinfo->node_uuid,
+ missed_snapinfo->snap_uuid,
+ snap_opinfo->snap_vol_id,
+ snap_opinfo->brick_num,
+ snap_opinfo->brick_path,
+ snap_opinfo->op);
+ snap_opinfo->status = GD_MISSED_SNAP_DONE;
+ ret = 0;
+ glusterd_free_snap_op (missed_snap_op);
+ goto out;
+ }
+ }
+
+ if (match == _gf_true) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Duplicate entry. Not updating");
+ glusterd_free_snap_op (missed_snap_op);
+ } else {
+ list_add_tail (&missed_snap_op->snap_ops_list,
+ &missed_snapinfo->snap_ops);
+ }
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Add new missed snap entry to the missed_snaps list. */
+int32_t
+glusterd_add_new_entry_to_list (char *missed_info, char *snap_vol_id,
+ int32_t brick_num, char *brick_path,
+ int32_t snap_op, int32_t snap_status)
+{
+ char *buf = NULL;
+ char *save_ptr = NULL;
+ char node_snap_info[PATH_MAX] = "";
+ int32_t ret = -1;
+ glusterd_missed_snap_info *missed_snapinfo = NULL;
+ glusterd_snap_op_t *missed_snap_op = NULL;
+ glusterd_conf_t *priv = NULL;
+ gf_boolean_t match = _gf_false;
+ gf_boolean_t free_missed_snap_info = _gf_false;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+ GF_ASSERT(missed_info);
+ GF_ASSERT(snap_vol_id);
+ GF_ASSERT(brick_path);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ /* Create the snap_op object consisting of the *
+ * snap id and the op */
+ ret = glusterd_missed_snap_op_new (&missed_snap_op);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to create new missed snap object.");
+ ret = -1;
+ goto out;
+ }
+
+ missed_snap_op->snap_vol_id = gf_strdup(snap_vol_id);
+ if (!missed_snap_op->snap_vol_id) {
+ ret = -1;
+ goto out;
+ }
+ missed_snap_op->brick_path = gf_strdup(brick_path);
+ if (!missed_snap_op->brick_path) {
+ ret = -1;
+ goto out;
+ }
+ missed_snap_op->brick_num = brick_num;
+ missed_snap_op->op = snap_op;
+ missed_snap_op->status = snap_status;
+
+ /* Look for other entries for the same node and same snap */
+ list_for_each_entry (missed_snapinfo, &priv->missed_snaps_list,
+ missed_snaps) {
+ snprintf (node_snap_info, sizeof(node_snap_info),
+ "%s:%s", missed_snapinfo->node_uuid,
+ missed_snapinfo->snap_uuid);
+ if (!strcmp (node_snap_info, missed_info)) {
+ /* Found missed snapshot info for *
+ * the same node and same snap */
+ match = _gf_true;
+ break;
+ }
+ }
+
+ if (match == _gf_false) {
+ /* First snap op missed for the brick */
+ ret = glusterd_missed_snapinfo_new (&missed_snapinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to create missed snapinfo");
+ goto out;
+ }
+ free_missed_snap_info = _gf_true;
+ buf = strtok_r (missed_info, ":", &save_ptr);
+ if (!buf) {
+ ret = -1;
+ goto out;
+ }
+ missed_snapinfo->node_uuid = gf_strdup(buf);
+ if (!missed_snapinfo->node_uuid) {
+ ret = -1;
+ goto out;
+ }
+
+ buf = strtok_r (NULL, ":", &save_ptr);
+ if (!buf) {
+ ret = -1;
+ goto out;
+ }
+ missed_snapinfo->snap_uuid = gf_strdup(buf);
+ if (!missed_snapinfo->snap_uuid) {
+ ret = -1;
+ goto out;
+ }
+
+ list_add_tail (&missed_snap_op->snap_ops_list,
+ &missed_snapinfo->snap_ops);
+ list_add_tail (&missed_snapinfo->missed_snaps,
+ &priv->missed_snaps_list);
+
+ ret = 0;
+ goto out;
+ } else {
+ ret = glusterd_update_missed_snap_entry (missed_snapinfo,
+ missed_snap_op);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to update existing missed snap entry.");
+ goto out;
+ }
+ }
+
+out:
+ if (ret) {
+ glusterd_free_snap_op (missed_snap_op);
+
+ if (missed_snapinfo &&
+ (free_missed_snap_info == _gf_true))
+ glusterd_free_missed_snapinfo (missed_snapinfo);
+ }
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Add missing snap entries to the in-memory conf->missed_snap_list */
+int32_t
+glusterd_add_missed_snaps_to_list (dict_t *dict, int32_t missed_snap_count)
+{
+ char *buf = NULL;
+ char *tmp = NULL;
+ char *save_ptr = NULL;
+ char *nodeid = NULL;
+ char *snap_uuid = NULL;
+ char *snap_vol_id = NULL;
+ char *brick_path = NULL;
+ char missed_info[PATH_MAX] = "";
+ char name_buf[PATH_MAX] = "";
+ int32_t i = -1;
+ int32_t ret = -1;
+ int32_t brick_num = -1;
+ int32_t snap_op = -1;
+ int32_t snap_status = -1;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+ GF_ASSERT(dict);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ /* We can update the missed_snaps_list without acquiring *
+ * any additional locks as big lock will be held. */
+ for (i = 0; i < missed_snap_count; i++) {
+ snprintf (name_buf, sizeof(name_buf), "missed_snaps_%d",
+ i);
+ ret = dict_get_str (dict, name_buf, &buf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to fetch %s", name_buf);
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_DEBUG, "missed_snap_entry = %s",
+ buf);
+
+ /* Need to make a duplicate string coz the same dictionary *
+ * is resent to the non-originator nodes */
+ tmp = gf_strdup (buf);
+ if (!tmp) {
+ ret = -1;
+ goto out;
+ }
+
+ /* Fetch the node-id, snap-id, brick_num,
+ * brick_path, snap_op and snap status
+ */
+ nodeid = strtok_r (tmp, ":", &save_ptr);
+ snap_uuid = strtok_r (NULL, "=", &save_ptr);
+ snap_vol_id = strtok_r (NULL, ":", &save_ptr);
+ brick_num = atoi(strtok_r (NULL, ":", &save_ptr));
+ brick_path = strtok_r (NULL, ":", &save_ptr);
+ snap_op = atoi(strtok_r (NULL, ":", &save_ptr));
+ snap_status = atoi(strtok_r (NULL, ":", &save_ptr));
+
+ if (!nodeid || !snap_uuid || !brick_path ||
+ !snap_vol_id || brick_num < 1 || snap_op < 1 ||
+ snap_status < 1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Invalid missed_snap_entry");
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (missed_info, sizeof(missed_info), "%s:%s",
+ nodeid, snap_uuid);
+
+ ret = glusterd_add_new_entry_to_list (missed_info,
+ snap_vol_id,
+ brick_num,
+ brick_path,
+ snap_op,
+ snap_status);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to store missed snaps_list");
+ goto out;
+ }
+
+ GF_FREE (tmp);
+ tmp = NULL;
+ }
+
+ ret = 0;
+out:
+ if (tmp)
+ GF_FREE (tmp);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* This function will restore origin volume to it's snap.
+ * The restore operation will simply replace the Gluster origin
+ * volume with the snap volume.
+ * TODO: Multi-volume delete to be done.
+ * Cleanup in case of restore failure is pending.
+ *
+ * @param orig_vol volinfo of origin volume
+ * @param snap_vol volinfo of snapshot volume
+ *
+ * @return 0 on success and negative value on error
+ */
+int
+gd_restore_snap_volume (dict_t *rsp_dict,
+ glusterd_volinfo_t *orig_vol,
+ glusterd_volinfo_t *snap_vol)
+{
+ int ret = -1;
+ glusterd_volinfo_t *new_volinfo = NULL;
+ glusterd_snap_t *snap = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ glusterd_volinfo_t *temp_volinfo = NULL;
+ glusterd_volinfo_t *voliter = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (rsp_dict);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ GF_VALIDATE_OR_GOTO (this->name, orig_vol, out);
+ GF_VALIDATE_OR_GOTO (this->name, snap_vol, out);
+ snap = snap_vol->snapshot;
+ GF_VALIDATE_OR_GOTO (this->name, snap, out);
+
+ /* Snap volume must be stoped before performing the
+ * restore operation.
+ */
+ ret = glusterd_stop_volume (snap_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to stop "
+ "snap volume");
+ goto out;
+ }
+
+ /* Create a new volinfo for the restored volume */
+ ret = glusterd_volinfo_dup (snap_vol, &new_volinfo, _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to create volinfo");
+ goto out;
+ }
+
+ /* Following entries need to be derived from origin volume. */
+ strcpy (new_volinfo->volname, orig_vol->volname);
+ uuid_copy (new_volinfo->volume_id, orig_vol->volume_id);
+ new_volinfo->snap_count = orig_vol->snap_count;
+ new_volinfo->snap_max_hard_limit = orig_vol->snap_max_hard_limit;
+ uuid_copy (new_volinfo->restored_from_snap,
+ snap_vol->snapshot->snap_id);
+
+ /* Bump the version of the restored volume, so that nodes *
+ * which are done can sync during handshake */
+ new_volinfo->version = orig_vol->version;
+
+ list_for_each_entry_safe (voliter, temp_volinfo,
+ &orig_vol->snap_volumes, snapvol_list) {
+ list_add_tail (&voliter->snapvol_list,
+ &new_volinfo->snap_volumes);
+ }
+ /* Copy the snap vol info to the new_volinfo.*/
+ ret = glusterd_snap_volinfo_restore (rsp_dict, new_volinfo, snap_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to restore snap");
+ (void)glusterd_volinfo_delete (new_volinfo);
+ goto out;
+ }
+
+ ret = glusterd_lvm_snapshot_remove (rsp_dict, orig_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to remove "
+ "LVM backend");
+ (void)glusterd_volinfo_delete (new_volinfo);
+ goto out;
+ }
+
+ /* New volinfo always shows the status as created. Therefore
+ * set the status to the original volume's status. */
+ glusterd_set_volume_status (new_volinfo, orig_vol->status);
+
+ /* Once the new_volinfo is completely constructed then delete
+ * the orinal volinfo
+ */
+ ret = glusterd_volinfo_delete (orig_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to delete volinfo");
+ (void)glusterd_volinfo_delete (new_volinfo);
+ goto out;
+ }
+
+ list_add_tail (&new_volinfo->vol_list, &conf->volumes);
+
+ /* Now delete the snap entry. As a first step delete the snap
+ * volume information stored in store. */
+ ret = glusterd_snap_remove (rsp_dict, snap, _gf_false, _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Failed to delete "
+ "snap %s", snap->snapname);
+ goto out;
+ }
+
+ ret = glusterd_store_volinfo (new_volinfo,
+ GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to store volinfo");
+ goto out;
+ }
+
+ ret = 0;
+out:
+
+ return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c
index 6f08cc7e2..afbc8ff35 100644
--- a/xlators/mgmt/glusterd/src/glusterd-store.c
+++ b/xlators/mgmt/glusterd/src/glusterd-store.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2007-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -35,6 +35,7 @@
#include "glusterd-op-sm.h"
#include "glusterd-utils.h"
#include "glusterd-hooks.h"
+#include "store.h"
#include "glusterd-store.h"
#include "rpc-clnt.h"
@@ -43,152 +44,10 @@
#include <sys/resource.h>
#include <inttypes.h>
#include <dirent.h>
+#include <mntent.h>
-static int32_t
-glusterd_store_mkdir (char *path)
-{
- int32_t ret = -1;
-
- ret = mkdir (path, 0777);
-
- if ((-1 == ret) && (EEXIST != errno)) {
- gf_log ("", GF_LOG_ERROR, "mkdir() failed on path %s,"
- "errno: %s", path, strerror (errno));
- } else {
- ret = 0;
- }
-
- return ret;
-}
-
-int32_t
-glusterd_store_handle_create_on_absence (glusterd_store_handle_t **shandle,
- char *path)
-{
- GF_ASSERT (shandle);
- int32_t ret = 0;
-
- if (*shandle == NULL) {
- ret = glusterd_store_handle_new (path, shandle);
-
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to create store"
- " handle for path: %s", path);
- }
- }
- return ret;
-}
-
-int32_t
-glusterd_store_mkstemp (glusterd_store_handle_t *shandle)
-{
- int fd = -1;
- char tmppath[PATH_MAX] = {0,};
-
- GF_ASSERT (shandle);
- GF_ASSERT (shandle->path);
-
- snprintf (tmppath, sizeof (tmppath), "%s.tmp", shandle->path);
- fd = open (tmppath, O_RDWR | O_CREAT | O_TRUNC | O_SYNC, 0600);
- if (fd <= 0) {
- gf_log ("glusterd", GF_LOG_ERROR, "Failed to open %s, "
- "error: %s", tmppath, strerror (errno));
- }
-
- return fd;
-}
-
-int
-glusterd_store_sync_direntry (char *path)
-{
- int ret = -1;
- int dirfd = -1;
- char *dir = NULL;
- char *pdir = NULL;
- xlator_t *this = NULL;
-
- this = THIS;
-
- dir = gf_strdup (path);
- if (!dir)
- goto out;
-
- pdir = dirname (dir);
- dirfd = open (pdir, O_RDONLY);
- if (dirfd == -1) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to open directory "
- "%s, due to %s", pdir, strerror (errno));
- goto out;
- }
-
- ret = fsync (dirfd);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to fsync %s, due to "
- "%s", pdir, strerror (errno));
- goto out;
- }
-
- ret = 0;
-out:
- if (dirfd >= 0) {
- ret = close (dirfd);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to close "
- "%s, due to %s", pdir, strerror (errno));
- }
- }
-
- if (dir)
- GF_FREE (dir);
-
- return ret;
-}
-
-int32_t
-glusterd_store_rename_tmppath (glusterd_store_handle_t *shandle)
-{
- int32_t ret = -1;
- char tmppath[PATH_MAX] = {0,};
-
- GF_ASSERT (shandle);
- GF_ASSERT (shandle->path);
-
- snprintf (tmppath, sizeof (tmppath), "%s.tmp", shandle->path);
- ret = rename (tmppath, shandle->path);
- if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Failed to rename %s to %s, "
- "error: %s", tmppath, shandle->path, strerror (errno));
- goto out;
- }
-
- ret = glusterd_store_sync_direntry (tmppath);
-out:
- return ret;
-}
-
-int32_t
-glusterd_store_unlink_tmppath (glusterd_store_handle_t *shandle)
-{
- int32_t ret = -1;
- char tmppath[PATH_MAX] = {0,};
-
- GF_ASSERT (shandle);
- GF_ASSERT (shandle->path);
-
- snprintf (tmppath, sizeof (tmppath), "%s.tmp", shandle->path);
- ret = unlink (tmppath);
- if (ret && (errno != ENOENT)) {
- gf_log ("glusterd", GF_LOG_ERROR, "Failed to mv %s to %s, "
- "error: %s", tmppath, shandle->path, strerror (errno));
- } else {
- ret = 0;
- }
-
- return ret;
-}
-
-static void
-glusterd_replace_slash_with_hipen (char *str)
+void
+glusterd_replace_slash_with_hyphen (char *str)
{
char *ptr = NULL;
@@ -213,7 +72,7 @@ glusterd_store_create_brick_dir (glusterd_volinfo_t *volinfo)
GF_ASSERT (priv);
GLUSTERD_GET_BRICK_DIR (brickdirpath, volinfo, priv);
- ret = glusterd_store_mkdir (brickdirpath);
+ ret = gf_store_mkdir (brickdirpath);
return ret;
}
@@ -227,7 +86,7 @@ glusterd_store_key_vol_brick_set (glusterd_brickinfo_t *brickinfo,
GF_ASSERT (len >= PATH_MAX);
snprintf (key_vol_brick, len, "%s", brickinfo->path);
- glusterd_replace_slash_with_hipen (key_vol_brick);
+ glusterd_replace_slash_with_hyphen (key_vol_brick);
}
static void
@@ -275,21 +134,26 @@ glusterd_store_is_valid_brickpath (char *volname, char *brick)
glusterd_volinfo_t *volinfo = NULL;
int32_t ret = 0;
size_t volname_len = strlen (volname);
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
ret = glusterd_brickinfo_new_from_brick (brick, &brickinfo);
if (ret) {
- gf_log ("", GF_LOG_WARNING, "brick path validation failed");
+ gf_log (this->name, GF_LOG_WARNING, "Failed to create brick "
+ "info for brick %s", brick);
ret = 0;
goto out;
}
ret = glusterd_volinfo_new (&volinfo);
if (ret) {
- gf_log ("", GF_LOG_WARNING, "brick path validation failed");
+ gf_log (this->name, GF_LOG_WARNING, "Failed to create volinfo");
ret = 0;
goto out;
}
if (volname_len >= sizeof (volinfo->volname)) {
- gf_log ("", GF_LOG_WARNING, "volume name too long");
+ gf_log (this->name, GF_LOG_WARNING, "volume name too long");
ret = 0;
goto out;
}
@@ -303,7 +167,7 @@ out:
if (brickinfo)
glusterd_brickinfo_delete (brickinfo);
if (volinfo)
- glusterd_volinfo_delete (volinfo);
+ glusterd_volinfo_unref (volinfo);
return ret;
}
@@ -321,7 +185,7 @@ glusterd_store_volinfo_brick_fname_write (int vol_fd,
brick_count);
glusterd_store_brickinfofname_set (brickinfo, brickfname,
sizeof (brickfname));
- ret = glusterd_store_save_value (vol_fd, key, brickfname);
+ ret = gf_store_save_value (vol_fd, key, brickfname);
if (ret)
goto out;
@@ -340,9 +204,9 @@ glusterd_store_create_brick_shandle_on_absence (glusterd_volinfo_t *volinfo,
GF_ASSERT (brickinfo);
glusterd_store_brickinfopath_set (volinfo, brickinfo, brickpath,
- sizeof (brickpath));
- ret = glusterd_store_handle_create_on_absence (&brickinfo->shandle,
- brickpath);
+ sizeof (brickpath));
+ ret = gf_store_handle_create_on_absence (&brickinfo->shandle,
+ brickpath);
return ret;
}
@@ -355,32 +219,55 @@ glusterd_store_brickinfo_write (int fd, glusterd_brickinfo_t *brickinfo)
GF_ASSERT (brickinfo);
GF_ASSERT (fd > 0);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_HOSTNAME,
- brickinfo->hostname);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_HOSTNAME,
+ brickinfo->hostname);
if (ret)
goto out;
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_PATH,
- brickinfo->path);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_PATH,
+ brickinfo->path);
if (ret)
goto out;
snprintf (value, sizeof(value), "%d", brickinfo->port);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_PORT,
- value);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_PORT, value);
snprintf (value, sizeof(value), "%d", brickinfo->rdma_port);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_RDMA_PORT,
- value);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_RDMA_PORT,
+ value);
snprintf (value, sizeof(value), "%d", brickinfo->decommissioned);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED,
- value);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED,
+ value);
+ if (ret)
+ goto out;
+
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_ID,
+ brickinfo->brick_id);
+ if (ret)
+ goto out;
+
+ if (strlen(brickinfo->device_path) > 0) {
+ snprintf (value, sizeof(value), "%s", brickinfo->device_path);
+ ret = gf_store_save_value (fd,
+ GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH, value);
+ if (ret)
+ goto out;
+ }
+
+ snprintf (value, sizeof(value), "%d", brickinfo->snap_status);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS,
+ value);
if (ret)
goto out;
+ if (!brickinfo->vg[0])
+ goto out;
+
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_VGNAME,
+ brickinfo->vg);
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -391,7 +278,7 @@ glusterd_store_perform_brick_store (glusterd_brickinfo_t *brickinfo)
int32_t ret = -1;
GF_ASSERT (brickinfo);
- fd = glusterd_store_mkstemp (brickinfo->shandle);
+ fd = gf_store_mkstemp (brickinfo->shandle);
if (fd <= 0) {
ret = -1;
goto out;
@@ -403,10 +290,10 @@ glusterd_store_perform_brick_store (glusterd_brickinfo_t *brickinfo)
out:
if (ret && (fd > 0))
- glusterd_store_unlink_tmppath (brickinfo->shandle);
+ gf_store_unlink_tmppath (brickinfo->shandle);
if (fd > 0)
close (fd);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -436,30 +323,27 @@ glusterd_store_brickinfo (glusterd_volinfo_t *volinfo,
ret = glusterd_store_perform_brick_store (brickinfo);
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
}
int32_t
-glusterd_store_delete_brick (glusterd_volinfo_t *volinfo,
- glusterd_brickinfo_t *brickinfo)
+glusterd_store_delete_brick (glusterd_brickinfo_t *brickinfo, char *delete_path)
{
int32_t ret = -1;
glusterd_conf_t *priv = NULL;
- char path[PATH_MAX] = {0,};
char brickpath[PATH_MAX] = {0,};
char *ptr = NULL;
char *tmppath = NULL;
+ xlator_t *this = NULL;
- GF_ASSERT (volinfo);
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (brickinfo);
- priv = THIS->private;
-
+ priv = this->private;
GF_ASSERT (priv);
- GLUSTERD_GET_BRICK_DIR (path, volinfo, priv);
-
tmppath = gf_strdup (brickinfo->path);
ptr = strchr (tmppath, '/');
@@ -469,16 +353,17 @@ glusterd_store_delete_brick (glusterd_volinfo_t *volinfo,
ptr = strchr (tmppath, '/');
}
- snprintf (brickpath, sizeof (brickpath), "%s/%s:%s",
- path, brickinfo->hostname, tmppath);
+ snprintf (brickpath, sizeof (brickpath),
+ "%s/"GLUSTERD_BRICK_INFO_DIR"/%s:%s", delete_path,
+ brickinfo->hostname, tmppath);
GF_FREE (tmppath);
ret = unlink (brickpath);
if ((ret < 0) && (errno != ENOENT)) {
- gf_log ("", GF_LOG_ERROR, "Unlink failed on %s, reason: %s",
- brickpath, strerror(errno));
+ gf_log (this->name, GF_LOG_DEBUG, "Unlink failed on %s, "
+ "reason: %s", brickpath, strerror(errno));
ret = -1;
goto out;
} else {
@@ -487,15 +372,15 @@ glusterd_store_delete_brick (glusterd_volinfo_t *volinfo,
out:
if (brickinfo->shandle) {
- glusterd_store_handle_destroy (brickinfo->shandle);
+ gf_store_handle_destroy (brickinfo->shandle);
brickinfo->shandle = NULL;
}
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
}
int32_t
-glusterd_store_remove_bricks (glusterd_volinfo_t *volinfo)
+glusterd_store_remove_bricks (glusterd_volinfo_t *volinfo, char *delete_path)
{
int32_t ret = 0;
glusterd_brickinfo_t *tmp = NULL;
@@ -504,19 +389,24 @@ glusterd_store_remove_bricks (glusterd_volinfo_t *volinfo)
DIR *dir = NULL;
struct dirent *entry = NULL;
char path[PATH_MAX] = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (volinfo);
list_for_each_entry (tmp, &volinfo->bricks, brick_list) {
- ret = glusterd_store_delete_brick (volinfo, tmp);
+ ret = glusterd_store_delete_brick (tmp, delete_path);
if (ret)
goto out;
}
- priv = THIS->private;
+ priv = this->private;
GF_ASSERT (priv);
- GLUSTERD_GET_BRICK_DIR (brickdir, volinfo, priv);
+ snprintf (brickdir, sizeof (brickdir), "%s/%s", delete_path,
+ GLUSTERD_BRICK_INFO_DIR);
dir = opendir (brickdir);
@@ -527,7 +417,7 @@ glusterd_store_remove_bricks (glusterd_volinfo_t *volinfo)
brickdir, entry->d_name);
ret = unlink (path);
if (ret && errno != ENOENT) {
- gf_log ("", GF_LOG_ERROR, "Unable to unlink %s, "
+ gf_log (this->name, GF_LOG_DEBUG, "Unable to unlink %s, "
"reason: %s", path, strerror(errno));
}
glusterd_for_each_entry (entry, dir);
@@ -538,7 +428,7 @@ glusterd_store_remove_bricks (glusterd_volinfo_t *volinfo)
ret = rmdir (brickdir);
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
}
@@ -546,9 +436,13 @@ static int
_storeslaves (dict_t *this, char *key, data_t *value, void *data)
{
int32_t ret = 0;
- glusterd_store_handle_t *shandle = NULL;
+ gf_store_handle_t *shandle = NULL;
+ xlator_t *xl = NULL;
- shandle = (glusterd_store_handle_t*)data;
+ xl = THIS;
+ GF_ASSERT (xl);
+
+ shandle = (gf_store_handle_t*)data;
GF_ASSERT (shandle);
GF_ASSERT (shandle->fd > 0);
@@ -564,12 +458,12 @@ _storeslaves (dict_t *this, char *key, data_t *value, void *data)
if (!value || !value->data)
return -1;
- gf_log ("", GF_LOG_DEBUG, "Storing in volinfo:key= %s, val=%s",
+ gf_log (xl->name, GF_LOG_DEBUG, "Storing in volinfo:key= %s, val=%s",
key, value->data);
- ret = glusterd_store_save_value (shandle->fd, key, (char*)value->data);
+ ret = gf_store_save_value (shandle->fd, key, (char*)value->data);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to write into store"
+ gf_log (xl->name, GF_LOG_ERROR, "Unable to write into store"
" handle for path: %s", shandle->path);
return -1;
}
@@ -581,9 +475,13 @@ int _storeopts (dict_t *this, char *key, data_t *value, void *data)
{
int32_t ret = 0;
int32_t exists = 0;
- glusterd_store_handle_t *shandle = NULL;
+ gf_store_handle_t *shandle = NULL;
+ xlator_t *xl = NULL;
+
+ xl = THIS;
+ GF_ASSERT (xl);
- shandle = (glusterd_store_handle_t*)data;
+ shandle = (gf_store_handle_t*)data;
GF_ASSERT (shandle);
GF_ASSERT (shandle->fd > 0);
@@ -607,19 +505,19 @@ int _storeopts (dict_t *this, char *key, data_t *value, void *data)
}
if (1 == exists) {
- gf_log ("", GF_LOG_DEBUG, "Storing in volinfo:key= %s, val=%s",
- key, value->data);
+ gf_log (xl->name, GF_LOG_DEBUG, "Storing in volinfo:key= %s, "
+ "val=%s", key, value->data);
} else {
- gf_log ("", GF_LOG_DEBUG, "Discarding:key= %s, val=%s",
+ gf_log (xl->name, GF_LOG_DEBUG, "Discarding:key= %s, val=%s",
key, value->data);
return 0;
}
- ret = glusterd_store_save_value (shandle->fd, key, (char*)value->data);
+ ret = gf_store_save_value (shandle->fd, key, (char*)value->data);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to write into store"
- " handle for path: %s", shandle->path);
+ gf_log (xl->name, GF_LOG_ERROR, "Unable to write into store"
+ " handle for path: %s", shandle->path);
return -1;
}
return 0;
@@ -628,85 +526,125 @@ int _storeopts (dict_t *this, char *key, data_t *value, void *data)
int32_t
glusterd_volume_exclude_options_write (int fd, glusterd_volinfo_t *volinfo)
{
- char *str = NULL;
+ char *str = NULL;
+ char buf[PATH_MAX] = "";
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (fd > 0);
GF_ASSERT (volinfo);
- char buf[PATH_MAX] = {0,};
- int32_t ret = -1;
-
snprintf (buf, sizeof (buf), "%d", volinfo->type);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_TYPE, buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_TYPE, buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->brick_count);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_COUNT, buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_COUNT, buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->status);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_STATUS, buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_STATUS, buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->sub_count);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_SUB_COUNT,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_SUB_COUNT, buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->stripe_count);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_STRIPE_CNT,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_STRIPE_CNT, buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->replica_count);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_REPLICA_CNT,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_REPLICA_CNT,
+ buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->version);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_VERSION,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_VERSION, buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", volinfo->transport_type);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_TRANSPORT,
- buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_TRANSPORT, buf);
if (ret)
goto out;
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_ID,
- uuid_utoa (volinfo->volume_id));
+ snprintf (buf, sizeof (buf), "%s", volinfo->parent_volname);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PARENT_VOLNAME, buf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to store "
+ GLUSTERD_STORE_KEY_PARENT_VOLNAME);
+ goto out;
+ }
+
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_ID,
+ uuid_utoa (volinfo->volume_id));
if (ret)
goto out;
str = glusterd_auth_get_username (volinfo);
if (str) {
- ret = glusterd_store_save_value (fd,
- GLUSTERD_STORE_KEY_USERNAME,
- str);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_USERNAME,
+ str);
if (ret)
goto out;
}
str = glusterd_auth_get_password (volinfo);
if (str) {
- ret = glusterd_store_save_value (fd,
- GLUSTERD_STORE_KEY_PASSWORD,
- str);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PASSWORD,
+ str);
+ if (ret)
+ goto out;
+ }
+
+ snprintf (buf, sizeof (buf), "%d", volinfo->op_version);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_OP_VERSION, buf);
+ if (ret)
+ goto out;
+
+ snprintf (buf, sizeof (buf), "%d", volinfo->client_op_version);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION,
+ buf);
+ if (ret)
+ goto out;
+ if (volinfo->caps) {
+ snprintf (buf, sizeof (buf), "%d", volinfo->caps);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_CAPS,
+ buf);
if (ret)
goto out;
}
+
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_RESTORED_SNAP,
+ uuid_utoa (volinfo->restored_from_snap));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to write restored_from_snap");
+ goto out;
+ }
+
+ snprintf (buf, sizeof (buf), "%"PRIu64, volinfo->snap_max_hard_limit);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT,
+ buf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to write snap-max-hard-limit");
+ goto out;
+ }
+
out:
if (ret)
- gf_log ("", GF_LOG_ERROR, "Unable to write volume values"
- " for %s", volinfo->volname);
+ gf_log (this->name, GF_LOG_ERROR, "Unable to write volume "
+ "values for %s", volinfo->volname);
return ret;
}
@@ -720,8 +658,7 @@ glusterd_store_voldirpath_set (glusterd_volinfo_t *volinfo, char *voldirpath,
priv = THIS->private;
GF_ASSERT (priv);
- snprintf (voldirpath, len, "%s/%s/%s", priv->workdir,
- GLUSTERD_VOLUME_DIR_PREFIX, volinfo->volname);
+ GLUSTERD_GET_VOLUME_DIR (voldirpath, volinfo, priv);
}
static int32_t
@@ -734,8 +671,30 @@ glusterd_store_create_volume_dir (glusterd_volinfo_t *volinfo)
glusterd_store_voldirpath_set (volinfo, voldirpath,
sizeof (voldirpath));
- ret = glusterd_store_mkdir (voldirpath);
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ ret = gf_store_mkdir (voldirpath);
+
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_store_create_snap_dir (glusterd_snap_t *snap)
+{
+ int32_t ret = -1;
+ char snapdirpath[PATH_MAX] = {0,};
+ glusterd_conf_t *priv = NULL;
+
+ priv = THIS->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (snap);
+
+ GLUSTERD_GET_SNAP_DIR (snapdirpath, snap, priv);
+
+ ret = mkdir_p (snapdirpath, 0755, _gf_true);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to create snaps dir "
+ "%s", snapdirpath);
+ }
return ret;
}
@@ -743,7 +702,7 @@ int32_t
glusterd_store_volinfo_write (int fd, glusterd_volinfo_t *volinfo)
{
int32_t ret = -1;
- glusterd_store_handle_t *shandle = NULL;
+ gf_store_handle_t *shandle = NULL;
GF_ASSERT (fd > 0);
GF_ASSERT (volinfo);
GF_ASSERT (volinfo->shandle);
@@ -759,7 +718,50 @@ glusterd_store_volinfo_write (int fd, glusterd_volinfo_t *volinfo)
dict_foreach (volinfo->gsync_slaves, _storeslaves, shandle);
shandle->fd = 0;
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_store_snapinfo_write (glusterd_snap_t *snap)
+{
+ int32_t ret = -1;
+ int fd = 0;
+ char buf[PATH_MAX] = "";
+
+ GF_ASSERT (snap);
+
+ fd = gf_store_mkstemp (snap->shandle);
+ if (fd <= 0)
+ goto out;
+
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_ID,
+ uuid_utoa (snap->snap_id));
+ if (ret)
+ goto out;
+
+ snprintf (buf, sizeof (buf), "%d", snap->snap_status);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_STATUS, buf);
+ if (ret)
+ goto out;
+
+ snprintf (buf, sizeof (buf), "%d", snap->snap_restored);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_RESTORED, buf);
+ if (ret)
+ goto out;
+
+ if (snap->description) {
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_DESC,
+ snap->description);
+ if (ret)
+ goto out;
+ }
+
+ snprintf (buf, sizeof (buf), "%ld", snap->time_stamp);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_SNAP_TIMESTAMP, buf);
+
+out:
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -807,6 +809,51 @@ glusterd_store_node_state_path_set (glusterd_volinfo_t *volinfo,
GLUSTERD_NODE_STATE_FILE);
}
+static void
+glusterd_store_quota_conf_path_set (glusterd_volinfo_t *volinfo,
+ char *quota_conf_path, size_t len)
+{
+ char voldirpath[PATH_MAX] = {0,};
+ GF_ASSERT (volinfo);
+ GF_ASSERT (quota_conf_path);
+ GF_ASSERT (len <= PATH_MAX);
+
+ glusterd_store_voldirpath_set (volinfo, voldirpath,
+ sizeof (voldirpath));
+ snprintf (quota_conf_path, len, "%s/%s", voldirpath,
+ GLUSTERD_VOLUME_QUOTA_CONFIG);
+}
+
+static void
+glusterd_store_missed_snaps_list_path_set (char *missed_snaps_list,
+ size_t len)
+{
+ glusterd_conf_t *priv = NULL;
+
+ priv = THIS->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (missed_snaps_list);
+ GF_ASSERT (len <= PATH_MAX);
+
+ snprintf (missed_snaps_list, len, "%s/snaps/"
+ GLUSTERD_MISSED_SNAPS_LIST_FILE, priv->workdir);
+}
+
+static void
+glusterd_store_snapfpath_set (glusterd_snap_t *snap, char *snap_fpath,
+ size_t len)
+{
+ glusterd_conf_t *priv = NULL;
+ priv = THIS->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (snap);
+ GF_ASSERT (snap_fpath);
+ GF_ASSERT (len <= PATH_MAX);
+
+ snprintf (snap_fpath, len, "%s/snaps/%s/%s", priv->workdir,
+ snap->snapname, GLUSTERD_SNAP_INFO_FILE);
+}
+
int32_t
glusterd_store_create_rbstate_shandle_on_absence (glusterd_volinfo_t *volinfo)
{
@@ -816,8 +863,8 @@ glusterd_store_create_rbstate_shandle_on_absence (glusterd_volinfo_t *volinfo)
GF_ASSERT (volinfo);
glusterd_store_rbstatepath_set (volinfo, rbstatepath, sizeof (rbstatepath));
- ret = glusterd_store_handle_create_on_absence (&volinfo->rb_shandle,
- rbstatepath);
+ ret = gf_store_handle_create_on_absence (&volinfo->rb_shandle,
+ rbstatepath);
return ret;
}
@@ -830,8 +877,7 @@ glusterd_store_create_vol_shandle_on_absence (glusterd_volinfo_t *volinfo)
GF_ASSERT (volinfo);
glusterd_store_volfpath_set (volinfo, volfpath, sizeof (volfpath));
- ret = glusterd_store_handle_create_on_absence (&volinfo->shandle,
- volfpath);
+ ret = gf_store_handle_create_on_absence (&volinfo->shandle, volfpath);
return ret;
}
@@ -846,9 +892,62 @@ glusterd_store_create_nodestate_sh_on_absence (glusterd_volinfo_t *volinfo)
glusterd_store_node_state_path_set (volinfo, node_state_path,
sizeof (node_state_path));
ret =
- glusterd_store_handle_create_on_absence (&volinfo->node_state_shandle,
- node_state_path);
+ gf_store_handle_create_on_absence (&volinfo->node_state_shandle,
+ node_state_path);
+
+ return ret;
+}
+
+int32_t
+glusterd_store_create_quota_conf_sh_on_absence (glusterd_volinfo_t *volinfo)
+{
+ char quota_conf_path[PATH_MAX] = {0};
+ int32_t ret = 0;
+
+ GF_ASSERT (volinfo);
+
+ glusterd_store_quota_conf_path_set (volinfo, quota_conf_path,
+ sizeof (quota_conf_path));
+ ret =
+ gf_store_handle_create_on_absence (&volinfo->quota_conf_shandle,
+ quota_conf_path);
+
+ return ret;
+}
+
+static int32_t
+glusterd_store_create_missed_snaps_list_shandle_on_absence ()
+{
+ char missed_snaps_list[PATH_MAX] = "";
+ int32_t ret = -1;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ glusterd_store_missed_snaps_list_path_set (missed_snaps_list,
+ sizeof(missed_snaps_list));
+
+ ret = gf_store_handle_create_on_absence
+ (&priv->missed_snaps_list_shandle,
+ missed_snaps_list);
+ return ret;
+}
+
+int32_t
+glusterd_store_create_snap_shandle_on_absence (glusterd_snap_t *snap)
+{
+ char snapfpath[PATH_MAX] = {0};
+ int32_t ret = 0;
+
+ GF_ASSERT (snap);
+ glusterd_store_snapfpath_set (snap, snapfpath, sizeof (snapfpath));
+ ret = gf_store_handle_create_on_absence (&snap->shandle, snapfpath);
return ret;
}
@@ -863,13 +962,13 @@ glusterd_store_brickinfos (glusterd_volinfo_t *volinfo, int vol_fd)
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
ret = glusterd_store_brickinfo (volinfo, brickinfo,
- brick_count, vol_fd);
+ brick_count, vol_fd);
if (ret)
goto out;
brick_count++;
}
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -877,36 +976,56 @@ int32_t
glusterd_store_rbstate_write (int fd, glusterd_volinfo_t *volinfo)
{
int ret = -1;
+ int port = 0;
char buf[PATH_MAX] = {0, };
GF_ASSERT (fd > 0);
GF_ASSERT (volinfo);
- snprintf (buf, sizeof (buf), "%d", volinfo->rb_status);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_RB_STATUS,
- buf);
+ snprintf (buf, sizeof (buf), "%d", volinfo->rep_brick.rb_status);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_RB_STATUS, buf);
if (ret)
goto out;
- if (volinfo->rb_status > GF_RB_STATUS_NONE) {
+ if (volinfo->rep_brick.rb_status > GF_RB_STATUS_NONE) {
snprintf (buf, sizeof (buf), "%s:%s",
- volinfo->src_brick->hostname,
- volinfo->src_brick->path);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_RB_SRC_BRICK,
- buf);
+ volinfo->rep_brick.src_brick->hostname,
+ volinfo->rep_brick.src_brick->path);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_RB_SRC_BRICK,
+ buf);
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%s:%s",
- volinfo->dst_brick->hostname,
- volinfo->dst_brick->path);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_RB_DST_BRICK,
- buf);
+ volinfo->rep_brick.dst_brick->hostname,
+ volinfo->rep_brick.dst_brick->path);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_RB_DST_BRICK,
+ buf);
+ if (ret)
+ goto out;
+
+ switch (volinfo->transport_type) {
+ case GF_TRANSPORT_RDMA:
+ port = volinfo->rep_brick.dst_brick->rdma_port;
+ break;
+
+ case GF_TRANSPORT_TCP:
+ case GF_TRANSPORT_BOTH_TCP_RDMA:
+ port = volinfo->rep_brick.dst_brick->port;
+ break;
+ }
+
+ snprintf (buf, sizeof (buf), "%d", port);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_RB_DST_PORT,
+ buf);
if (ret)
goto out;
+ uuid_unparse (volinfo->rep_brick.rb_id, buf);
+ ret = gf_store_save_value (fd, GF_REPLACE_BRICK_TID_KEY, buf);
}
+ ret = 0;
out:
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
@@ -919,7 +1038,7 @@ glusterd_store_perform_rbstate_store (glusterd_volinfo_t *volinfo)
int32_t ret = -1;
GF_ASSERT (volinfo);
- fd = glusterd_store_mkstemp (volinfo->rb_shandle);
+ fd = gf_store_mkstemp (volinfo->rb_shandle);
if (fd <= 0) {
ret = -1;
goto out;
@@ -929,16 +1048,29 @@ glusterd_store_perform_rbstate_store (glusterd_volinfo_t *volinfo)
if (ret)
goto out;
- ret = glusterd_store_rename_tmppath (volinfo->rb_shandle);
+ ret = gf_store_rename_tmppath (volinfo->rb_shandle);
if (ret)
goto out;
out:
if (ret && (fd > 0))
- glusterd_store_unlink_tmppath (volinfo->rb_shandle);
+ gf_store_unlink_tmppath (volinfo->rb_shandle);
if (fd > 0)
close (fd);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+_gd_store_rebalance_dict (dict_t *dict, char *key, data_t *value, void *data)
+{
+ int ret = -1;
+ int fd = 0;
+
+ fd = *(int *)data;
+
+ ret = gf_store_save_value (fd, key, value->data);
+
return ret;
}
@@ -951,17 +1083,30 @@ glusterd_store_node_state_write (int fd, glusterd_volinfo_t *volinfo)
GF_ASSERT (fd > 0);
GF_ASSERT (volinfo);
- if (volinfo->defrag_cmd == GF_DEFRAG_CMD_STATUS) {
+ if (volinfo->rebal.defrag_cmd == GF_DEFRAG_CMD_STATUS) {
ret = 0;
goto out;
}
- snprintf (buf, sizeof (buf), "%d", volinfo->defrag_cmd);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_DEFRAG,
- buf);
+ snprintf (buf, sizeof (buf), "%d", volinfo->rebal.defrag_cmd);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_VOL_DEFRAG, buf);
+ if (ret)
+ goto out;
+
+ snprintf (buf, sizeof (buf), "%d", volinfo->rebal.op);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_DEFRAG_OP, buf);
+ if (ret)
+ goto out;
+
+ uuid_unparse (volinfo->rebal.rebalance_id, buf);
+ ret = gf_store_save_value (fd, GF_REBALANCE_TID_KEY, buf);
if (ret)
goto out;
+ if (volinfo->rebal.dict) {
+ dict_foreach (volinfo->rebal.dict, _gd_store_rebalance_dict,
+ &fd);
+ }
out:
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
@@ -974,7 +1119,7 @@ glusterd_store_perform_node_state_store (glusterd_volinfo_t *volinfo)
int32_t ret = -1;
GF_ASSERT (volinfo);
- fd = glusterd_store_mkstemp (volinfo->node_state_shandle);
+ fd = gf_store_mkstemp (volinfo->node_state_shandle);
if (fd <= 0) {
ret = -1;
goto out;
@@ -984,16 +1129,16 @@ glusterd_store_perform_node_state_store (glusterd_volinfo_t *volinfo)
if (ret)
goto out;
- ret = glusterd_store_rename_tmppath (volinfo->node_state_shandle);
+ ret = gf_store_rename_tmppath (volinfo->node_state_shandle);
if (ret)
goto out;
out:
if (ret && (fd > 0))
- glusterd_store_unlink_tmppath (volinfo->node_state_shandle);
+ gf_store_unlink_tmppath (volinfo->node_state_shandle);
if (fd > 0)
close (fd);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -1004,7 +1149,7 @@ glusterd_store_perform_volume_store (glusterd_volinfo_t *volinfo)
int32_t ret = -1;
GF_ASSERT (volinfo);
- fd = glusterd_store_mkstemp (volinfo->shandle);
+ fd = gf_store_mkstemp (volinfo->shandle);
if (fd <= 0) {
ret = -1;
goto out;
@@ -1020,10 +1165,10 @@ glusterd_store_perform_volume_store (glusterd_volinfo_t *volinfo)
out:
if (ret && (fd > 0))
- glusterd_store_unlink_tmppath (volinfo->shandle);
+ gf_store_unlink_tmppath (volinfo->shandle);
if (fd > 0)
close (fd);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -1053,7 +1198,7 @@ glusterd_store_bricks_cleanup_tmp (glusterd_volinfo_t *volinfo)
GF_ASSERT (volinfo);
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
- glusterd_store_unlink_tmppath (brickinfo->shandle);
+ gf_store_unlink_tmppath (brickinfo->shandle);
}
}
@@ -1064,11 +1209,11 @@ glusterd_store_volume_cleanup_tmp (glusterd_volinfo_t *volinfo)
glusterd_store_bricks_cleanup_tmp (volinfo);
- glusterd_store_unlink_tmppath (volinfo->shandle);
+ gf_store_unlink_tmppath (volinfo->shandle);
- glusterd_store_unlink_tmppath (volinfo->rb_shandle);
+ gf_store_unlink_tmppath (volinfo->rb_shandle);
- glusterd_store_unlink_tmppath (volinfo->node_state_shandle);
+ gf_store_unlink_tmppath (volinfo->node_state_shandle);
}
int32_t
@@ -1080,7 +1225,7 @@ glusterd_store_brickinfos_atomic_update (glusterd_volinfo_t *volinfo)
GF_ASSERT (volinfo);
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
- ret = glusterd_store_rename_tmppath (brickinfo->shandle);
+ ret = gf_store_rename_tmppath (brickinfo->shandle);
if (ret)
goto out;
}
@@ -1094,7 +1239,7 @@ glusterd_store_volinfo_atomic_update (glusterd_volinfo_t *volinfo)
int ret = -1;
GF_ASSERT (volinfo);
- ret = glusterd_store_rename_tmppath (volinfo->shandle);
+ ret = gf_store_rename_tmppath (volinfo->shandle);
if (ret)
goto out;
@@ -1122,6 +1267,60 @@ out:
}
int32_t
+glusterd_store_snap_atomic_update (glusterd_snap_t *snap)
+{
+ int ret = -1;
+ GF_ASSERT (snap);
+
+ ret = gf_store_rename_tmppath (snap->shandle);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR, "Couldn't rename "
+ "temporary file(s): Reason %s", strerror (errno));
+
+ return ret;
+}
+
+int32_t
+glusterd_store_snap (glusterd_snap_t *snap)
+{
+ int32_t ret = -1;
+
+ GF_ASSERT (snap);
+
+ ret = glusterd_store_create_snap_dir (snap);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to create snap dir");
+ goto out;
+ }
+
+ ret = glusterd_store_create_snap_shandle_on_absence (snap);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to create snap info "
+ "file");
+ goto out;
+ }
+
+ ret = glusterd_store_snapinfo_write (snap);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to write snap info");
+ goto out;
+ }
+
+ ret = glusterd_store_snap_atomic_update (snap);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR,"Failed to do automic update");
+ goto out;
+ }
+
+out:
+ if (ret && snap->shandle)
+ gf_store_unlink_tmppath (snap->shandle);
+
+ gf_log (THIS->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
glusterd_store_volinfo (glusterd_volinfo_t *volinfo, glusterd_volinfo_ver_ac_t ac)
{
int32_t ret = -1;
@@ -1165,7 +1364,7 @@ glusterd_store_volinfo (glusterd_volinfo_t *volinfo, glusterd_volinfo_ver_ac_t a
goto out;
//checksum should be computed at the end
- ret = glusterd_volume_compute_cksum (volinfo);
+ ret = glusterd_compute_cksum (volinfo, _gf_false);
if (ret)
goto out;
@@ -1173,48 +1372,81 @@ out:
if (ret)
glusterd_store_volume_cleanup_tmp (volinfo);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-
int32_t
glusterd_store_delete_volume (glusterd_volinfo_t *volinfo)
{
- char pathname[PATH_MAX] = {0,};
- int32_t ret = 0;
- glusterd_conf_t *priv = NULL;
- DIR *dir = NULL;
- struct dirent *entry = NULL;
- char path[PATH_MAX] = {0,};
- struct stat st = {0, };
+ char pathname[PATH_MAX] = {0,};
+ int32_t ret = 0;
+ glusterd_conf_t *priv = NULL;
+ DIR *dir = NULL;
+ struct dirent *entry = NULL;
+ char path[PATH_MAX] = {0,};
+ char delete_path[PATH_MAX] = {0,};
+ char trashdir[PATH_MAX] = {0,};
+ struct stat st = {0, };
+ xlator_t *this = NULL;
+ gf_boolean_t rename_fail = _gf_false;
+
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (volinfo);
- priv = THIS->private;
+ priv = this->private;
GF_ASSERT (priv);
- snprintf (pathname, sizeof (pathname), "%s/vols/%s", priv->workdir,
- volinfo->volname);
- dir = opendir (pathname);
- if (!dir)
+ GLUSTERD_GET_VOLUME_DIR (pathname, volinfo, priv);
+
+ snprintf (delete_path, sizeof (delete_path),
+ "%s/"GLUSTERD_TRASH"/%s.deleted", priv->workdir,
+ uuid_utoa (volinfo->volume_id));
+
+ snprintf (trashdir, sizeof (trashdir), "%s/"GLUSTERD_TRASH,
+ priv->workdir);
+
+ ret = mkdir (trashdir, 0777);
+ if (ret && errno != EEXIST) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to create trash "
+ "directory, reason : %s", strerror (errno));
+ ret = -1;
goto out;
- ret = glusterd_store_remove_bricks (volinfo);
+ }
+ ret = rename (pathname, delete_path);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Remove bricks failed for %s",
+ gf_log (this->name, GF_LOG_ERROR, "Failed to rename volume "
+ "directory for volume %s", volinfo->volname);
+ rename_fail = _gf_true;
+ goto out;
+ }
+
+ dir = opendir (delete_path);
+ if (!dir) {
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to open directory %s."
+ " Reason : %s", delete_path, strerror (errno));
+ ret = 0;
+ goto out;
+ }
+ ret = glusterd_store_remove_bricks (volinfo, delete_path);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Remove bricks failed for %s",
volinfo->volname);
}
glusterd_for_each_entry (entry, dir);
while (entry) {
- snprintf (path, PATH_MAX, "%s/%s", pathname, entry->d_name);
+ snprintf (path, PATH_MAX, "%s/%s", delete_path, entry->d_name);
ret = stat (path, &st);
if (ret == -1) {
- gf_log ("", GF_LOG_ERROR, "Failed to stat entry: %s:%s",
- path, strerror (errno));
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to stat "
+ "entry %s : %s", path, strerror (errno));
goto stat_failed;
}
@@ -1223,12 +1455,13 @@ glusterd_store_delete_volume (glusterd_volinfo_t *volinfo)
else
ret = unlink (path);
- if (ret)
- gf_log ("", GF_LOG_INFO, "errno:%d (%s)", errno,
- strerror (errno));
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, " Failed to remove "
+ "%s. Reason : %s", path, strerror (errno));
+ }
- gf_log ("", GF_LOG_INFO, "%s %s",
- ret?"Failed to remove":"Removed",
+ gf_log (this->name, GF_LOG_DEBUG, "%s %s",
+ ret ? "Failed to remove":"Removed",
entry->d_name);
stat_failed:
memset (path, 0, sizeof(path));
@@ -1237,354 +1470,322 @@ stat_failed:
ret = closedir (dir);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Failed to close dir, errno:%d",
- errno);
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to close dir %s. "
+ "Reason : %s",delete_path, strerror (errno));
}
- ret = rmdir (pathname);
+ ret = rmdir (delete_path);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Failed to rmdir: %s, err: %s",
- pathname, strerror (errno));
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to rmdir: %s,err: %s",
+ delete_path, strerror (errno));
+ }
+ ret = rmdir (trashdir);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to rmdir: %s, Reason:"
+ " %s", trashdir, strerror (errno));
}
-
out:
if (volinfo->shandle) {
- glusterd_store_handle_destroy (volinfo->shandle);
+ gf_store_handle_destroy (volinfo->shandle);
volinfo->shandle = NULL;
}
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ ret = (rename_fail == _gf_true) ? -1: 0;
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-
-int
-glusterd_store_read_and_tokenize (FILE *file, char *str,
- char **iter_key, char **iter_val,
- glusterd_store_op_errno_t *store_errno)
+/*TODO: cleanup the duplicate code and implement a generic function for
+ * deleting snap/volume depending on the parameter flag */
+int32_t
+glusterd_store_delete_snap (glusterd_snap_t *snap)
{
- int32_t ret = -1;
- char *savetok = NULL;
+ char pathname[PATH_MAX] = {0,};
+ int32_t ret = 0;
+ glusterd_conf_t *priv = NULL;
+ DIR *dir = NULL;
+ struct dirent *entry = NULL;
+ char path[PATH_MAX] = {0,};
+ char delete_path[PATH_MAX] = {0,};
+ char trashdir[PATH_MAX] = {0,};
+ struct stat st = {0, };
+ xlator_t *this = NULL;
+ gf_boolean_t rename_fail = _gf_false;
+
+ this = THIS;
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ GF_ASSERT (snap);
+ GLUSTERD_GET_SNAP_DIR (pathname, snap, priv);
+
+ snprintf (delete_path, sizeof (delete_path),
+ "%s/"GLUSTERD_TRASH"/snap-%s.deleted", priv->workdir,
+ uuid_utoa (snap->snap_id));
- GF_ASSERT (file);
- GF_ASSERT (str);
- GF_ASSERT (iter_key);
- GF_ASSERT (iter_val);
- GF_ASSERT (store_errno);
+ snprintf (trashdir, sizeof (trashdir), "%s/"GLUSTERD_TRASH,
+ priv->workdir);
- ret = fscanf (file, "%s", str);
- if (ret <= 0 || feof (file)) {
+ ret = mkdir (trashdir, 0777);
+ if (ret && errno != EEXIST) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to create trash "
+ "directory, reason : %s", strerror (errno));
ret = -1;
- *store_errno = GD_STORE_EOF;
goto out;
}
- *iter_key = strtok_r (str, "=", &savetok);
- if (*iter_key == NULL) {
- ret = -1;
- *store_errno = GD_STORE_KEY_NULL;
+ ret = rename (pathname, delete_path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to rename snap "
+ "directory %s to %s", pathname, delete_path);
+ rename_fail = _gf_true;
goto out;
}
- *iter_val = strtok_r (NULL, "=", &savetok);
- if (*iter_key == NULL) {
- ret = -1;
- *store_errno = GD_STORE_VALUE_NULL;
+ dir = opendir (delete_path);
+ if (!dir) {
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to open directory %s."
+ " Reason : %s", delete_path, strerror (errno));
+ ret = 0;
goto out;
}
- *store_errno = GD_STORE_SUCCESS;
- ret = 0;
-out:
- return ret;
-}
-
-int32_t
-glusterd_store_retrieve_value (glusterd_store_handle_t *handle,
- char *key, char **value)
-{
- int32_t ret = -1;
- char *scan_str = NULL;
- char *iter_key = NULL;
- char *iter_val = NULL;
- char *free_str = NULL;
- struct stat st = {0,};
- glusterd_store_op_errno_t store_errno = GD_STORE_SUCCESS;
+ glusterd_for_each_entry (entry, dir);
+ while (entry) {
+ snprintf (path, PATH_MAX, "%s/%s", delete_path, entry->d_name);
+ ret = stat (path, &st);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to stat "
+ "entry %s : %s", path, strerror (errno));
+ goto stat_failed;
+ }
- GF_ASSERT (handle);
+ if (S_ISDIR (st.st_mode))
+ ret = rmdir (path);
+ else
+ ret = unlink (path);
- handle->fd = open (handle->path, O_RDWR);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, " Failed to remove "
+ "%s. Reason : %s", path, strerror (errno));
+ }
- if (handle->fd == -1) {
- gf_log ("", GF_LOG_ERROR, "Unable to open file %s errno: %s",
- handle->path, strerror (errno));
- goto out;
+ gf_log (this->name, GF_LOG_DEBUG, "%s %s",
+ ret ? "Failed to remove":"Removed",
+ entry->d_name);
+stat_failed:
+ memset (path, 0, sizeof(path));
+ glusterd_for_each_entry (entry, dir);
}
- if (!handle->read)
- handle->read = fdopen (handle->fd, "r");
- if (!handle->read) {
- gf_log ("", GF_LOG_ERROR, "Unable to open file %s errno: %s",
- handle->path, strerror (errno));
- goto out;
+ ret = closedir (dir);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to close dir %s. "
+ "Reason : %s",delete_path, strerror (errno));
}
- ret = fstat (handle->fd, &st);
- if (ret < 0) {
- gf_log ("glusterd", GF_LOG_WARNING,
- "stat on file failed");
- ret = -1;
- store_errno = GD_STORE_STAT_FAILED;
- goto out;
+ ret = rmdir (delete_path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to rmdir: %s,err: %s",
+ delete_path, strerror (errno));
}
-
- scan_str = GF_CALLOC (1, st.st_size,
- gf_gld_mt_char);
- if (scan_str == NULL) {
- ret = -1;
- store_errno = GD_STORE_ENOMEM;
- goto out;
+ ret = rmdir (trashdir);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to rmdir: %s, Reason:"
+ " %s", trashdir, strerror (errno));
}
- free_str = scan_str;
-
- do {
- ret = glusterd_store_read_and_tokenize (handle->read, scan_str,
- &iter_key, &iter_val,
- &store_errno);
- if (ret < 0) {
- goto out;
- }
-
- gf_log ("", GF_LOG_DEBUG, "key %s read", iter_key);
-
- if (!strcmp (key, iter_key)) {
- gf_log ("", GF_LOG_DEBUG, "key %s found", key);
- ret = 0;
- if (iter_val)
- *value = gf_strdup (iter_val);
- goto out;
- }
- } while (1);
out:
- if (handle->fd > 0) {
- close (handle->fd);
- handle->read = NULL;
+ if (snap->shandle) {
+ gf_store_handle_destroy (snap->shandle);
+ snap->shandle = NULL;
}
+ ret = (rename_fail == _gf_true) ? -1: 0;
- GF_FREE (free_str);
-
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-int32_t
-glusterd_store_save_value (int fd, char *key, char *value)
+int
+glusterd_store_global_info (xlator_t *this)
{
- int32_t ret = -1;
- FILE *fp = NULL;
+ int ret = -1;
+ glusterd_conf_t *conf = NULL;
+ char op_version_str[15] = {0,};
+ char path[PATH_MAX] = {0,};
+ gf_store_handle_t *handle = NULL;
+ char *uuid_str = NULL;
+ char buf[256] = {0, };
+
+ conf = this->private;
+
+ uuid_str = gf_strdup (uuid_utoa (MY_UUID));
+ if (!uuid_str)
+ goto out;
- GF_ASSERT (fd > 0);
- GF_ASSERT (key);
- GF_ASSERT (value);
+ if (!conf->handle) {
+ snprintf (path, PATH_MAX, "%s/%s", conf->workdir,
+ GLUSTERD_INFO_FILE);
+ ret = gf_store_handle_new (path, &handle);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get store handle");
+ goto out;
+ }
- fp = fdopen (fd, "a+");
- if (fp == NULL) {
- gf_log ("", GF_LOG_WARNING, "fdopen failed.");
- ret = -1;
- goto out;
- }
+ conf->handle = handle;
+ } else
+ handle = conf->handle;
- ret = fprintf (fp, "%s=%s\n", key, value);
- if (ret < 0) {
- gf_log ("", GF_LOG_WARNING, "Unable to store key: %s,"
- "value: %s, error: %s", key, value,
- strerror (errno));
- ret = -1;
+ /* These options need to be available for all users */
+ ret = chmod (handle->path, 0644);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "chmod error for %s: %s",
+ GLUSTERD_INFO_FILE, strerror (errno));
goto out;
}
- ret = fflush (fp);
- if (feof (fp)) {
- gf_log ("", GF_LOG_WARNING,
- "fflush failed, error: %s",
- strerror (errno));
+ handle->fd = gf_store_mkstemp (handle);
+ if (handle->fd <= 0) {
ret = -1;
goto out;
}
- ret = 0;
-out:
-
- gf_log ("", GF_LOG_DEBUG, "returning: %d", ret);
- return ret;
-}
-
-int32_t
-glusterd_store_handle_new (char *path, glusterd_store_handle_t **handle)
-{
- int32_t ret = -1;
- glusterd_store_handle_t *shandle = NULL;
- int fd = -1;
- char *spath = NULL;
-
- shandle = GF_CALLOC (1, sizeof (*shandle), gf_gld_mt_store_handle_t);
- if (!shandle)
- goto out;
-
- spath = gf_strdup (path);
-
- if (!spath)
- goto out;
-
- fd = open (path, O_RDWR | O_CREAT | O_APPEND, 0600);
- if (fd <= 0) {
- gf_log ("glusterd", GF_LOG_ERROR, "Failed to open file: %s, "
- "error: %s", path, strerror (errno));
+ ret = gf_store_save_value (handle->fd, GLUSTERD_STORE_UUID_KEY,
+ uuid_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Storing uuid failed ret = %d", ret);
goto out;
}
- ret = glusterd_store_sync_direntry (spath);
- if (ret)
+ snprintf (op_version_str, 15, "%d", conf->op_version);
+ ret = gf_store_save_value (handle->fd, GD_OP_VERSION_KEY,
+ op_version_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Storing op-version failed ret = %d", ret);
goto out;
-
- shandle->path = spath;
- *handle = shandle;
-
- ret = 0;
-out:
- if (fd > 0)
- close (fd);
-
- if (ret == -1) {
- GF_FREE (spath);
- GF_FREE (shandle);
}
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
- return ret;
-}
-
-int
-glusterd_store_handle_retrieve (char *path, glusterd_store_handle_t **handle)
-{
- int32_t ret = -1;
- struct stat statbuf = {0};
-
- ret = stat (path, &statbuf);
+ snprintf (buf, sizeof (buf), "%"PRIu64, conf->snap_max_hard_limit);
+ ret = gf_store_save_value (handle->fd,
+ GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT, buf);
if (ret) {
- gf_log ("glusterd", GF_LOG_ERROR, "Unable to retrieve store "
- "handle for %s, error: %s", path, strerror (errno));
+ gf_log (this->name, GF_LOG_ERROR,
+ "Storing snap-max-hard-limit failed ret = %d", ret);
goto out;
}
- ret = glusterd_store_handle_new (path, handle);
-out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
- return ret;
-}
-
-int32_t
-glusterd_store_handle_destroy (glusterd_store_handle_t *handle)
-{
- int32_t ret = -1;
- if (!handle) {
- ret = 0;
+ snprintf (buf, sizeof (buf), "%"PRIu64, conf->snap_max_soft_limit);
+ ret = gf_store_save_value (handle->fd,
+ GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT, buf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Storing snap-max-soft-limit failed ret = %d", ret);
goto out;
}
- GF_FREE (handle->path);
+ ret = gf_store_rename_tmppath (handle);
+out:
+ if (handle) {
+ if (ret && (handle->fd > 0))
+ gf_store_unlink_tmppath (handle);
- GF_FREE (handle);
+ if (handle->fd > 0) {
+ close (handle->fd);
+ handle->fd = 0;
+ }
+ }
- ret = 0;
+ if (uuid_str)
+ GF_FREE (uuid_str);
-out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to store glusterd global-info");
return ret;
}
-int32_t
-glusterd_store_uuid ()
+int
+glusterd_retrieve_op_version (xlator_t *this, int *op_version)
{
- glusterd_conf_t *priv = NULL;
- char path[PATH_MAX] = {0,};
- int32_t ret = -1;
- glusterd_store_handle_t *handle = NULL;
- xlator_t *this = NULL;
+ char *op_version_str = NULL;
+ glusterd_conf_t *priv = NULL;
+ int ret = -1;
+ int tmp_version = 0;
+ char *tmp = NULL;
+ char path[PATH_MAX] = {0,};
+ gf_store_handle_t *handle = NULL;
- this = THIS;
priv = this->private;
- snprintf (path, PATH_MAX, "%s/%s", priv->workdir,
- GLUSTERD_INFO_FILE);
-
if (!priv->handle) {
- ret = glusterd_store_handle_new (path, &handle);
+ snprintf (path, PATH_MAX, "%s/%s", priv->workdir,
+ GLUSTERD_INFO_FILE);
+ ret = gf_store_handle_retrieve (path, &handle);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "Unable to get store handle!");
+ gf_log ("", GF_LOG_DEBUG, "Unable to get store "
+ "handle!");
goto out;
}
priv->handle = handle;
- } else {
- handle = priv->handle;
}
- /* make glusterd's uuid available for users */
- ret = chmod (handle->path, 0644);
+ ret = gf_store_retrieve_value (priv->handle, GD_OP_VERSION_KEY,
+ &op_version_str);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "chmod error for %s: %s",
- GLUSTERD_INFO_FILE, strerror (errno));
+ gf_log (this->name, GF_LOG_DEBUG,
+ "No previous op_version present");
goto out;
}
- handle->fd = open (handle->path, O_RDWR | O_CREAT | O_TRUNC, 0644);
- if (handle->fd <= 0) {
- ret = -1;
- goto out;
- }
- ret = glusterd_store_save_value (handle->fd, GLUSTERD_STORE_UUID_KEY,
- uuid_utoa (MY_UUID));
-
- if (ret) {
- gf_log (this->name, GF_LOG_CRITICAL,
- "Storing uuid failed ret = %d", ret);
+ tmp_version = strtol (op_version_str, &tmp, 10);
+ if ((tmp_version <= 0) || (tmp && strlen (tmp) > 1)) {
+ gf_log (this->name, GF_LOG_WARNING, "invalid version number");
goto out;
}
+ *op_version = tmp_version;
+ ret = 0;
out:
- if (handle->fd > 0) {
- close (handle->fd);
- handle->fd = 0;
- }
- gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+ if (op_version_str)
+ GF_FREE (op_version_str);
+
return ret;
}
-int32_t
-glusterd_retrieve_uuid ()
+int
+glusterd_retrieve_sys_snap_max_limit (xlator_t *this, uint64_t *limit,
+ char *key)
{
- char *uuid_str = NULL;
- int32_t ret = -1;
- glusterd_store_handle_t *handle = NULL;
- glusterd_conf_t *priv = NULL;
- char path[PATH_MAX] = {0,};
+ char *limit_str = NULL;
+ glusterd_conf_t *priv = NULL;
+ int ret = -1;
+ uint64_t tmp_limit = 0;
+ char *tmp = NULL;
+ char path[PATH_MAX] = {0,};
+ gf_store_handle_t *handle = NULL;
- priv = THIS->private;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (limit);
+ GF_ASSERT (key);
if (!priv->handle) {
snprintf (path, PATH_MAX, "%s/%s", priv->workdir,
GLUSTERD_INFO_FILE);
- ret = glusterd_store_handle_retrieve (path, &handle);
+ ret = gf_store_handle_retrieve (path, &handle);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get store "
+ gf_log ("", GF_LOG_DEBUG, "Unable to get store "
"handle!");
goto out;
}
@@ -1592,261 +1793,151 @@ glusterd_retrieve_uuid ()
priv->handle = handle;
}
- ret = glusterd_store_retrieve_value (priv->handle,
- GLUSTERD_STORE_UUID_KEY,
- &uuid_str);
-
+ ret = gf_store_retrieve_value (priv->handle,
+ key,
+ &limit_str);
if (ret) {
- gf_log ("", GF_LOG_INFO, "No previous uuid is present");
- goto out;
- }
-
- uuid_parse (uuid_str, priv->uuid);
-
-out:
- GF_FREE (uuid_str);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
- return ret;
-}
-
-int32_t
-glusterd_store_iter_new (glusterd_store_handle_t *shandle,
- glusterd_store_iter_t **iter)
-{
- int32_t ret = -1;
- glusterd_store_iter_t *tmp_iter = NULL;
- int fd = -1;
-
- GF_ASSERT (shandle);
- GF_ASSERT (iter);
-
- tmp_iter = GF_CALLOC (1, sizeof (*tmp_iter),
- gf_gld_mt_store_iter_t);
-
- if (!tmp_iter) {
- gf_log ("", GF_LOG_ERROR, "Out of Memory");
+ gf_log (this->name, GF_LOG_DEBUG,
+ "No previous %s present", key);
goto out;
}
- fd = open (shandle->path, O_RDWR);
-
- if (fd < 0) {
- gf_log ("", GF_LOG_ERROR, "Unable to open %s, errno: %d",
- shandle->path, errno);
+ tmp_limit = strtoul (limit_str, &tmp, 10);
+ if ((tmp_limit <= 0) || (tmp && strlen (tmp) > 1)) {
+ gf_log (this->name, GF_LOG_WARNING, "invalid version number");
goto out;
}
- tmp_iter->fd = fd;
+ *limit = tmp_limit;
- tmp_iter->file = fdopen (tmp_iter->fd, "r");
-
- if (!tmp_iter->file) {
- gf_log ("", GF_LOG_ERROR, "Unable to open file %s errno: %d",
- shandle->path, errno);
- goto out;
- }
-
- strncpy (tmp_iter->filepath, shandle->path, sizeof (tmp_iter->filepath));
- tmp_iter->filepath[sizeof (tmp_iter->filepath) - 1] = 0;
- *iter = tmp_iter;
ret = 0;
-
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
- return ret;
-}
-
-int32_t
-glusterd_store_validate_key_value (char *storepath, char *key, char*val,
- glusterd_store_op_errno_t *op_errno)
-{
- int ret = 0;
-
- GF_ASSERT (op_errno);
- GF_ASSERT (storepath);
-
- if ((key == NULL) && (val == NULL)) {
- ret = -1;
- gf_log ("glusterd", GF_LOG_ERROR, "Glusterd store may be "
- "corrupted, Invalid key and value (null) in %s",
- storepath);
- *op_errno = GD_STORE_KEY_VALUE_NULL;
- } else if (key == NULL) {
- ret = -1;
- gf_log ("glusterd", GF_LOG_ERROR, "Glusterd store may be "
- "corrupted, Invalid key (null) in %s", storepath);
- *op_errno = GD_STORE_KEY_NULL;
- } else if (val == NULL) {
- ret = -1;
- gf_log ("glusterd", GF_LOG_ERROR, "Glusterd store may be "
- "corrupted, Invalid value (null) for key %s in %s",
- key, storepath);
- *op_errno = GD_STORE_VALUE_NULL;
- } else {
- ret = 0;
- *op_errno = GD_STORE_SUCCESS;
- }
+ if (limit_str)
+ GF_FREE (limit_str);
return ret;
}
-
-int32_t
-glusterd_store_iter_get_next (glusterd_store_iter_t *iter,
- char **key, char **value,
- glusterd_store_op_errno_t *op_errno)
+static int
+glusterd_restore_op_version (xlator_t *this)
{
- int32_t ret = -1;
- char *scan_str = NULL;
- char *free_str = NULL;
- char *iter_key = NULL;
- char *iter_val = NULL;
- struct stat st = {0,};
- glusterd_store_op_errno_t store_errno = GD_STORE_SUCCESS;
-
- GF_ASSERT (iter);
- GF_ASSERT (iter->file);
- GF_ASSERT (key);
- GF_ASSERT (value);
+ glusterd_conf_t *conf = NULL;
+ int ret = 0;
+ int op_version = 0;
- ret = fstat (iter->fd, &st);
- if (ret < 0) {
- gf_log ("glusterd", GF_LOG_WARNING,
- "stat on file failed");
- ret = -1;
- store_errno = GD_STORE_STAT_FAILED;
- goto out;
- }
+ conf = this->private;
- scan_str = GF_CALLOC (1, st.st_size,
- gf_gld_mt_char);
- if (scan_str == NULL) {
- ret = -1;
- store_errno = GD_STORE_ENOMEM;
- goto out;
+ ret = glusterd_retrieve_sys_snap_max_limit (this,
+ &conf->snap_max_hard_limit,
+ GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unable to retrieve system snap-max-hard-limit, "
+ "setting it to default value(%d)",
+ GLUSTERD_SNAPS_MAX_HARD_LIMIT);
+ conf->snap_max_hard_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
}
- *key = NULL;
- *value = NULL;
-
- free_str = scan_str;
-
- ret = glusterd_store_read_and_tokenize (iter->file, scan_str,
- &iter_key, &iter_val,
- &store_errno);
- if (ret < 0) {
- goto out;
+ ret = glusterd_retrieve_sys_snap_max_limit (this,
+ &conf->snap_max_soft_limit,
+ GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unable to retrieve system snap-max-soft-limit, "
+ "setting it to default value(%d)",
+ GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT);
+ conf->snap_max_soft_limit = GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT;
}
-
- ret = glusterd_store_validate_key_value (iter->filepath, iter_key,
- iter_val, &store_errno);
- if (ret)
- goto out;
-
- *value = gf_strdup (iter_val);
-
- *key = gf_strdup (iter_key);
- if (!iter_key || !iter_val) {
- ret = -1;
- store_errno = GD_STORE_ENOMEM;
+ ret = glusterd_retrieve_op_version (this, &op_version);
+ if (!ret) {
+ if ((op_version < GD_OP_VERSION_MIN) ||
+ (op_version > GD_OP_VERSION_MAX)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "wrong op-version (%d) retrieved", op_version);
+ ret = -1;
+ goto out;
+ }
+ conf->op_version = op_version;
+ gf_log ("glusterd", GF_LOG_INFO,
+ "retrieved op-version: %d", conf->op_version);
goto out;
}
- ret = 0;
-
-out:
+ /* op-version can be missing from the store file in 2 cases,
+ * 1. This is a new install of glusterfs
+ * 2. This is an upgrade of glusterfs from a version without op-version
+ * to a version with op-version (eg. 3.3 -> 3.4)
+ *
+ * Detection of a new install or an upgrade from an older install can be
+ * done by checking for the presence of the its peer-id in the store
+ * file. If peer-id is present, the installation is an upgrade else, it
+ * is a new install.
+ *
+ * For case 1, set op-version to GD_OP_VERSION_MAX.
+ * For case 2, set op-version to GD_OP_VERSION_MIN.
+ */
+ ret = glusterd_retrieve_uuid();
if (ret) {
- if (*key) {
- GF_FREE (*key);
- *key = NULL;
- }
- if (*value) {
- GF_FREE (*value);
- *value = NULL;
- }
+ gf_log (this->name, GF_LOG_INFO, "Detected new install. Setting"
+ " op-version to maximum : %d", GD_OP_VERSION_MAX);
+ conf->op_version = GD_OP_VERSION_MAX;
+ } else {
+ gf_log (this->name, GF_LOG_INFO, "Upgrade detected. Setting"
+ " op-version to minimum : %d", GD_OP_VERSION_MIN);
+ conf->op_version = GD_OP_VERSION_MIN;
}
- GF_FREE (free_str);
- if (op_errno)
- *op_errno = store_errno;
-
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ ret = 0;
+out:
return ret;
}
int32_t
-glusterd_store_iter_get_matching (glusterd_store_iter_t *iter,
- char *key, char **value)
+glusterd_retrieve_uuid ()
{
- int32_t ret = -1;
- char *tmp_key = NULL;
- char *tmp_value = NULL;
+ char *uuid_str = NULL;
+ int32_t ret = -1;
+ gf_store_handle_t *handle = NULL;
+ glusterd_conf_t *priv = NULL;
+ char path[PATH_MAX] = {0,};
- ret = glusterd_store_iter_get_next (iter, &tmp_key, &tmp_value,
- NULL);
- while (!ret) {
- if (!strncmp (key, tmp_key, strlen (key))){
- *value = tmp_value;
- GF_FREE (tmp_key);
+ priv = THIS->private;
+
+ if (!priv->handle) {
+ snprintf (path, PATH_MAX, "%s/%s", priv->workdir,
+ GLUSTERD_INFO_FILE);
+ ret = gf_store_handle_retrieve (path, &handle);
+
+ if (ret) {
+ gf_log ("", GF_LOG_DEBUG, "Unable to get store"
+ "handle!");
goto out;
}
- GF_FREE (tmp_key);
- GF_FREE (tmp_value);
- ret = glusterd_store_iter_get_next (iter, &tmp_key,
- &tmp_value, NULL);
- }
-out:
- return ret;
-}
-
-int32_t
-glusterd_store_iter_destroy (glusterd_store_iter_t *iter)
-{
- int32_t ret = -1;
- GF_ASSERT (iter);
- GF_ASSERT (iter->fd > 0);
+ priv->handle = handle;
+ }
- ret = fclose (iter->file);
+ ret = gf_store_retrieve_value (priv->handle, GLUSTERD_STORE_UUID_KEY,
+ &uuid_str);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to close fd: %d, ret: %d, "
- "errno: %d" ,iter->fd, ret, errno);
+ gf_log ("", GF_LOG_DEBUG, "No previous uuid is present");
+ goto out;
}
- GF_FREE (iter);
+ uuid_parse (uuid_str, priv->uuid);
+out:
+ GF_FREE (uuid_str);
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-char*
-glusterd_store_strerror (glusterd_store_op_errno_t op_errno)
-{
- switch (op_errno) {
- case GD_STORE_SUCCESS:
- return "Success";
- case GD_STORE_KEY_NULL:
- return "Invalid Key";
- case GD_STORE_VALUE_NULL:
- return "Invalid Value";
- case GD_STORE_KEY_VALUE_NULL:
- return "Invalid Key and Value";
- case GD_STORE_EOF:
- return "No data";
- case GD_STORE_ENOMEM:
- return "No memory";
- default:
- return "Invalid errno";
- }
- return "Invalid errno";
-}
-
int32_t
glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
{
-
int32_t ret = 0;
glusterd_brickinfo_t *brickinfo = NULL;
- glusterd_store_iter_t *iter = NULL;
+ gf_store_iter_t *iter = NULL;
char *key = NULL;
char *value = NULL;
char brickdir[PATH_MAX] = {0,};
@@ -1854,19 +1945,20 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
glusterd_conf_t *priv = NULL;
int32_t brick_count = 0;
char tmpkey[4096] = {0,};
- glusterd_store_iter_t *tmpiter = NULL;
+ gf_store_iter_t *tmpiter = NULL;
char *tmpvalue = NULL;
- struct pmap_registry *pmap = NULL;
- glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+ struct pmap_registry *pmap = NULL;
+ int brickid = 0;
+ gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
GF_ASSERT (volinfo);
GF_ASSERT (volinfo->volname);
priv = THIS->private;
- GLUSTERD_GET_BRICK_DIR (brickdir, volinfo, priv)
+ GLUSTERD_GET_BRICK_DIR (brickdir, volinfo, priv);
- ret = glusterd_store_iter_new (volinfo->shandle, &tmpiter);
+ ret = gf_store_iter_new (volinfo->shandle, &tmpiter);
if (ret)
goto out;
@@ -1878,30 +1970,28 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
goto out;
snprintf (tmpkey, sizeof (tmpkey), "%s-%d",
GLUSTERD_STORE_KEY_VOL_BRICK,brick_count);
- ret = glusterd_store_iter_get_matching (tmpiter, tmpkey,
- &tmpvalue);
+ ret = gf_store_iter_get_matching (tmpiter, tmpkey, &tmpvalue);
snprintf (path, sizeof (path), "%s/%s", brickdir, tmpvalue);
GF_FREE (tmpvalue);
tmpvalue = NULL;
- ret = glusterd_store_handle_retrieve (path, &brickinfo->shandle);
+ ret = gf_store_handle_retrieve (path, &brickinfo->shandle);
if (ret)
goto out;
- ret = glusterd_store_iter_new (brickinfo->shandle, &iter);
+ ret = gf_store_iter_new (brickinfo->shandle, &iter);
if (ret)
goto out;
- ret = glusterd_store_iter_get_next (iter, &key, &value,
- &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
if (ret) {
gf_log ("glusterd", GF_LOG_ERROR, "Unable to iterate "
"the store for brick: %s, reason: %s", path,
- glusterd_store_strerror (op_errno));
+ gf_store_strerror (op_errno));
goto out;
}
while (!ret) {
@@ -1916,7 +2006,7 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
strlen (GLUSTERD_STORE_KEY_BRICK_PORT))) {
gf_string2int (value, &brickinfo->port);
- if (brickinfo->port < GF_IANA_PRIV_PORTS_START){
+ if (brickinfo->port < priv->base_port) {
/* This is required to adhere to the
IANA standards */
brickinfo->port = 0;
@@ -1932,8 +2022,7 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
strlen (GLUSTERD_STORE_KEY_BRICK_RDMA_PORT))) {
gf_string2int (value, &brickinfo->rdma_port);
- if (brickinfo->rdma_port <
- GF_IANA_PRIV_PORTS_START){
+ if (brickinfo->rdma_port < priv->base_port) {
/* This is required to adhere to the
IANA standards */
brickinfo->rdma_port = 0;
@@ -1950,6 +2039,21 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
} else if (!strncmp (key, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED,
strlen (GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED))) {
gf_string2int (value, &brickinfo->decommissioned);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH,
+ strlen (GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH))) {
+ strncpy (brickinfo->device_path, value,
+ sizeof (brickinfo->device_path));
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS,
+ strlen (GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS))) {
+ gf_string2int (value, &brickinfo->snap_status);
+ } else if (!strncmp (key,
+ GLUSTERD_STORE_KEY_BRICK_VGNAME,
+ strlen (GLUSTERD_STORE_KEY_BRICK_VGNAME))) {
+ strncpy (brickinfo->vg, value,
+ sizeof (brickinfo->vg));
+ } else if (!strcmp(key, GLUSTERD_STORE_KEY_BRICK_ID)) {
+ strncpy (brickinfo->brick_id, value,
+ sizeof (brickinfo->brick_id));
} else {
gf_log ("", GF_LOG_ERROR, "Unknown key: %s",
key);
@@ -1960,22 +2064,31 @@ glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo)
key = NULL;
value = NULL;
- ret = glusterd_store_iter_get_next (iter, &key, &value,
- &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value,
+ &op_errno);
}
- if (op_errno != GD_STORE_EOF)
+ if (op_errno != GD_STORE_EOF) {
+ gf_log ("", GF_LOG_ERROR, "Error parsing brickinfo: "
+ "op_errno=%d", op_errno);
goto out;
- ret = glusterd_store_iter_destroy (iter);
+ }
+ ret = gf_store_iter_destroy (iter);
if (ret)
goto out;
+ if (brickinfo->brick_id[0] == '\0') {
+ /* This is an old volume upgraded to op_version 4 */
+ GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO (brickinfo, volinfo,
+ brickid++);
+ }
+
list_add_tail (&brickinfo->brick_list, &volinfo->bricks);
brick_count++;
}
- ret = glusterd_store_iter_destroy (tmpiter);
+ ret = gf_store_iter_destroy (tmpiter);
if (ret)
goto out;
out:
@@ -1986,64 +2099,79 @@ out:
int32_t
-glusterd_store_retrieve_rbstate (char *volname)
+glusterd_store_retrieve_rbstate (glusterd_volinfo_t *volinfo)
{
int32_t ret = -1;
- glusterd_volinfo_t *volinfo = NULL;
- glusterd_store_iter_t *iter = NULL;
+ gf_store_iter_t *iter = NULL;
char *key = NULL;
char *value = NULL;
char volpath[PATH_MAX] = {0,};
glusterd_conf_t *priv = NULL;
char path[PATH_MAX] = {0,};
- glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+ gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+ xlator_t *this = NULL;
- priv = THIS->private;
-
- ret = glusterd_volinfo_find (volname, &volinfo);
- if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Couldn't get"
- "volinfo for %s.", volname);
- goto out;
- }
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (volinfo);
GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, priv);
snprintf (path, sizeof (path), "%s/%s", volpath,
GLUSTERD_VOLUME_RBSTATE_FILE);
- ret = glusterd_store_handle_retrieve (path, &volinfo->rb_shandle);
+ ret = gf_store_handle_retrieve (path, &volinfo->rb_shandle);
if (ret)
goto out;
- ret = glusterd_store_iter_new (volinfo->rb_shandle, &iter);
+ ret = gf_store_iter_new (volinfo->rb_shandle, &iter);
if (ret)
goto out;
- ret = glusterd_store_iter_get_next (iter, &key, &value, &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
if (ret)
goto out;
while (!ret) {
if (!strncmp (key, GLUSTERD_STORE_KEY_RB_STATUS,
strlen (GLUSTERD_STORE_KEY_RB_STATUS))) {
- volinfo->rb_status = atoi (value);
+ volinfo->rep_brick.rb_status = atoi (value);
}
- if (volinfo->rb_status > GF_RB_STATUS_NONE) {
+ if (volinfo->rep_brick.rb_status > GF_RB_STATUS_NONE) {
if (!strncmp (key, GLUSTERD_STORE_KEY_RB_SRC_BRICK,
strlen (GLUSTERD_STORE_KEY_RB_SRC_BRICK))) {
ret = glusterd_brickinfo_new_from_brick (value,
- &volinfo->src_brick);
+ &volinfo->rep_brick.src_brick);
if (ret)
goto out;
} else if (!strncmp (key, GLUSTERD_STORE_KEY_RB_DST_BRICK,
strlen (GLUSTERD_STORE_KEY_RB_DST_BRICK))) {
ret = glusterd_brickinfo_new_from_brick (value,
- &volinfo->dst_brick);
+ &volinfo->rep_brick.dst_brick);
if (ret)
goto out;
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_RB_DST_PORT,
+ strlen (GLUSTERD_STORE_KEY_RB_DST_PORT))) {
+ switch (volinfo->transport_type) {
+ case GF_TRANSPORT_RDMA:
+ volinfo->rep_brick.dst_brick->rdma_port =
+ atoi (value);
+ break;
+
+ case GF_TRANSPORT_TCP:
+ case GF_TRANSPORT_BOTH_TCP_RDMA:
+ volinfo->rep_brick.dst_brick->port =
+ atoi (value);
+ break;
+ }
+ } else if (!strncmp (key, GF_REPLACE_BRICK_TID_KEY,
+ strlen (GF_REPLACE_BRICK_TID_KEY))) {
+ uuid_parse (value,
+ volinfo->rep_brick.rb_id);
}
}
@@ -2052,128 +2180,176 @@ glusterd_store_retrieve_rbstate (char *volname)
key = NULL;
value = NULL;
- ret = glusterd_store_iter_get_next (iter, &key, &value,
- &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
}
if (op_errno != GD_STORE_EOF)
goto out;
- ret = glusterd_store_iter_destroy (iter);
+ ret = gf_store_iter_destroy (iter);
if (ret)
goto out;
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
return ret;
}
int32_t
-glusterd_store_retrieve_node_state (char *volname)
+glusterd_store_retrieve_node_state (glusterd_volinfo_t *volinfo)
{
- int32_t ret = -1;
- glusterd_volinfo_t *volinfo = NULL;
- glusterd_store_iter_t *iter = NULL;
- char *key = NULL;
- char *value = NULL;
- char volpath[PATH_MAX] = {0,};
- glusterd_conf_t *priv = NULL;
- char path[PATH_MAX] = {0,};
- glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+ int32_t ret = -1;
+ gf_store_iter_t *iter = NULL;
+ char *key = NULL;
+ char *value = NULL;
+ char *dup_value = NULL;
+ char volpath[PATH_MAX] = {0,};
+ glusterd_conf_t *priv = NULL;
+ char path[PATH_MAX] = {0,};
+ gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+ dict_t *tmp_dict = NULL;
+ xlator_t *this = NULL;
- priv = THIS->private;
-
- ret = glusterd_volinfo_find (volname, &volinfo);
- if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Couldn't get"
- "volinfo for %s.", volname);
- goto out;
- }
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (volinfo);
GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, priv);
snprintf (path, sizeof (path), "%s/%s", volpath,
GLUSTERD_NODE_STATE_FILE);
- ret = glusterd_store_handle_retrieve (path,
- &volinfo->node_state_shandle);
-
+ ret = gf_store_handle_retrieve (path, &volinfo->node_state_shandle);
if (ret)
goto out;
- ret = glusterd_store_iter_new (volinfo->node_state_shandle, &iter);
+ ret = gf_store_iter_new (volinfo->node_state_shandle, &iter);
if (ret)
goto out;
- ret = glusterd_store_iter_get_next (iter, &key, &value, &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
if (ret)
goto out;
- if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_DEFRAG,
- strlen (GLUSTERD_STORE_KEY_VOL_DEFRAG))) {
- volinfo->defrag_cmd = atoi (value);
- }
- GF_FREE (key);
- GF_FREE (value);
+ while (ret == 0) {
+ if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_DEFRAG,
+ strlen (GLUSTERD_STORE_KEY_VOL_DEFRAG))) {
+ volinfo->rebal.defrag_cmd = atoi (value);
+ } else if (!strncmp (key, GF_REBALANCE_TID_KEY,
+ strlen (GF_REBALANCE_TID_KEY))) {
+ uuid_parse (value, volinfo->rebal.rebalance_id);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_DEFRAG_OP,
+ strlen (GLUSTERD_STORE_KEY_DEFRAG_OP))) {
+ volinfo->rebal.op = atoi (value);
+ } else {
+ if (!tmp_dict) {
+ tmp_dict = dict_new ();
+ if (!tmp_dict) {
+ ret = -1;
+ goto out;
+ }
+ }
+ dup_value = gf_strdup (value);
+ if (!dup_value) {
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to strdup value string");
+ goto out;
+ }
+ ret = dict_set_str (tmp_dict, key, dup_value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Error setting data in rebal "
+ "dict.");
+ goto out;
+ }
+ dup_value = NULL;
+ }
+
+ GF_FREE (key);
+ GF_FREE (value);
+ key = NULL;
+ value = NULL;
- if (op_errno != GD_STORE_EOF)
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
+ }
+ if (tmp_dict)
+ volinfo->rebal.dict = dict_ref (tmp_dict);
+
+ if (op_errno != GD_STORE_EOF) {
+ ret = -1;
goto out;
+ }
- ret = glusterd_store_iter_destroy (iter);
+ ret = gf_store_iter_destroy (iter);
if (ret)
goto out;
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ if (dup_value)
+ GF_FREE (dup_value);
+ if (ret && volinfo->rebal.dict)
+ dict_unref (volinfo->rebal.dict);
+ if (tmp_dict)
+ dict_unref (tmp_dict);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
return ret;
}
-int32_t
-glusterd_store_retrieve_volume (char *volname)
-{
- int32_t ret = -1;
- glusterd_volinfo_t *volinfo = NULL;
- glusterd_store_iter_t *iter = NULL;
- char *key = NULL;
- char *value = NULL;
- char volpath[PATH_MAX] = {0,};
- glusterd_conf_t *priv = NULL;
- char path[PATH_MAX] = {0,};
- int exists = 0;
- glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS;
-
- ret = glusterd_volinfo_new (&volinfo);
- if (ret)
- goto out;
+int
+glusterd_store_update_volinfo (glusterd_volinfo_t *volinfo)
+{
+ int ret = -1;
+ int exists = 0;
+ char *key = NULL;
+ char *value = NULL;
+ char volpath[PATH_MAX] = {0,};
+ char path[PATH_MAX] = {0,};
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ gf_store_iter_t *iter = NULL;
+ gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
- strncpy (volinfo->volname, volname, GLUSTERD_MAX_VOLUME_NAME);
+ this = THIS;
+ GF_ASSERT (this);
+ conf = THIS->private;
+ GF_ASSERT (volinfo);
- priv = THIS->private;
+ GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, conf);
- GLUSTERD_GET_VOLUME_DIR(volpath, volinfo, priv);
snprintf (path, sizeof (path), "%s/%s", volpath,
GLUSTERD_VOLUME_INFO_FILE);
- ret = glusterd_store_handle_retrieve (path, &volinfo->shandle);
-
- if (ret)
+ ret = gf_store_handle_retrieve (path, &volinfo->shandle);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "volinfo handle is NULL");
goto out;
+ }
- ret = glusterd_store_iter_new (volinfo->shandle, &iter);
-
- if (ret)
+ ret = gf_store_iter_new (volinfo->shandle, &iter);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get new store "
+ "iter");
goto out;
+ }
- ret = glusterd_store_iter_get_next (iter, &key, &value, &op_errno);
- if (ret)
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get next store "
+ "iter");
goto out;
+ }
while (!ret) {
+ gf_log ("", GF_LOG_DEBUG, "key = %s value = %s", key, value);
if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_TYPE,
strlen (GLUSTERD_STORE_KEY_VOL_TYPE))) {
volinfo->type = atoi (value);
@@ -2233,6 +2409,27 @@ glusterd_store_retrieve_volume (char *volname)
gf_log ("", GF_LOG_DEBUG, "Parsed as "GEOREP" "
" slave:key=%s,value:%s", key, value);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_OP_VERSION,
+ strlen (GLUSTERD_STORE_KEY_VOL_OP_VERSION))) {
+ volinfo->op_version = atoi (value);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION,
+ strlen (GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION))) {
+ volinfo->client_op_version = atoi (value);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_CAPS,
+ strlen (GLUSTERD_STORE_KEY_VOL_CAPS))) {
+ volinfo->caps = atoi (value);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT,
+ strlen (GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT))) {
+ volinfo->snap_max_hard_limit = (uint64_t) atoll (value);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_VOL_RESTORED_SNAP,
+ strlen (GLUSTERD_STORE_KEY_VOL_RESTORED_SNAP))) {
+ ret = uuid_parse (value, volinfo->restored_from_snap);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to parse restored snap's uuid");
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_PARENT_VOLNAME,
+ strlen (GLUSTERD_STORE_KEY_PARENT_VOLNAME))) {
+ strncpy (volinfo->parent_volname, value, sizeof(volinfo->parent_volname) - 1);
} else {
if (is_key_glusterd_hooks_friendly (key)) {
@@ -2249,11 +2446,22 @@ glusterd_store_retrieve_volume (char *volname)
goto out;
case 0:
- gf_log ("", GF_LOG_ERROR, "Unknown key: %s",
- key);
+ /*Ignore GLUSTERD_STORE_KEY_VOL_BRICK since
+ glusterd_store_retrieve_bricks gets it later*/
+ if (!strstr (key, GLUSTERD_STORE_KEY_VOL_BRICK))
+ gf_log ("", GF_LOG_WARNING,
+ "Unknown key: %s", key);
break;
case 1:
+ /*The following strcmp check is to ensure that
+ * glusterd does not restore the quota limits
+ * into volinfo->dict post upgradation from 3.3
+ * to 3.4 as the same limits will now be stored
+ * in xattrs on the respective directories.
+ */
+ if (!strcmp (key, "features.limit-usage"))
+ break;
ret = dict_set_str(volinfo->dict, key,
gf_strdup (value));
if (ret) {
@@ -2272,8 +2480,7 @@ glusterd_store_retrieve_volume (char *volname)
key = NULL;
value = NULL;
- ret = glusterd_store_iter_get_next (iter, &key, &value,
- &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
}
/* backward compatibility */
@@ -2307,39 +2514,214 @@ glusterd_store_retrieve_volume (char *volname)
break;
}
- volinfo->dist_leaf_count = (volinfo->stripe_count *
- volinfo->replica_count);
+ volinfo->dist_leaf_count = glusterd_get_dist_leaf_count (volinfo);
+
+ volinfo->subvol_count = (volinfo->brick_count /
+ volinfo->dist_leaf_count);
+
+ /* Only calculate volume op-versions if they are not found */
+ if (!volinfo->op_version && !volinfo->client_op_version)
+ gd_update_volume_op_versions (volinfo);
}
if (op_errno != GD_STORE_EOF)
goto out;
- ret = glusterd_store_iter_destroy (iter);
+ ret = gf_store_iter_destroy (iter);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to destroy store "
+ "iter");
+ goto out;
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+glusterd_volinfo_t*
+glusterd_store_retrieve_volume (char *volname, glusterd_snap_t *snap)
+{
+ int32_t ret = -1;
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_volinfo_t *origin_volinfo = NULL;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (volname);
+
+ ret = glusterd_volinfo_new (&volinfo);
if (ret)
goto out;
+ strncpy (volinfo->volname, volname, GLUSTERD_MAX_VOLUME_NAME);
+ volinfo->snapshot = snap;
+ if (snap)
+ volinfo->is_snap_volume = _gf_true;
+
+ ret = glusterd_store_update_volinfo (volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to update volinfo "
+ "for %s volume", volname);
+ goto out;
+ }
+
ret = glusterd_store_retrieve_bricks (volinfo);
if (ret)
goto out;
- ret = glusterd_volume_compute_cksum (volinfo);
+ ret = glusterd_compute_cksum (volinfo, _gf_false);
+ if (ret)
+ goto out;
+
+ ret = glusterd_store_retrieve_quota_version (volinfo);
+ if (ret)
+ goto out;
+
+ ret = glusterd_store_create_quota_conf_sh_on_absence (volinfo);
+ if (ret)
+ goto out;
+
+ ret = glusterd_compute_cksum (volinfo, _gf_true);
+ if (ret)
+ goto out;
+
+ ret = glusterd_store_save_quota_version_and_cksum (volinfo);
if (ret)
goto out;
- list_add_tail (&volinfo->vol_list, &priv->volumes);
+
+ if (!snap) {
+ list_add_order (&volinfo->vol_list, &priv->volumes,
+ glusterd_compare_volume_name);
+ } else {
+ ret = glusterd_volinfo_find (volinfo->parent_volname,
+ &origin_volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Parent volinfo "
+ "not found for %s volume", volname);
+ goto out;
+ }
+ glusterd_list_add_snapvol (origin_volinfo, volinfo);
+ }
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ if (ret) {
+ if (volinfo)
+ glusterd_volinfo_delete (volinfo);
+ volinfo = NULL;
+ }
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+
+ return volinfo;
+}
+
+static inline void
+glusterd_store_set_options_path (glusterd_conf_t *conf, char *path, size_t len)
+{
+ snprintf (path, len, "%s/options", conf->workdir);
+}
+
+int
+_store_global_opts (dict_t *this, char *key, data_t *value, void *data)
+{
+ gf_store_handle_t *shandle = data;
+
+ gf_store_save_value (shandle->fd, key, (char*)value->data);
+ return 0;
+}
+
+int32_t
+glusterd_store_options (xlator_t *this, dict_t *opts)
+{
+ gf_store_handle_t *shandle = NULL;
+ glusterd_conf_t *conf = NULL;
+ char path[PATH_MAX] = {0};
+ int fd = -1;
+ int32_t ret = -1;
+
+ conf = this->private;
+ glusterd_store_set_options_path (conf, path, sizeof (path));
+
+ ret = gf_store_handle_new (path, &shandle);
+ if (ret)
+ goto out;
+
+ fd = gf_store_mkstemp (shandle);
+ if (fd <= 0) {
+ ret = -1;
+ goto out;
+ }
+ shandle->fd = fd;
+ dict_foreach (opts, _store_global_opts, shandle);
+ shandle->fd = 0;
+ ret = gf_store_rename_tmppath (shandle);
+ if (ret)
+ goto out;
+out:
+ gf_store_handle_destroy (shandle);
+ if (fd >=0 )
+ close (fd);
return ret;
}
+int32_t
+glusterd_store_retrieve_options (xlator_t *this)
+{
+ char path[PATH_MAX] = {0};
+ glusterd_conf_t *conf = NULL;
+ gf_store_handle_t *shandle = NULL;
+ gf_store_iter_t *iter = NULL;
+ char *key = NULL;
+ char *value = NULL;
+ gf_store_op_errno_t op_errno = 0;
+ int ret = -1;
+
+ conf = this->private;
+ glusterd_store_set_options_path (conf, path, sizeof (path));
+
+ ret = gf_store_handle_retrieve (path, &shandle);
+ if (ret)
+ goto out;
+
+ ret = gf_store_iter_new (shandle, &iter);
+ if (ret)
+ goto out;
+
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
+ while (!ret) {
+ ret = dict_set_dynstr (conf->opts, key, value);
+ if (ret) {
+ GF_FREE (key);
+ GF_FREE (value);
+ goto out;
+ }
+ GF_FREE (key);
+ key = NULL;
+ value = NULL;
+
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
+ }
+ if (op_errno != GD_STORE_EOF)
+ goto out;
+ ret = 0;
+out:
+ gf_store_iter_destroy (iter);
+ gf_store_handle_destroy (shandle);
+ return ret;
+}
int32_t
-glusterd_store_retrieve_volumes (xlator_t *this)
+glusterd_store_retrieve_volumes (xlator_t *this, glusterd_snap_t *snap)
{
- int32_t ret = 0;
+ int32_t ret = -1;
char path[PATH_MAX] = {0,};
glusterd_conf_t *priv = NULL;
DIR *dir = NULL;
@@ -2351,51 +2733,58 @@ glusterd_store_retrieve_volumes (xlator_t *this)
GF_ASSERT (priv);
- snprintf (path, PATH_MAX, "%s/%s", priv->workdir,
- GLUSTERD_VOLUME_DIR_PREFIX);
+ if (snap)
+ snprintf (path, PATH_MAX, "%s/snaps/%s", priv->workdir,
+ snap->snapname);
+ else
+ snprintf (path, PATH_MAX, "%s/%s", priv->workdir,
+ GLUSTERD_VOLUME_DIR_PREFIX);
dir = opendir (path);
if (!dir) {
gf_log ("", GF_LOG_ERROR, "Unable to open dir %s", path);
- ret = -1;
goto out;
}
glusterd_for_each_entry (entry, dir);
while (entry) {
- ret = glusterd_store_retrieve_volume (entry->d_name);
- if (ret) {
+ if ( entry->d_type != DT_DIR )
+ goto next;
+
+ volinfo = glusterd_store_retrieve_volume (entry->d_name, snap);
+ if (!volinfo) {
gf_log ("", GF_LOG_ERROR, "Unable to restore "
"volume: %s", entry->d_name);
+ ret = -1;
goto out;
}
- ret = glusterd_store_retrieve_rbstate (entry->d_name);
+ ret = glusterd_store_retrieve_rbstate (volinfo);
if (ret) {
/* Backward compatibility */
gf_log ("", GF_LOG_INFO, "Creating a new rbstate "
"for volume: %s.", entry->d_name);
- ret = glusterd_volinfo_find (entry->d_name, &volinfo);
ret = glusterd_store_create_rbstate_shandle_on_absence (volinfo);
ret = glusterd_store_perform_rbstate_store (volinfo);
}
- ret = glusterd_store_retrieve_node_state (entry->d_name);
+ ret = glusterd_store_retrieve_node_state (volinfo);
if (ret) {
/* Backward compatibility */
gf_log ("", GF_LOG_INFO, "Creating a new node_state "
"for volume: %s.", entry->d_name);
- ret = glusterd_volinfo_find (entry->d_name, &volinfo);
- ret =
glusterd_store_create_nodestate_sh_on_absence (volinfo);
ret = glusterd_store_perform_node_state_store (volinfo);
}
+next:
glusterd_for_each_entry (entry, dir);
}
+ ret = 0;
+
out:
if (dir)
closedir (dir);
@@ -2404,6 +2793,635 @@ out:
return ret;
}
+/* Figure out the brick mount path, from the brick path */
+int32_t
+glusterd_find_brick_mount_path (char *brick_path, int32_t brick_count,
+ char **brick_mount_path)
+{
+ char brick_num[PATH_MAX] = "";
+ char *ptr = NULL;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (brick_path);
+ GF_ASSERT (brick_mount_path);
+
+ *brick_mount_path = gf_strdup (brick_path);
+ if (!*brick_mount_path) {
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (brick_num, sizeof(brick_num), "brick%d", brick_count);
+
+ /* Finding the pointer to the end of
+ * /var/run/gluster/snaps/<snap-uuid>
+ */
+ ptr = strstr (*brick_mount_path, brick_num);
+ if (!ptr) {
+ /* Snapshot bricks must have brick num as part
+ * of the brickpath
+ */
+ gf_log (this->name, GF_LOG_ERROR,
+ "Invalid brick path(%s)", brick_path);
+ ret = -1;
+ goto out;
+ }
+
+ /* Moving the pointer to the end of
+ * /var/run/gluster/snaps/<snap-uuid>/<brick_num>
+ * and assigning '\0' to it.
+ */
+ ptr += strlen(brick_num);
+ *ptr = '\0';
+
+ ret = 0;
+out:
+ if (ret && *brick_mount_path) {
+ GF_FREE (*brick_mount_path);
+ *brick_mount_path = NULL;
+ }
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+ return ret;
+}
+
+/* Check if brick_mount_path is already mounted. If not, mount the device_path
+ * at the brick_mount_path
+ */
+int32_t
+glusterd_mount_brick_paths (char *brick_mount_path, char *device_path)
+{
+ FILE *mtab = NULL;
+ int32_t ret = -1;
+ runner_t runner = {0, };
+ struct mntent *entry = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (brick_mount_path);
+ GF_ASSERT (device_path);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ /* Check if the brick_mount_path is already mounted */
+ entry = glusterd_get_mnt_entry_info (brick_mount_path, mtab);
+ if (entry) {
+ gf_log (this->name, GF_LOG_INFO,
+ "brick_mount_path (%s) already mounted.",
+ brick_mount_path);
+ ret = 0;
+ goto out;
+ }
+
+ /* TODO RHEL 6.5 has the logical volumes inactive by default
+ * on reboot. Hence activating the logical vol. Check behaviour
+ * on other systems
+ */
+ /* Activate the snapshot */
+ runinit (&runner);
+ runner_add_args (&runner, "lvchange", "-ay", device_path,
+ NULL);
+ ret = runner_run (&runner);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to activate %s. Error: %s",
+ device_path, strerror(errno));
+ goto out;
+ } else
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Activating %s successful", device_path);
+
+ /* Mount the snapshot */
+ ret = glusterd_mount_lvm_snapshot (device_path, brick_mount_path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to mount lvm snapshot.");
+ goto out;
+ }
+
+out:
+ if (mtab)
+ endmntent (mtab);
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+ return ret;
+}
+
+static int32_t
+glusterd_store_recreate_brick_mounts (glusterd_volinfo_t *volinfo)
+{
+ char *brick_mount_path = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ int32_t ret = -1;
+ int32_t brick_count = -1;
+ struct stat st_buf = {0, };
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (volinfo);
+
+ brick_count = 0;
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ brick_count++;
+ /* If the brick is not of this node, or its
+ * snapshot is pending, or the brick is not
+ * a snapshotted brick, we continue
+ */
+ if ((uuid_compare (brickinfo->uuid, MY_UUID)) ||
+ (brickinfo->snap_status == -1) ||
+ (strlen(brickinfo->device_path) == 0))
+ continue;
+
+ /* Fetch the brick mount path from the brickinfo->path */
+ ret = glusterd_find_brick_mount_path (brickinfo->path,
+ brick_count,
+ &brick_mount_path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to find brick_mount_path for %s",
+ brickinfo->path);
+ goto out;
+ }
+
+ /* Check if the brickinfo path is present.
+ * If not create the brick_mount_path */
+ ret = lstat (brickinfo->path, &st_buf);
+ if (ret) {
+ if (errno == ENOENT) {
+ ret = mkdir_p (brick_mount_path, 0777,
+ _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to create %s. "
+ "Error: %s", brick_mount_path,
+ strerror (errno));
+ goto out;
+ }
+ } else {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Brick Path(%s) not valid. "
+ "Error: %s", brickinfo->path,
+ strerror(errno));
+ goto out;
+ }
+ }
+
+ /* Check if brick_mount_path is already mounted.
+ * If not, mount the device_path at the brick_mount_path */
+ ret = glusterd_mount_brick_paths (brick_mount_path,
+ brickinfo->device_path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to mount brick_mount_path");
+ goto out;
+ }
+
+ if (brick_mount_path) {
+ GF_FREE (brick_mount_path);
+ brick_mount_path = NULL;
+ }
+ }
+
+ ret = 0;
+out:
+ if (ret && brick_mount_path)
+ GF_FREE (brick_mount_path);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_resolve_snap_bricks (xlator_t *this, glusterd_snap_t *snap)
+{
+ int32_t ret = -1;
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+
+ GF_ASSERT (this);
+ GF_VALIDATE_OR_GOTO (this->name, snap, out);
+
+ list_for_each_entry (volinfo, &snap->volumes, vol_list) {
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ ret = glusterd_resolve_brick (brickinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "resolve brick failed in restore");
+ goto out;
+ }
+ }
+ }
+
+ ret = 0;
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+
+ return ret;
+}
+
+int
+glusterd_store_update_snap (glusterd_snap_t *snap)
+{
+ int ret = -1;
+ char *key = NULL;
+ char *value = NULL;
+ char snappath[PATH_MAX] = {0,};
+ char path[PATH_MAX] = {0,};
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ gf_store_iter_t *iter = NULL;
+ gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+
+ this = THIS;
+ conf = this->private;
+ GF_ASSERT (snap);
+
+ GLUSTERD_GET_SNAP_DIR (snappath, snap, conf);
+
+ snprintf (path, sizeof (path), "%s/%s", snappath,
+ GLUSTERD_SNAP_INFO_FILE);
+
+ ret = gf_store_handle_retrieve (path, &snap->shandle);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "snap handle is NULL");
+ goto out;
+ }
+
+ ret = gf_store_iter_new (snap->shandle, &iter);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get new store "
+ "iter");
+ goto out;
+ }
+
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get next store "
+ "iter");
+ goto out;
+ }
+
+ while (!ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "key = %s value = %s",
+ key, value);
+
+ if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_ID,
+ strlen (GLUSTERD_STORE_KEY_SNAP_ID))) {
+ ret = uuid_parse (value, snap->snap_id);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING,
+ "Failed to parse uuid");
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_RESTORED,
+ strlen (GLUSTERD_STORE_KEY_SNAP_RESTORED))) {
+ snap->snap_restored = atoi (value);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_STATUS,
+ strlen (GLUSTERD_STORE_KEY_SNAP_STATUS))) {
+ snap->snap_status = atoi (value);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_DESC,
+ strlen (GLUSTERD_STORE_KEY_SNAP_DESC))) {
+ snap->description = gf_strdup (value);
+ } else if (!strncmp (key, GLUSTERD_STORE_KEY_SNAP_TIMESTAMP,
+ strlen (GLUSTERD_STORE_KEY_SNAP_TIMESTAMP))) {
+ snap->time_stamp = atoi (value);
+ }
+
+ GF_FREE (key);
+ GF_FREE (value);
+ key = NULL;
+ value = NULL;
+
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
+ }
+
+ if (op_errno != GD_STORE_EOF)
+ goto out;
+
+ ret = gf_store_iter_destroy (iter);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to destroy store "
+ "iter");
+ }
+
+out:
+ return ret;
+}
+
+int32_t
+glusterd_store_retrieve_snap (char *snapname)
+{
+ int32_t ret = -1;
+ glusterd_snap_t *snap = NULL;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (snapname);
+
+ snap = glusterd_new_snap_object ();
+ if (!snap) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to create "
+ " snap object");
+ goto out;
+ }
+
+ strncpy (snap->snapname, snapname, strlen(snapname));
+ ret = glusterd_store_update_snap (snap);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to update snapshot "
+ "for %s snap", snapname);
+ goto out;
+ }
+
+ ret = glusterd_store_retrieve_volumes (this, snap);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to retrieve "
+ "snap volumes for snap %s", snapname);
+ goto out;
+ }
+
+ /* TODO: list_add_order can do 'N-square' comparisions and
+ is not efficient. Find a better solution to store the snap
+ in order */
+ list_add_order (&snap->snap_list, &priv->snapshots,
+ glusterd_compare_snap_time);
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+ return ret;
+}
+
+/* Read the missed_snap_list and update the in-memory structs */
+int32_t
+glusterd_store_retrieve_missed_snaps_list (xlator_t *this)
+{
+ char buf[PATH_MAX] = "";
+ char path[PATH_MAX] = "";
+ char *snap_vol_id = NULL;
+ char *missed_node_info = NULL;
+ char *brick_path = NULL;
+ char *value = NULL;
+ char *save_ptr = NULL;
+ FILE *fp = NULL;
+ int32_t brick_num = -1;
+ int32_t snap_op = -1;
+ int32_t snap_status = -1;
+ int32_t ret = -1;
+ glusterd_conf_t *priv = NULL;
+ gf_store_op_errno_t store_errno = GD_STORE_SUCCESS;
+
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ /* Get the path of the missed_snap_list */
+ glusterd_store_missed_snaps_list_path_set (path, sizeof(path));
+
+ fp = fopen (path, "r");
+ if (!fp) {
+ /* If errno is ENOENT then there are no missed snaps yet */
+ if (errno != ENOENT) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to open %s. "
+ "Error: %s", path, strerror(errno));
+ } else {
+ gf_log (this->name, GF_LOG_INFO,
+ "No missed snaps list.");
+ ret = 0;
+ }
+ goto out;
+ }
+
+ do {
+ ret = gf_store_read_and_tokenize (fp, buf,
+ &missed_node_info, &value,
+ &store_errno);
+ if (ret) {
+ if (store_errno == GD_STORE_EOF) {
+ gf_log (this->name,
+ GF_LOG_DEBUG,
+ "EOF for missed_snap_list");
+ ret = 0;
+ break;
+ }
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to fetch data from "
+ "missed_snaps_list. Error: %s",
+ gf_store_strerror (store_errno));
+ goto out;
+ }
+
+ /* Fetch the brick_num, brick_path, snap_op and snap status */
+ snap_vol_id = strtok_r (value, ":", &save_ptr);
+ brick_num = atoi(strtok_r (NULL, ":", &save_ptr));
+ brick_path = strtok_r (NULL, ":", &save_ptr);
+ snap_op = atoi(strtok_r (NULL, ":", &save_ptr));
+ snap_status = atoi(strtok_r (NULL, ":", &save_ptr));
+
+ if (!missed_node_info || !brick_path || !snap_vol_id ||
+ brick_num < 1 || snap_op < 1 ||
+ snap_status < 1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Invalid missed_snap_entry");
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_add_new_entry_to_list (missed_node_info,
+ snap_vol_id,
+ brick_num,
+ brick_path,
+ snap_op,
+ snap_status);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to store missed snaps_list");
+ goto out;
+ }
+
+ } while (store_errno == GD_STORE_SUCCESS);
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_store_retrieve_snaps (xlator_t *this)
+{
+ int32_t ret = 0;
+ char path[PATH_MAX] = {0,};
+ glusterd_conf_t *priv = NULL;
+ DIR *dir = NULL;
+ struct dirent *entry = NULL;
+
+ GF_ASSERT (this);
+ priv = this->private;
+
+ GF_ASSERT (priv);
+
+ snprintf (path, PATH_MAX, "%s/snaps", priv->workdir);
+
+ dir = opendir (path);
+
+ if (!dir) {
+ /* If snaps dir doesn't exists ignore the error for
+ backward compatibility */
+ if (errno != ENOENT) {
+ ret = -1;
+ gf_log ("", GF_LOG_ERROR, "Unable to open dir %s", path);
+ }
+ goto out;
+ }
+
+ glusterd_for_each_entry (entry, dir);
+
+ while (entry) {
+ if (entry->d_type == DT_DIR) {
+ ret = glusterd_store_retrieve_snap (entry->d_name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to restore snapshot: %s",
+ entry->d_name);
+ goto out;
+ }
+ }
+
+ glusterd_for_each_entry (entry, dir);
+ }
+
+ /* Retrieve missed_snaps_list */
+ ret = glusterd_store_retrieve_missed_snaps_list (this);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Failed to retrieve missed_snaps_list");
+ goto out;
+ }
+
+out:
+ if (dir)
+ closedir (dir);
+ gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+
+ return ret;
+}
+
+/* Writes all the contents of conf->missed_snap_list */
+int32_t
+glusterd_store_write_missed_snapinfo (int32_t fd)
+{
+ char key[PATH_MAX] = "";
+ char value[PATH_MAX] = "";
+ int32_t ret = -1;
+ glusterd_conf_t *priv = NULL;
+ glusterd_missed_snap_info *missed_snapinfo = NULL;
+ glusterd_snap_op_t *snap_opinfo = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ /* Write the missed_snap_entry */
+ list_for_each_entry (missed_snapinfo, &priv->missed_snaps_list,
+ missed_snaps) {
+ list_for_each_entry (snap_opinfo,
+ &missed_snapinfo->snap_ops,
+ snap_ops_list) {
+ snprintf (key, sizeof(key), "%s:%s",
+ missed_snapinfo->node_uuid,
+ missed_snapinfo->snap_uuid);
+ snprintf (value, sizeof(value), "%s:%d:%s:%d:%d",
+ snap_opinfo->snap_vol_id,
+ snap_opinfo->brick_num,
+ snap_opinfo->brick_path,
+ snap_opinfo->op, snap_opinfo->status);
+ ret = gf_store_save_value (fd, key, value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to write missed snapinfo");
+ goto out;
+ }
+ }
+ }
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Adds the missed snap entries to the in-memory conf->missed_snap_list *
+ * and writes them to disk */
+int32_t
+glusterd_store_update_missed_snaps ()
+{
+ int32_t fd = -1;
+ int32_t ret = -1;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT(this);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = glusterd_store_create_missed_snaps_list_shandle_on_absence ();
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to obtain "
+ "missed_snaps_list store handle.");
+ goto out;
+ }
+
+ fd = gf_store_mkstemp (priv->missed_snaps_list_shandle);
+ if (fd <= 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to create tmp file");
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_store_write_missed_snapinfo (fd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to write missed snaps to disk");
+ goto out;
+ }
+
+ ret = gf_store_rename_tmppath (priv->missed_snaps_list_shandle);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to rename the tmp file");
+ goto out;
+ }
+out:
+ if (ret && (fd > 0)) {
+ ret = gf_store_unlink_tmppath (priv->missed_snaps_list_shandle);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to unlink the tmp file");
+ }
+ ret = -1;
+ }
+
+ if (fd > 0)
+ close (fd);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
int32_t
glusterd_store_delete_peerinfo (glusterd_peerinfo_t *peerinfo)
{
@@ -2452,7 +3470,7 @@ glusterd_store_delete_peerinfo (glusterd_peerinfo_t *peerinfo)
out:
if (peerinfo->shandle) {
- glusterd_store_handle_destroy (peerinfo->shandle);
+ gf_store_handle_destroy (peerinfo->shandle);
peerinfo->shandle = NULL;
}
gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
@@ -2478,7 +3496,7 @@ glusterd_store_create_peer_dir ()
char path[PATH_MAX];
glusterd_store_peerinfo_dirpath_set (path, sizeof (path));
- ret = glusterd_store_mkdir (path);
+ ret = gf_store_mkdir (path);
gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
@@ -2522,8 +3540,8 @@ glusterd_store_peerinfo_hostname_shandle_create (glusterd_peerinfo_t *peerinfo)
glusterd_store_hostname_peerpath_set (peerinfo, peerfpath,
sizeof (peerfpath));
- ret = glusterd_store_handle_create_on_absence (&peerinfo->shandle,
- peerfpath);
+ ret = gf_store_handle_create_on_absence (&peerinfo->shandle,
+ peerfpath);
return ret;
}
@@ -2535,8 +3553,8 @@ glusterd_store_peerinfo_uuid_shandle_create (glusterd_peerinfo_t *peerinfo)
glusterd_store_uuid_peerpath_set (peerinfo, peerfpath,
sizeof (peerfpath));
- ret = glusterd_store_handle_create_on_absence (&peerinfo->shandle,
- peerfpath);
+ ret = gf_store_handle_create_on_absence (&peerinfo->shandle,
+ peerfpath);
return ret;
}
@@ -2552,7 +3570,7 @@ glusterd_peerinfo_hostname_shandle_check_destroy (glusterd_peerinfo_t *peerinfo)
ret = stat (peerfpath, &stbuf);
if (!ret) {
if (peerinfo->shandle)
- glusterd_store_handle_destroy (peerinfo->shandle);
+ gf_store_handle_destroy (peerinfo->shandle);
peerinfo->shandle = NULL;
ret = unlink (peerfpath);
}
@@ -2581,18 +3599,18 @@ glusterd_store_peer_write (int fd, glusterd_peerinfo_t *peerinfo)
char buf[50] = {0};
int32_t ret = 0;
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_UUID,
- uuid_utoa (peerinfo->uuid));
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_UUID,
+ uuid_utoa (peerinfo->uuid));
if (ret)
goto out;
snprintf (buf, sizeof (buf), "%d", peerinfo->state.state);
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_STATE, buf);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_STATE, buf);
if (ret)
goto out;
- ret = glusterd_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_HOSTNAME "1",
- peerinfo->hostname);
+ ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_PEER_HOSTNAME "1",
+ peerinfo->hostname);
if (ret)
goto out;
@@ -2609,7 +3627,7 @@ glusterd_store_perform_peer_store (glusterd_peerinfo_t *peerinfo)
GF_ASSERT (peerinfo);
- fd = glusterd_store_mkstemp (peerinfo->shandle);
+ fd = gf_store_mkstemp (peerinfo->shandle);
if (fd <= 0) {
ret = -1;
goto out;
@@ -2619,10 +3637,10 @@ glusterd_store_perform_peer_store (glusterd_peerinfo_t *peerinfo)
if (ret)
goto out;
- ret = glusterd_store_rename_tmppath (peerinfo->shandle);
+ ret = gf_store_rename_tmppath (peerinfo->shandle);
out:
if (ret && (fd > 0))
- glusterd_store_unlink_tmppath (peerinfo->shandle);
+ gf_store_unlink_tmppath (peerinfo->shandle);
if (fd > 0)
close (fd);
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
@@ -2662,13 +3680,13 @@ glusterd_store_retrieve_peers (xlator_t *this)
uuid_t uuid = {0,};
char *hostname = NULL;
int32_t state = 0;
- glusterd_store_handle_t *shandle = NULL;
+ gf_store_handle_t *shandle = NULL;
char filepath[PATH_MAX] = {0,};
- glusterd_store_iter_t *iter = NULL;
+ gf_store_iter_t *iter = NULL;
char *key = NULL;
char *value = NULL;
glusterd_peerctx_args_t args = {0};
- glusterd_store_op_errno_t op_errno = GD_STORE_SUCCESS;
+ gf_store_op_errno_t op_errno = GD_STORE_SUCCESS;
GF_ASSERT (this);
priv = this->private;
@@ -2690,16 +3708,15 @@ glusterd_store_retrieve_peers (xlator_t *this)
while (entry) {
snprintf (filepath, PATH_MAX, "%s/%s", path, entry->d_name);
- ret = glusterd_store_handle_retrieve (filepath, &shandle);
+ ret = gf_store_handle_retrieve (filepath, &shandle);
if (ret)
goto out;
- ret = glusterd_store_iter_new (shandle, &iter);
+ ret = gf_store_iter_new (shandle, &iter);
if (ret)
goto out;
- ret = glusterd_store_iter_get_next (iter, &key, &value,
- &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value, &op_errno);
if (ret)
goto out;
@@ -2727,17 +3744,18 @@ glusterd_store_retrieve_peers (xlator_t *this)
key = NULL;
value = NULL;
- ret = glusterd_store_iter_get_next (iter, &key, &value,
- &op_errno);
+ ret = gf_store_iter_get_next (iter, &key, &value,
+ &op_errno);
}
- if (op_errno != GD_STORE_EOF)
+ if (op_errno != GD_STORE_EOF) {
+ GF_FREE(hostname);
goto out;
+ }
- (void) glusterd_store_iter_destroy (iter);
+ (void) gf_store_iter_destroy (iter);
- args.mode = GD_MODE_SWITCH_ON;
ret = glusterd_friend_add (hostname, 0, state, &uuid,
- &peerinfo, 1, &args);
+ &peerinfo, 1, NULL);
GF_FREE (hostname);
if (ret)
@@ -2747,6 +3765,13 @@ glusterd_store_retrieve_peers (xlator_t *this)
glusterd_for_each_entry (entry, dir);
}
+ args.mode = GD_MODE_ON;
+ list_for_each_entry (peerinfo, &priv->peers, uuid_list) {
+ ret = glusterd_friend_rpc_create (this, peerinfo, &args);
+ if (ret)
+ goto out;
+ }
+
out:
if (dir)
closedir (dir);
@@ -2755,19 +3780,147 @@ out:
return ret;
}
+static int32_t
+glusterd_recreate_vol_brick_mounts (xlator_t *this,
+ glusterd_volinfo_t *volinfo)
+{
+ int32_t ret = 0;
+ glusterd_conf_t *priv = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ ret = glusterd_store_recreate_brick_mounts (volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to recreate brick mounts "
+ "for %s", volinfo->volname);
+ goto out;
+ }
+ }
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+ return ret;
+}
+
+/* Bricks for snap volumes are hosted at /var/run/gluster/snaps
+ * When a volume is restored, it points to the bricks of the snap
+ * volume it was restored from. Hence on a node restart these
+ * paths need to be recreated and re-mounted
+ */
+int32_t
+glusterd_recreate_all_snap_brick_mounts (xlator_t *this)
+{
+ int32_t ret = 0;
+ glusterd_conf_t *priv = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_snap_t *snap = NULL;
+
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ /* Recreate bricks of volumes restored from snaps */
+ list_for_each_entry (volinfo, &priv->volumes, vol_list) {
+ /* If the volume is not a restored volume then continue */
+ if (uuid_is_null (volinfo->restored_from_snap))
+ continue;
+
+ ret = glusterd_recreate_vol_brick_mounts (this, volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to recreate brick mounts "
+ "for %s", volinfo->volname);
+ goto out;
+ }
+ }
+
+ /* Recreate bricks of snapshot volumes */
+ list_for_each_entry (snap, &priv->snapshots, snap_list) {
+ list_for_each_entry (volinfo, &snap->volumes, vol_list) {
+ ret = glusterd_recreate_vol_brick_mounts (this,
+ volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to recreate brick mounts "
+ "for %s", snap->snapname);
+ goto out;
+ }
+ }
+ }
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+ return ret;
+}
+
+/* When the snapshot command from cli is received, the on disk and
+ * in memory structures for the snapshot are created (with the status)
+ * being marked as GD_SNAP_STATUS_INIT. Once the backend snapshot is
+ * taken, the status is changed to GD_SNAP_STATUS_IN_USE. If glusterd
+ * dies after taking the backend snapshot, but before updating the
+ * status, then when glusterd comes up, it should treat that snapshot
+ * as a failed snapshot and clean it up.
+ */
+int32_t
+glusterd_snap_cleanup (xlator_t *this)
+{
+ dict_t *dict = NULL;
+ int32_t ret = 0;
+ glusterd_conf_t *priv = NULL;
+ glusterd_snap_t *snap = NULL;
+
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ dict = dict_new();
+ if (!dict) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to create dict");
+ ret = -1;
+ goto out;
+ }
+
+ list_for_each_entry (snap, &priv->snapshots, snap_list) {
+ if (snap->snap_status != GD_SNAP_STATUS_IN_USE) {
+ ret = glusterd_snap_remove (dict, snap,
+ _gf_true, _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to remove the snapshot %s",
+ snap->snapname);
+ goto out;
+ }
+ }
+ }
+out:
+ if (dict)
+ dict_unref (dict);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+ return ret;
+}
+
int32_t
glusterd_resolve_all_bricks (xlator_t *this)
{
- int32_t ret = 0;
- glusterd_conf_t *priv = NULL;
- glusterd_volinfo_t *volinfo = NULL;
+ int32_t ret = 0;
+ glusterd_conf_t *priv = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_snap_t *snap = NULL;
GF_ASSERT (this);
priv = this->private;
GF_ASSERT (priv);
+ /* Resolve bricks of volumes */
list_for_each_entry (volinfo, &priv->volumes, vol_list) {
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
ret = glusterd_resolve_brick (brickinfo);
@@ -2779,9 +3932,20 @@ glusterd_resolve_all_bricks (xlator_t *this)
}
}
-out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ /* Resolve bricks of snapshot volumes */
+ list_for_each_entry (snap, &priv->snapshots, snap_list) {
+ ret = glusterd_resolve_snap_bricks (this, snap);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "resolving the snap bricks"
+ " failed for snap: %s",
+ snap->snapname);
+ goto out;
+ }
+ }
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
return ret;
}
@@ -2793,8 +3957,18 @@ glusterd_restore ()
this = THIS;
- ret = glusterd_store_retrieve_volumes (this);
+ ret = glusterd_restore_op_version (this);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to restore op_version");
+ goto out;
+ }
+ ret = glusterd_store_retrieve_volumes (this, NULL);
+ if (ret)
+ goto out;
+
+ ret = glusterd_store_retrieve_snaps (this);
if (ret)
goto out;
@@ -2806,7 +3980,124 @@ glusterd_restore ()
if (ret)
goto out;
+ ret = glusterd_snap_cleanup (this);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to perform "
+ "a cleanup of the snapshots");
+ goto out;
+ }
+
+ ret = glusterd_recreate_all_snap_brick_mounts (this);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to recreate "
+ "all snap brick mounts");
+ goto out;
+ }
+
out:
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
+
+int
+glusterd_store_retrieve_quota_version (glusterd_volinfo_t *volinfo)
+{
+ int ret = -1;
+ uint32_t version = 0;
+ char cksum_path[PATH_MAX] = {0,};
+ char path[PATH_MAX] = {0,};
+ char *version_str = NULL;
+ char *tmp = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ gf_store_handle_t *handle = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ GLUSTERD_GET_VOLUME_DIR (path, volinfo, conf);
+ snprintf (cksum_path, sizeof (cksum_path), "%s/%s", path,
+ GLUSTERD_VOL_QUOTA_CKSUM_FILE);
+
+ ret = gf_store_handle_new (cksum_path, &handle);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get store handle "
+ "for %s", cksum_path);
+ goto out;
+ }
+
+ ret = gf_store_retrieve_value (handle, "version", &version_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Version absent");
+ ret = 0;
+ goto out;
+ }
+
+ version = strtoul (version_str, &tmp, 10);
+ if ((errno == ERANGE) || (errno == EINVAL)) {
+ gf_log (this->name, GF_LOG_DEBUG, "Invalid version number");
+ goto out;
+ }
+ volinfo->quota_conf_version = version;
+ ret = 0;
+
+out:
+ if (version_str)
+ GF_FREE (version_str);
+ gf_store_handle_destroy (handle);
+ return ret;
+}
+
+int
+glusterd_store_save_quota_version_and_cksum (glusterd_volinfo_t *volinfo)
+{
+ int ret = -1;
+ char cksum_path[PATH_MAX] = {0,};
+ char path[PATH_MAX] = {0,};
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ char buf[256] = {0,};
+ int fd = -1;
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ GLUSTERD_GET_VOLUME_DIR (path, volinfo, conf);
+ snprintf (cksum_path, sizeof (cksum_path), "%s/%s", path,
+ GLUSTERD_VOL_QUOTA_CKSUM_FILE);
+
+ fd = open (cksum_path, O_RDWR | O_APPEND | O_CREAT| O_TRUNC, 0600);
+
+ if (-1 == fd) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to open %s,"
+ "Reason: %s", cksum_path, strerror (errno));
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (buf, sizeof (buf)-1, "%u", volinfo->quota_conf_cksum);
+ ret = gf_store_save_value (fd, "cksum", buf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to store cksum");
+ goto out;
+ }
+
+ memset (buf, 0, sizeof (buf));
+ snprintf (buf, sizeof (buf)-1, "%u", volinfo->quota_conf_version);
+ ret = gf_store_save_value (fd, "version", buf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to store version");
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+ if (fd != -1)
+ close (fd);
+ return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-store.h b/xlators/mgmt/glusterd/src/glusterd-store.h
index dd34250d7..63d510cbf 100644
--- a/xlators/mgmt/glusterd/src/glusterd-store.h
+++ b/xlators/mgmt/glusterd/src/glusterd-store.h
@@ -35,35 +35,56 @@ typedef enum glusterd_store_ver_ac_{
} glusterd_volinfo_ver_ac_t;
-#define GLUSTERD_STORE_UUID_KEY "UUID"
-
-#define GLUSTERD_STORE_KEY_VOL_TYPE "type"
-#define GLUSTERD_STORE_KEY_VOL_COUNT "count"
-#define GLUSTERD_STORE_KEY_VOL_STATUS "status"
-#define GLUSTERD_STORE_KEY_VOL_PORT "port"
-#define GLUSTERD_STORE_KEY_VOL_SUB_COUNT "sub_count"
-#define GLUSTERD_STORE_KEY_VOL_STRIPE_CNT "stripe_count"
-#define GLUSTERD_STORE_KEY_VOL_REPLICA_CNT "replica_count"
-#define GLUSTERD_STORE_KEY_VOL_BRICK "brick"
-#define GLUSTERD_STORE_KEY_VOL_VERSION "version"
-#define GLUSTERD_STORE_KEY_VOL_TRANSPORT "transport-type"
-#define GLUSTERD_STORE_KEY_VOL_ID "volume-id"
-#define GLUSTERD_STORE_KEY_RB_STATUS "rb_status"
-#define GLUSTERD_STORE_KEY_RB_SRC_BRICK "rb_src"
-#define GLUSTERD_STORE_KEY_RB_DST_BRICK "rb_dst"
-#define GLUSTERD_STORE_KEY_VOL_DEFRAG "rebalance_status"
-#define GLUSTERD_STORE_KEY_USERNAME "username"
-#define GLUSTERD_STORE_KEY_PASSWORD "password"
-
-#define GLUSTERD_STORE_KEY_BRICK_HOSTNAME "hostname"
-#define GLUSTERD_STORE_KEY_BRICK_PATH "path"
-#define GLUSTERD_STORE_KEY_BRICK_PORT "listen-port"
-#define GLUSTERD_STORE_KEY_BRICK_RDMA_PORT "rdma.listen-port"
+#define GLUSTERD_STORE_UUID_KEY "UUID"
+
+#define GLUSTERD_STORE_KEY_VOL_TYPE "type"
+#define GLUSTERD_STORE_KEY_VOL_COUNT "count"
+#define GLUSTERD_STORE_KEY_VOL_STATUS "status"
+#define GLUSTERD_STORE_KEY_VOL_PORT "port"
+#define GLUSTERD_STORE_KEY_VOL_SUB_COUNT "sub_count"
+#define GLUSTERD_STORE_KEY_VOL_STRIPE_CNT "stripe_count"
+#define GLUSTERD_STORE_KEY_VOL_REPLICA_CNT "replica_count"
+#define GLUSTERD_STORE_KEY_VOL_BRICK "brick"
+#define GLUSTERD_STORE_KEY_VOL_VERSION "version"
+#define GLUSTERD_STORE_KEY_VOL_TRANSPORT "transport-type"
+#define GLUSTERD_STORE_KEY_VOL_ID "volume-id"
+#define GLUSTERD_STORE_KEY_VOL_RESTORED_SNAP "restored_from_snap"
+#define GLUSTERD_STORE_KEY_RB_STATUS "rb_status"
+#define GLUSTERD_STORE_KEY_RB_SRC_BRICK "rb_src"
+#define GLUSTERD_STORE_KEY_RB_DST_BRICK "rb_dst"
+#define GLUSTERD_STORE_KEY_RB_DST_PORT "rb_port"
+#define GLUSTERD_STORE_KEY_VOL_DEFRAG "rebalance_status"
+#define GLUSTERD_STORE_KEY_DEFRAG_OP "rebalance_op"
+#define GLUSTERD_STORE_KEY_USERNAME "username"
+#define GLUSTERD_STORE_KEY_PASSWORD "password"
+#define GLUSTERD_STORE_KEY_PARENT_VOLNAME "parent_volname"
+#define GLUSTERD_STORE_KEY_VOL_OP_VERSION "op-version"
+#define GLUSTERD_STORE_KEY_VOL_CLIENT_OP_VERSION "client-op-version"
+
+#define GLUSTERD_STORE_KEY_SNAP_NAME "name"
+#define GLUSTERD_STORE_KEY_SNAP_ID "snap-id"
+#define GLUSTERD_STORE_KEY_SNAP_DESC "desc"
+#define GLUSTERD_STORE_KEY_SNAP_TIMESTAMP "time-stamp"
+#define GLUSTERD_STORE_KEY_SNAP_STATUS "status"
+#define GLUSTERD_STORE_KEY_SNAP_RESTORED "snap-restored"
+#define GLUSTERD_STORE_KEY_SNAP_MAX_HARD_LIMIT "snap-max-hard-limit"
+#define GLUSTERD_STORE_KEY_SNAP_MAX_SOFT_LIMIT "snap-max-soft-limit"
+
+#define GLUSTERD_STORE_KEY_BRICK_HOSTNAME "hostname"
+#define GLUSTERD_STORE_KEY_BRICK_PATH "path"
+#define GLUSTERD_STORE_KEY_BRICK_PORT "listen-port"
+#define GLUSTERD_STORE_KEY_BRICK_RDMA_PORT "rdma.listen-port"
#define GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED "decommissioned"
+#define GLUSTERD_STORE_KEY_BRICK_VGNAME "vg"
+#define GLUSTERD_STORE_KEY_BRICK_DEVICE_PATH "device_path"
+#define GLUSTERD_STORE_KEY_BRICK_SNAP_STATUS "snap-status"
+#define GLUSTERD_STORE_KEY_BRICK_ID "brick-id"
-#define GLUSTERD_STORE_KEY_PEER_UUID "uuid"
-#define GLUSTERD_STORE_KEY_PEER_HOSTNAME "hostname"
-#define GLUSTERD_STORE_KEY_PEER_STATE "state"
+#define GLUSTERD_STORE_KEY_PEER_UUID "uuid"
+#define GLUSTERD_STORE_KEY_PEER_HOSTNAME "hostname"
+#define GLUSTERD_STORE_KEY_PEER_STATE "state"
+
+#define GLUSTERD_STORE_KEY_VOL_CAPS "caps"
#define glusterd_for_each_entry(entry, dir) \
do {\
@@ -71,6 +92,7 @@ typedef enum glusterd_store_ver_ac_{
if (dir) {\
entry = readdir (dir);\
while (entry && (!strcmp (entry->d_name, ".") ||\
+ !fnmatch ("*.tmp", entry->d_name, 0) ||\
!strcmp (entry->d_name, ".."))) {\
entry = readdir (dir);\
}\
@@ -78,16 +100,6 @@ typedef enum glusterd_store_ver_ac_{
} while (0); \
-typedef enum {
- GD_STORE_SUCCESS,
- GD_STORE_KEY_NULL,
- GD_STORE_VALUE_NULL,
- GD_STORE_KEY_VALUE_NULL,
- GD_STORE_EOF,
- GD_STORE_ENOMEM,
- GD_STORE_STAT_FAILED
-} glusterd_store_op_errno_t;
-
int32_t
glusterd_store_volinfo (glusterd_volinfo_t *volinfo, glusterd_volinfo_ver_ac_t ac);
@@ -95,17 +107,7 @@ int32_t
glusterd_store_delete_volume (glusterd_volinfo_t *volinfo);
int32_t
-glusterd_store_uuid ();
-
-int32_t
-glusterd_store_handle_new (char *path, glusterd_store_handle_t **handle);
-
-int32_t
-glusterd_store_save_value (int fd, char *key, char *value);
-
-int32_t
-glusterd_store_retrieve_value (glusterd_store_handle_t *handle,
- char *key, char **value);
+glusterd_store_delete_snap (glusterd_snap_t *snap);
int32_t
glusterd_retrieve_uuid ();
@@ -117,11 +119,8 @@ int32_t
glusterd_store_delete_peerinfo (glusterd_peerinfo_t *peerinfo);
int32_t
-glusterd_store_delete_brick (glusterd_volinfo_t *volinfo,
- glusterd_brickinfo_t *brickinfo);
-
-int32_t
-glusterd_store_handle_destroy (glusterd_store_handle_t *handle);
+glusterd_store_delete_brick (glusterd_brickinfo_t *brickinfo,
+ char *delete_path);
int32_t
glusterd_restore ();
@@ -134,4 +133,41 @@ glusterd_store_is_valid_brickpath (char *volname, char *brick);
int32_t
glusterd_store_perform_node_state_store (glusterd_volinfo_t *volinfo);
+
+int
+glusterd_retrieve_op_version (xlator_t *this, int *op_version);
+
+int
+glusterd_store_global_info (xlator_t *this);
+
+int32_t
+glusterd_store_retrieve_options (xlator_t *this);
+
+int32_t
+glusterd_store_retrieve_bricks (glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_store_options (xlator_t *this, dict_t *opts);
+
+void
+glusterd_replace_slash_with_hyphen (char *str);
+
+int32_t
+glusterd_store_perform_volume_store (glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_store_create_quota_conf_sh_on_absence (glusterd_volinfo_t *volinfo);
+
+int
+glusterd_store_retrieve_quota_version (glusterd_volinfo_t *volinfo);
+
+int
+glusterd_store_save_quota_version_and_cksum (glusterd_volinfo_t *volinfo);
+
+int32_t
+glusterd_store_snap (glusterd_snap_t *snap);
+
+int32_t
+glusterd_store_update_missed_snaps ();
+
#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.c b/xlators/mgmt/glusterd/src/glusterd-syncop.c
index 6c7bccf95..b36d6f616 100644
--- a/xlators/mgmt/glusterd/src/glusterd-syncop.c
+++ b/xlators/mgmt/glusterd/src/glusterd-syncop.c
@@ -8,7 +8,6 @@
cases as published by the Free Software Foundation.
*/
/* rpc related syncops */
-#include "syncop.h"
#include "rpc-clnt.h"
#include "protocol-common.h"
#include "xdr-generic.h"
@@ -18,11 +17,196 @@
#include "glusterd.h"
#include "glusterd-op-sm.h"
#include "glusterd-utils.h"
+#include "glusterd-locks.h"
+
+extern glusterd_op_info_t opinfo;
+
+void
+gd_synctask_barrier_wait (struct syncargs *args, int count)
+{
+ glusterd_conf_t *conf = THIS->private;
+
+ synclock_unlock (&conf->big_lock);
+ synctask_barrier_wait (args, count);
+ synclock_lock (&conf->big_lock);
+
+ syncbarrier_destroy (&args->barrier);
+}
+
+static void
+gd_mgmt_v3_collate_errors (struct syncargs *args, int op_ret, int op_errno,
+ char *op_errstr, int op_code,
+ glusterd_peerinfo_t *peerinfo, u_char *uuid)
+{
+ char err_str[PATH_MAX] = "Please check log file for details.";
+ char op_err[PATH_MAX] = "";
+ char *peer_str = NULL;
+
+ if (op_ret) {
+ args->op_ret = op_ret;
+ args->op_errno = op_errno;
+
+ if (peerinfo)
+ peer_str = peerinfo->hostname;
+ else
+ peer_str = uuid_utoa (uuid);
+
+ if (op_errstr && strcmp (op_errstr, ""))
+ snprintf (err_str, sizeof(err_str) - 1,
+ "Error: %s", op_errstr);
+
+ switch (op_code) {
+ case GLUSTERD_MGMT_V3_LOCK:
+ {
+ snprintf (op_err, sizeof(op_err) - 1,
+ "Locking failed "
+ "on %s. %s", peer_str, err_str);
+ break;
+ }
+ case GLUSTERD_MGMT_V3_UNLOCK:
+ {
+ snprintf (op_err, sizeof(op_err) - 1,
+ "Unlocking failed "
+ "on %s. %s", peer_str, err_str);
+ break;
+ }
+ }
+
+ if (args->errstr) {
+ snprintf (err_str, sizeof(err_str) - 1,
+ "%s\n%s", args->errstr,
+ op_err);
+ GF_FREE (args->errstr);
+ args->errstr = NULL;
+ } else
+ snprintf (err_str, sizeof(err_str) - 1,
+ "%s", op_err);
+
+ gf_log ("", GF_LOG_ERROR, "%s", op_err);
+ args->errstr = gf_strdup (err_str);
+ }
+
+ return;
+}
+
+static void
+gd_collate_errors (struct syncargs *args, int op_ret, int op_errno,
+ char *op_errstr, int op_code,
+ glusterd_peerinfo_t *peerinfo, u_char *uuid)
+{
+ char err_str[PATH_MAX] = "Please check log file for details.";
+ char op_err[PATH_MAX] = "";
+ int len = -1;
+ char *peer_str = NULL;
+
+ if (op_ret) {
+ args->op_ret = op_ret;
+ args->op_errno = op_errno;
+
+ if (peerinfo)
+ peer_str = peerinfo->hostname;
+ else
+ peer_str = uuid_utoa (uuid);
+
+ if (op_errstr && strcmp (op_errstr, "")) {
+ len = snprintf (err_str, sizeof(err_str) - 1,
+ "Error: %s", op_errstr);
+ err_str[len] = '\0';
+ }
+
+ switch (op_code){
+ case GLUSTERD_MGMT_CLUSTER_LOCK :
+ {
+ len = snprintf (op_err, sizeof(op_err) - 1,
+ "Locking failed on %s. %s",
+ peer_str, err_str);
+ break;
+ }
+ case GLUSTERD_MGMT_CLUSTER_UNLOCK :
+ {
+ len = snprintf (op_err, sizeof(op_err) - 1,
+ "Unlocking failed on %s. %s",
+ peer_str, err_str);
+ break;
+ }
+ case GLUSTERD_MGMT_STAGE_OP :
+ {
+ len = snprintf (op_err, sizeof(op_err) - 1,
+ "Staging failed on %s. %s",
+ peer_str, err_str);
+ break;
+ }
+ case GLUSTERD_MGMT_COMMIT_OP :
+ {
+ len = snprintf (op_err, sizeof(op_err) - 1,
+ "Commit failed on %s. %s",
+ peer_str, err_str);
+ break;
+ }
+ }
+ op_err[len] = '\0';
+
+ if (args->errstr) {
+ len = snprintf (err_str, sizeof(err_str) - 1,
+ "%s\n%s", args->errstr,
+ op_err);
+ GF_FREE (args->errstr);
+ args->errstr = NULL;
+ } else
+ len = snprintf (err_str, sizeof(err_str) - 1,
+ "%s", op_err);
+ err_str[len] = '\0';
+
+ gf_log ("", GF_LOG_ERROR, "%s", op_err);
+ args->errstr = gf_strdup (err_str);
+ }
+
+ return;
+}
+
+void
+gd_syncargs_init (struct syncargs *args, dict_t *op_ctx)
+{
+ args->dict = op_ctx;
+ pthread_mutex_init (&args->lock_dict, NULL);
+}
+
+static void
+gd_stage_op_req_free (gd1_mgmt_stage_op_req *req)
+{
+ if (!req)
+ return;
+
+ GF_FREE (req->buf.buf_val);
+ GF_FREE (req);
+}
+
+static void
+gd_commit_op_req_free (gd1_mgmt_commit_op_req *req)
+{
+ if (!req)
+ return;
+
+ GF_FREE (req->buf.buf_val);
+ GF_FREE (req);
+}
+
+static void
+gd_brick_op_req_free (gd1_mgmt_brick_op_req *req)
+{
+ if (!req)
+ return;
+
+ if (strcmp (req->name, "") != 0)
+ GF_FREE (req->name);
+ GF_FREE (req->input.input_val);
+ GF_FREE (req);
+}
int
-gd_syncop_submit_request (struct rpc_clnt *rpc, void *req,
- void *cookie, rpc_clnt_prog_t *prog,
- int procnum, fop_cbk_fn_t cbkfn, xdrproc_t xdrproc)
+gd_syncop_submit_request (struct rpc_clnt *rpc, void *req, void *local,
+ void *cookie, rpc_clnt_prog_t *prog, int procnum,
+ fop_cbk_fn_t cbkfn, xdrproc_t xdrproc)
{
int ret = -1;
struct iobuf *iobuf = NULL;
@@ -62,7 +246,8 @@ gd_syncop_submit_request (struct rpc_clnt *rpc, void *req,
iov.iov_len = ret;
count = 1;
- frame->local = cookie;
+ frame->local = local;
+ frame->cookie = cookie;
/* Send the msg */
ret = rpc_clnt_submit (rpc, prog, procnum, cbkfn,
@@ -80,145 +265,542 @@ out:
/* Defined in glusterd-rpc-ops.c */
extern struct rpc_clnt_program gd_mgmt_prog;
+extern struct rpc_clnt_program gd_brick_prog;
+extern struct rpc_clnt_program gd_mgmt_v3_prog;
-int32_t
-gd_syncop_mgmt_lock_cbk (struct rpc_req *req, struct iovec *iov,
- int count, void *myframe)
+int
+glusterd_syncop_aggr_rsp_dict (glusterd_op_t op, dict_t *aggr, dict_t *rsp)
{
- struct syncargs *args = NULL;
- gd1_mgmt_cluster_lock_rsp rsp = {{0},};
- int ret = -1;
- call_frame_t *frame = NULL;
+ int ret = 0;
- frame = myframe;
- args = frame->local;
- frame->local = NULL;
+ switch (op) {
+ case GD_OP_REPLACE_BRICK:
+ ret = glusterd_rb_use_rsp_dict (aggr, rsp);
+ if (ret)
+ goto out;
+ break;
- /* initialize */
- args->op_ret = -1;
- args->op_errno = EINVAL;
+ case GD_OP_SYNC_VOLUME:
+ ret = glusterd_sync_use_rsp_dict (aggr, rsp);
+ if (ret)
+ goto out;
+ break;
+
+ case GD_OP_PROFILE_VOLUME:
+ ret = glusterd_profile_volume_use_rsp_dict (aggr, rsp);
+ if (ret)
+ goto out;
+ break;
+
+ case GD_OP_GSYNC_CREATE:
+ break;
+
+ case GD_OP_GSYNC_SET:
+ ret = glusterd_gsync_use_rsp_dict (aggr, rsp, NULL);
+ if (ret)
+ goto out;
+ break;
+
+ case GD_OP_STATUS_VOLUME:
+ ret = glusterd_volume_status_copy_to_op_ctx_dict (aggr, rsp);
+ if (ret)
+ goto out;
+ break;
+
+ case GD_OP_REBALANCE:
+ case GD_OP_DEFRAG_BRICK_VOLUME:
+ ret = glusterd_volume_rebalance_use_rsp_dict (aggr, rsp);
+ if (ret)
+ goto out;
+ break;
+
+ case GD_OP_HEAL_VOLUME:
+ ret = glusterd_volume_heal_use_rsp_dict (aggr, rsp);
+ if (ret)
+ goto out;
+
+ break;
+
+ case GD_OP_CLEARLOCKS_VOLUME:
+ ret = glusterd_use_rsp_dict (aggr, rsp);
+ if (ret)
+ goto out;
+ break;
+
+ case GD_OP_QUOTA:
+ ret = glusterd_volume_quota_copy_to_op_ctx_dict (aggr, rsp);
+ if (ret)
+ goto out;
+ break;
+
+ case GD_OP_SYS_EXEC:
+ ret = glusterd_sys_exec_output_rsp_dict (aggr, rsp);
+ if (ret)
+ goto out;
+ break;
+
+ case GD_OP_SNAP:
+ ret = glusterd_snap_use_rsp_dict (aggr, rsp);
+ if (ret)
+ goto out;
+ break;
+
+ default:
+ break;
+ }
+out:
+ return ret;
+}
+
+int32_t
+gd_syncop_mgmt_v3_lock_cbk_fn (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ struct syncargs *args = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ gd1_mgmt_v3_lock_rsp rsp = {{0},};
+ call_frame_t *frame = NULL;
+ int op_ret = -1;
+ int op_errno = -1;
+
+ GF_ASSERT(req);
+ GF_ASSERT(iov);
+ GF_ASSERT(myframe);
+
+ frame = myframe;
+ args = frame->local;
+ peerinfo = frame->cookie;
+ frame->local = NULL;
+ frame->cookie = NULL;
if (-1 == req->rpc_status) {
- args->op_errno = ENOTCONN;
+ op_errno = ENOTCONN;
goto out;
}
ret = xdr_to_generic (*iov, &rsp,
- (xdrproc_t)xdr_gd1_mgmt_cluster_lock_rsp);
- if (ret < 0) {
+ (xdrproc_t)xdr_gd1_mgmt_v3_lock_rsp);
+ if (ret < 0)
+ goto out;
+
+ uuid_copy (args->uuid, rsp.uuid);
+
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+out:
+ gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+ GLUSTERD_MGMT_V3_LOCK,
+ peerinfo, rsp.uuid);
+ STACK_DESTROY (frame->root);
+ synctask_barrier_wake(args);
+ return 0;
+}
+
+int32_t
+gd_syncop_mgmt_v3_lock_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ gd_syncop_mgmt_v3_lock_cbk_fn);
+}
+
+int
+gd_syncop_mgmt_v3_lock (glusterd_op_t op, dict_t *op_ctx,
+ glusterd_peerinfo_t *peerinfo,
+ struct syncargs *args, uuid_t my_uuid,
+ uuid_t recv_uuid, uuid_t txn_id)
+{
+ int ret = -1;
+ gd1_mgmt_v3_lock_req req = {{0},};
+ glusterd_conf_t *conf = THIS->private;
+
+ GF_ASSERT(op_ctx);
+ GF_ASSERT(peerinfo);
+ GF_ASSERT(args);
+
+ ret = dict_allocate_and_serialize (op_ctx,
+ &req.dict.dict_val,
+ &req.dict.dict_len);
+ if (ret)
+ goto out;
+
+ uuid_copy (req.uuid, my_uuid);
+ uuid_copy (req.txn_id, txn_id);
+ req.op = op;
+ synclock_unlock (&conf->big_lock);
+ ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo,
+ &gd_mgmt_v3_prog,
+ GLUSTERD_MGMT_V3_LOCK,
+ gd_syncop_mgmt_v3_lock_cbk,
+ (xdrproc_t)
+ xdr_gd1_mgmt_v3_lock_req);
+ synclock_lock (&conf->big_lock);
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+gd_syncop_mgmt_v3_unlock_cbk_fn (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ struct syncargs *args = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ gd1_mgmt_v3_unlock_rsp rsp = {{0},};
+ call_frame_t *frame = NULL;
+ int op_ret = -1;
+ int op_errno = -1;
+
+ GF_ASSERT(req);
+ GF_ASSERT(iov);
+ GF_ASSERT(myframe);
+
+ frame = myframe;
+ args = frame->local;
+ peerinfo = frame->cookie;
+ frame->local = NULL;
+ frame->cookie = NULL;
+
+ if (-1 == req->rpc_status) {
+ op_errno = ENOTCONN;
goto out;
}
- args->op_ret = rsp.op_ret;
- args->op_errno = rsp.op_errno;
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_v3_unlock_rsp);
+ if (ret < 0)
+ goto out;
uuid_copy (args->uuid, rsp.uuid);
+ /* Set peer as locked, so we unlock only the locked peers */
+ if (rsp.op_ret == 0)
+ peerinfo->locked = _gf_true;
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
out:
+ gd_mgmt_v3_collate_errors (args, op_ret, op_errno, NULL,
+ GLUSTERD_MGMT_V3_UNLOCK,
+ peerinfo, rsp.uuid);
STACK_DESTROY (frame->root);
+ synctask_barrier_wake(args);
+ return 0;
+}
- __wake (args);
+int32_t
+gd_syncop_mgmt_v3_unlock_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ gd_syncop_mgmt_v3_unlock_cbk_fn);
+}
+
+int
+gd_syncop_mgmt_v3_unlock (dict_t *op_ctx, glusterd_peerinfo_t *peerinfo,
+ struct syncargs *args, uuid_t my_uuid,
+ uuid_t recv_uuid, uuid_t txn_id)
+{
+ int ret = -1;
+ gd1_mgmt_v3_unlock_req req = {{0},};
+ glusterd_conf_t *conf = THIS->private;
+
+ GF_ASSERT(op_ctx);
+ GF_ASSERT(peerinfo);
+ GF_ASSERT(args);
+
+ ret = dict_allocate_and_serialize (op_ctx,
+ &req.dict.dict_val,
+ &req.dict.dict_len);
+ if (ret)
+ goto out;
+
+ uuid_copy (req.uuid, my_uuid);
+ uuid_copy (req.txn_id, txn_id);
+ synclock_unlock (&conf->big_lock);
+ ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo,
+ &gd_mgmt_v3_prog,
+ GLUSTERD_MGMT_V3_UNLOCK,
+ gd_syncop_mgmt_v3_unlock_cbk,
+ (xdrproc_t)
+ xdr_gd1_mgmt_v3_unlock_req);
+ synclock_lock (&conf->big_lock);
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+_gd_syncop_mgmt_lock_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ struct syncargs *args = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ gd1_mgmt_cluster_lock_rsp rsp = {{0},};
+ call_frame_t *frame = NULL;
+ int op_ret = -1;
+ int op_errno = -1;
+
+ frame = myframe;
+ args = frame->local;
+ peerinfo = frame->cookie;
+ frame->local = NULL;
+ frame->cookie = NULL;
+
+ if (-1 == req->rpc_status) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_cluster_lock_rsp);
+ if (ret < 0)
+ goto out;
+
+ uuid_copy (args->uuid, rsp.uuid);
+ /* Set peer as locked, so we unlock only the locked peers */
+ if (rsp.op_ret == 0)
+ peerinfo->locked = _gf_true;
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+out:
+ gd_collate_errors (args, op_ret, op_errno, NULL,
+ GLUSTERD_MGMT_CLUSTER_LOCK, peerinfo, rsp.uuid);
+ STACK_DESTROY (frame->root);
+ synctask_barrier_wake(args);
return 0;
}
+int32_t
+gd_syncop_mgmt_lock_cbk (struct rpc_req *req, struct iovec *iov, int count,
+ void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ _gd_syncop_mgmt_lock_cbk);
+}
int
-gd_syncop_mgmt_lock (struct rpc_clnt *rpc, uuid_t my_uuid, uuid_t recv_uuid)
+gd_syncop_mgmt_lock (glusterd_peerinfo_t *peerinfo, struct syncargs *args,
+ uuid_t my_uuid, uuid_t recv_uuid)
{
- struct syncargs args = {0, };
+ int ret = -1;
gd1_mgmt_cluster_lock_req req = {{0},};
+ glusterd_conf_t *conf = THIS->private;
uuid_copy (req.uuid, my_uuid);
+ synclock_unlock (&conf->big_lock);
+ ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo,
+ &gd_mgmt_prog,
+ GLUSTERD_MGMT_CLUSTER_LOCK,
+ gd_syncop_mgmt_lock_cbk,
+ (xdrproc_t) xdr_gd1_mgmt_cluster_lock_req);
+ synclock_lock (&conf->big_lock);
+ return ret;
+}
- args.op_ret = -1;
- args.op_errno = ENOTCONN;
+int32_t
+_gd_syncop_mgmt_unlock_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ struct syncargs *args = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ gd1_mgmt_cluster_unlock_rsp rsp = {{0},};
+ call_frame_t *frame = NULL;
+ int op_ret = -1;
+ int op_errno = -1;
- GD_SYNCOP (rpc, (&args), gd_syncop_mgmt_lock_cbk,
- &req, &gd_mgmt_prog, GLUSTERD_MGMT_CLUSTER_LOCK,
- xdr_gd1_mgmt_cluster_lock_req);
+ frame = myframe;
+ args = frame->local;
+ peerinfo = frame->cookie;
+ frame->local = NULL;
- if (!args.op_ret)
- uuid_copy (recv_uuid, args.uuid);
+ if (-1 == req->rpc_status) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
- errno = args.op_errno;
- return args.op_ret;
+ ret = xdr_to_generic (*iov, &rsp,
+ (xdrproc_t)xdr_gd1_mgmt_cluster_unlock_rsp);
+ if (ret < 0)
+ goto out;
+ uuid_copy (args->uuid, rsp.uuid);
+
+ peerinfo->locked = _gf_false;
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
+out:
+ gd_collate_errors (args, op_ret, op_errno, NULL,
+ GLUSTERD_MGMT_CLUSTER_UNLOCK, peerinfo, rsp.uuid);
+ STACK_DESTROY (frame->root);
+ synctask_barrier_wake(args);
+ return 0;
}
int32_t
gd_syncop_mgmt_unlock_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- struct syncargs *args = NULL;
- gd1_mgmt_cluster_unlock_rsp rsp = {{0},};
- int ret = -1;
- call_frame_t *frame = NULL;
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ _gd_syncop_mgmt_unlock_cbk);
+}
+
+
+int
+gd_syncop_mgmt_unlock (glusterd_peerinfo_t *peerinfo, struct syncargs *args,
+ uuid_t my_uuid, uuid_t recv_uuid)
+{
+ int ret = -1;
+ gd1_mgmt_cluster_unlock_req req = {{0},};
+ glusterd_conf_t *conf = THIS->private;
+ uuid_copy (req.uuid, my_uuid);
+ synclock_unlock (&conf->big_lock);
+ ret = gd_syncop_submit_request (peerinfo->rpc, &req, args, peerinfo,
+ &gd_mgmt_prog,
+ GLUSTERD_MGMT_CLUSTER_UNLOCK,
+ gd_syncop_mgmt_unlock_cbk,
+ (xdrproc_t) xdr_gd1_mgmt_cluster_lock_req);
+ synclock_lock (&conf->big_lock);
+ return ret;
+}
+
+int32_t
+_gd_syncop_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ int ret = -1;
+ gd1_mgmt_stage_op_rsp rsp = {{0},};
+ struct syncargs *args = NULL;
+ xlator_t *this = NULL;
+ dict_t *rsp_dict = NULL;
+ call_frame_t *frame = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ int op_ret = -1;
+ int op_errno = -1;
+
+ this = THIS;
frame = myframe;
- args = frame->local;
+ args = frame->local;
frame->local = NULL;
- /* initialize */
- args->op_ret = -1;
- args->op_errno = EINVAL;
-
if (-1 == req->rpc_status) {
- args->op_errno = ENOTCONN;
+ op_errno = ENOTCONN;
goto out;
}
ret = xdr_to_generic (*iov, &rsp,
- (xdrproc_t)xdr_gd1_mgmt_cluster_unlock_rsp);
- if (ret < 0) {
+ (xdrproc_t)xdr_gd1_mgmt_stage_op_rsp);
+ if (ret < 0)
goto out;
+
+ if (rsp.dict.dict_len) {
+ /* Unserialize the dictionary */
+ rsp_dict = dict_new ();
+
+ ret = dict_unserialize (rsp.dict.dict_val,
+ rsp.dict.dict_len,
+ &rsp_dict);
+ if (ret < 0) {
+ GF_FREE (rsp.dict.dict_val);
+ goto out;
+ } else {
+ rsp_dict->extra_stdfree = rsp.dict.dict_val;
+ }
}
- args->op_ret = rsp.op_ret;
- args->op_errno = rsp.op_errno;
+ ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL, "Staging response "
+ "for 'Volume %s' received from unknown "
+ "peer: %s", gd_op_list[rsp.op],
+ uuid_utoa (rsp.uuid));
+ goto out;
+ }
uuid_copy (args->uuid, rsp.uuid);
+ if (rsp.op == GD_OP_REPLACE_BRICK || rsp.op == GD_OP_QUOTA) {
+ pthread_mutex_lock (&args->lock_dict);
+ {
+ ret = glusterd_syncop_aggr_rsp_dict (rsp.op, args->dict,
+ rsp_dict);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ "Failed to aggregate response from "
+ " node/brick");
+ }
+ pthread_mutex_unlock (&args->lock_dict);
+ }
+
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
out:
- STACK_DESTROY (frame->root);
+ gd_collate_errors (args, op_ret, op_errno, rsp.op_errstr,
+ GLUSTERD_MGMT_STAGE_OP, peerinfo, rsp.uuid);
- __wake (args);
+ if (rsp_dict)
+ dict_unref (rsp_dict);
+ STACK_DESTROY (frame->root);
+ synctask_barrier_wake(args);
return 0;
}
+int32_t
+gd_syncop_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ _gd_syncop_stage_op_cbk);
+}
+
int
-gd_syncop_mgmt_unlock (struct rpc_clnt *rpc, uuid_t my_uuid, uuid_t recv_uuid)
+gd_syncop_mgmt_stage_op (struct rpc_clnt *rpc, struct syncargs *args,
+ uuid_t my_uuid, uuid_t recv_uuid, int op,
+ dict_t *dict_out, dict_t *op_ctx)
{
- struct syncargs args = {0, };
- gd1_mgmt_cluster_unlock_req req = {{0},};
-
- uuid_copy (req.uuid, my_uuid);
+ gd1_mgmt_stage_op_req *req = NULL;
+ glusterd_conf_t *conf = THIS->private;
+ int ret = -1;
- args.op_ret = -1;
- args.op_errno = ENOTCONN;
+ req = GF_CALLOC (1, sizeof (*req), gf_gld_mt_mop_stage_req_t);
+ if (!req)
+ goto out;
- GD_SYNCOP (rpc, (&args), gd_syncop_mgmt_unlock_cbk,
- &req, &gd_mgmt_prog, GLUSTERD_MGMT_CLUSTER_UNLOCK,
- xdr_gd1_mgmt_cluster_unlock_req);
+ uuid_copy (req->uuid, my_uuid);
+ req->op = op;
- if (!args.op_ret)
- uuid_copy (recv_uuid, args.uuid);
+ ret = dict_allocate_and_serialize (dict_out,
+ &req->buf.buf_val, &req->buf.buf_len);
+ if (ret)
+ goto out;
- errno = args.op_errno;
- return args.op_ret;
+ synclock_unlock (&conf->big_lock);
+ ret = gd_syncop_submit_request (rpc, req, args, NULL, &gd_mgmt_prog,
+ GLUSTERD_MGMT_STAGE_OP,
+ gd_syncop_stage_op_cbk,
+ (xdrproc_t) xdr_gd1_mgmt_stage_op_req);
+ synclock_lock (&conf->big_lock);
+out:
+ gd_stage_op_req_free (req);
+ return ret;
}
int32_t
-gd_syncop_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
+_gd_syncop_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
int count, void *myframe)
{
- struct syncargs *args = NULL;
- gd1_mgmt_stage_op_rsp rsp = {{0},};
+ struct syncargs *args = NULL;
+ gd1_mgmt_brick_op_rsp rsp = {0,};
int ret = -1;
- call_frame_t *frame = NULL;
+ call_frame_t *frame = NULL;
frame = myframe;
args = frame->local;
@@ -234,102 +816,133 @@ gd_syncop_stage_op_cbk (struct rpc_req *req, struct iovec *iov,
}
ret = xdr_to_generic (*iov, &rsp,
- (xdrproc_t)xdr_gd1_mgmt_stage_op_rsp);
- if (ret < 0) {
+ (xdrproc_t)xdr_gd1_mgmt_brick_op_rsp);
+ if (ret < 0)
goto out;
- }
- if (rsp.dict.dict_len) {
- /* Unserialize the dictionary */
+ if (rsp.output.output_len) {
args->dict = dict_new ();
+ if (!args->dict) {
+ ret = -1;
+ args->op_errno = ENOMEM;
+ goto out;
+ }
- ret = dict_unserialize (rsp.dict.dict_val,
- rsp.dict.dict_len,
+ ret = dict_unserialize (rsp.output.output_val,
+ rsp.output.output_len,
&args->dict);
- if (ret < 0) {
- GF_FREE (rsp.dict.dict_val);
+ if (ret < 0)
goto out;
- } else {
- args->dict->extra_stdfree = rsp.dict.dict_val;
- }
}
args->op_ret = rsp.op_ret;
args->op_errno = rsp.op_errno;
-
- uuid_copy (args->uuid, rsp.uuid);
-
args->errstr = gf_strdup (rsp.op_errstr);
out:
- STACK_DESTROY (frame->root);
+ if ((rsp.op_errstr) && (strcmp (rsp.op_errstr, "") != 0))
+ free (rsp.op_errstr);
+ free (rsp.output.output_val);
+ STACK_DESTROY (frame->root);
__wake (args);
return 0;
}
+int32_t
+gd_syncop_brick_op_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ _gd_syncop_brick_op_cbk);
+}
int
-gd_syncop_mgmt_stage_op (struct rpc_clnt *rpc, uuid_t my_uuid, uuid_t recv_uuid,
- int op, dict_t *dict_out, dict_t **dict_in,
+gd_syncop_mgmt_brick_op (struct rpc_clnt *rpc, glusterd_pending_node_t *pnode,
+ int op, dict_t *dict_out, dict_t *op_ctx,
char **errstr)
{
- struct syncargs args = {0, };
- gd1_mgmt_stage_op_req req = {{0},};
- int ret = 0;
-
- uuid_copy (req.uuid, my_uuid);
- req.op = op;
+ struct syncargs args = {0, };
+ gd1_mgmt_brick_op_req *req = NULL;
+ int ret = 0;
+ xlator_t *this = NULL;
+ this = THIS;
args.op_ret = -1;
args.op_errno = ENOTCONN;
- ret = dict_allocate_and_serialize (dict_out,
- &req.buf.buf_val, &req.buf.buf_len);
+ if ((pnode->type == GD_NODE_NFS) ||
+ (pnode->type == GD_NODE_QUOTAD) ||
+ ((pnode->type == GD_NODE_SHD) && (op == GD_OP_STATUS_VOLUME))) {
+ ret = glusterd_node_op_build_payload
+ (op, &req, dict_out);
+
+ } else {
+ ret = glusterd_brick_op_build_payload
+ (op, pnode->node, &req, dict_out);
+
+ }
+
if (ret)
goto out;
- GD_SYNCOP (rpc, (&args), gd_syncop_stage_op_cbk,
- &req, &gd_mgmt_prog, GLUSTERD_MGMT_STAGE_OP,
- xdr_gd1_mgmt_stage_op_req);
+ GD_SYNCOP (rpc, (&args), NULL, gd_syncop_brick_op_cbk, req,
+ &gd_brick_prog, req->op, xdr_gd1_mgmt_brick_op_req);
- if (args.errstr && errstr)
- *errstr = args.errstr;
- else GF_FREE (args.errstr);
+ if (args.errstr) {
+ if ((strlen(args.errstr) > 0) && errstr)
+ *errstr = args.errstr;
+ else
+ GF_FREE (args.errstr);
+ }
- if (args.dict && dict_in)
- *dict_in = args.dict;
- else if (args.dict)
- dict_unref (args.dict);
+ if (GD_OP_STATUS_VOLUME == op) {
+ ret = dict_set_int32 (args.dict, "index", pnode->index);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Error setting index on brick status"
+ " rsp dict");
+ args.op_ret = -1;
+ goto out;
+ }
+ }
+ if (args.op_ret == 0)
+ glusterd_handle_node_rsp (dict_out, pnode->node, op,
+ args.dict, op_ctx, errstr,
+ pnode->type);
- uuid_copy (recv_uuid, args.uuid);
out:
errno = args.op_errno;
+ if (args.dict)
+ dict_unref (args.dict);
+ gd_brick_op_req_free (req);
return args.op_ret;
}
-/*TODO: Need to add syncop for brick ops*/
int32_t
-gd_syncop_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
- int count, void *myframe)
+_gd_syncop_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
{
- struct syncargs *args = NULL;
- gd1_mgmt_commit_op_rsp rsp = {{0},};
- int ret = -1;
- call_frame_t *frame = NULL;
-
+ int ret = -1;
+ gd1_mgmt_commit_op_rsp rsp = {{0},};
+ struct syncargs *args = NULL;
+ xlator_t *this = NULL;
+ dict_t *rsp_dict = NULL;
+ call_frame_t *frame = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ int op_ret = -1;
+ int op_errno = -1;
+ int type = GF_QUOTA_OPTION_TYPE_NONE;
+
+ this = THIS;
frame = myframe;
- args = frame->local;
+ args = frame->local;
frame->local = NULL;
- /* initialize */
- args->op_ret = -1;
- args->op_errno = EINVAL;
-
if (-1 == req->rpc_status) {
- args->op_errno = ENOTCONN;
+ op_errno = ENOTCONN;
goto out;
}
@@ -341,257 +954,728 @@ gd_syncop_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
if (rsp.dict.dict_len) {
/* Unserialize the dictionary */
- args->dict = dict_new ();
+ rsp_dict = dict_new ();
ret = dict_unserialize (rsp.dict.dict_val,
rsp.dict.dict_len,
- &args->dict);
+ &rsp_dict);
if (ret < 0) {
GF_FREE (rsp.dict.dict_val);
goto out;
} else {
- args->dict->extra_stdfree = rsp.dict.dict_val;
+ rsp_dict->extra_stdfree = rsp.dict.dict_val;
}
}
- args->op_ret = rsp.op_ret;
- args->op_errno = rsp.op_errno;
+ ret = glusterd_friend_find (rsp.uuid, NULL, &peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL, "Commit response "
+ "for 'Volume %s' received from unknown "
+ "peer: %s", gd_op_list[rsp.op],
+ uuid_utoa (rsp.uuid));
+ goto out;
+ }
uuid_copy (args->uuid, rsp.uuid);
+ if (rsp.op == GD_OP_QUOTA) {
+ ret = dict_get_int32 (args->dict, "type", &type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "opcode");
+ goto out;
+ }
+ }
- args->errstr = gf_strdup (rsp.op_errstr);
+ if ((rsp.op != GD_OP_QUOTA) || (type == GF_QUOTA_OPTION_TYPE_LIST)) {
+ pthread_mutex_lock (&args->lock_dict);
+ {
+ ret = glusterd_syncop_aggr_rsp_dict (rsp.op, args->dict,
+ rsp_dict);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ "Failed to aggregate response from "
+ " node/brick");
+ }
+ pthread_mutex_unlock (&args->lock_dict);
+ }
+
+ op_ret = rsp.op_ret;
+ op_errno = rsp.op_errno;
out:
- STACK_DESTROY (frame->root);
+ gd_collate_errors (args, op_ret, op_errno, rsp.op_errstr,
+ GLUSTERD_MGMT_COMMIT_OP, peerinfo, rsp.uuid);
+ if (rsp_dict)
+ dict_unref (rsp_dict);
- __wake (args);
+ STACK_DESTROY (frame->root);
+ synctask_barrier_wake(args);
return 0;
}
+int32_t
+gd_syncop_commit_op_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe)
+{
+ return glusterd_big_locked_cbk (req, iov, count, myframe,
+ _gd_syncop_commit_op_cbk);
+}
+
int
-gd_syncop_mgmt_commit_op (struct rpc_clnt *rpc, uuid_t my_uuid, uuid_t recv_uuid,
- int op, dict_t *dict_out, dict_t **dict_in,
- char **errstr)
+gd_syncop_mgmt_commit_op (struct rpc_clnt *rpc, struct syncargs *args,
+ uuid_t my_uuid, uuid_t recv_uuid,
+ int op, dict_t *dict_out, dict_t *op_ctx)
{
- struct syncargs args = {0, };
- gd1_mgmt_commit_op_req req = {{0},};
- int ret = 0;
+ glusterd_conf_t *conf = THIS->private;
+ gd1_mgmt_commit_op_req *req = NULL;
+ int ret = -1;
- uuid_copy (req.uuid, my_uuid);
- req.op = op;
+ req = GF_CALLOC (1, sizeof (*req), gf_gld_mt_mop_commit_req_t);
+ if (!req)
+ goto out;
- args.op_ret = -1;
- args.op_errno = ENOTCONN;
+ uuid_copy (req->uuid, my_uuid);
+ req->op = op;
ret = dict_allocate_and_serialize (dict_out,
- &req.buf.buf_val, &req.buf.buf_len);
+ &req->buf.buf_val, &req->buf.buf_len);
if (ret)
goto out;
- GD_SYNCOP (rpc, (&args), gd_syncop_commit_op_cbk,
- &req, &gd_mgmt_prog, GLUSTERD_MGMT_COMMIT_OP,
- xdr_gd1_mgmt_commit_op_req);
+ synclock_unlock (&conf->big_lock);
+ ret = gd_syncop_submit_request (rpc, req, args, NULL, &gd_mgmt_prog,
+ GLUSTERD_MGMT_COMMIT_OP ,
+ gd_syncop_commit_op_cbk,
+ (xdrproc_t) xdr_gd1_mgmt_commit_op_req);
+ synclock_lock (&conf->big_lock);
+out:
+ gd_commit_op_req_free (req);
+ return ret;
+}
- if (args.errstr && errstr)
- *errstr = args.errstr;
- else GF_FREE (args.errstr);
- if (args.dict && dict_in)
- *dict_in = args.dict;
- else if (args.dict)
- dict_unref (args.dict);
+int
+gd_build_peers_list (struct list_head *peers, struct list_head *xact_peers,
+ glusterd_op_t op)
+{
+ glusterd_peerinfo_t *peerinfo = NULL;
+ int npeers = 0;
- uuid_copy (recv_uuid, args.uuid);
-
-out:
- errno = args.op_errno;
- return args.op_ret;
+ list_for_each_entry (peerinfo, peers, uuid_list) {
+ if (!peerinfo->connected)
+ continue;
+ if (op != GD_OP_SYNC_VOLUME &&
+ peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)
+ continue;
+ list_add_tail (&peerinfo->op_peers_list, xact_peers);
+ npeers++;
+ }
+ return npeers;
}
+int
+gd_lock_op_phase (glusterd_conf_t *conf, glusterd_op_t op, dict_t *op_ctx,
+ char **op_errstr, int npeers, uuid_t txn_id)
+{
+ int ret = -1;
+ int peer_cnt = 0;
+ uuid_t peer_uuid = {0};
+ xlator_t *this = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ struct syncargs args = {0};
+ struct list_head *peers = NULL;
+
+ peers = &conf->xaction_peers;
+
+ if (!npeers) {
+ ret = 0;
+ goto out;
+ }
+
+ this = THIS;
+ synctask_barrier_init((&args));
+ peer_cnt = 0;
+ list_for_each_entry (peerinfo, peers, op_peers_list) {
+ if (conf->op_version < GD_OP_VERSION_4) {
+ /* Reset lock status */
+ peerinfo->locked = _gf_false;
+ gd_syncop_mgmt_lock (peerinfo, &args,
+ MY_UUID, peer_uuid);
+ } else
+ gd_syncop_mgmt_v3_lock (op, op_ctx, peerinfo, &args,
+ MY_UUID, peer_uuid, txn_id);
+ peer_cnt++;
+ }
+ gd_synctask_barrier_wait((&args), peer_cnt);
-static int
-glusterd_syncop_aggr_rsp_dict (glusterd_op_t op, dict_t *aggr, dict_t *rsp,
- char *op_errstr)
+ if (args.op_ret) {
+ if (args.errstr)
+ *op_errstr = gf_strdup (args.errstr);
+ else {
+ ret = gf_asprintf (op_errstr, "Another transaction "
+ "could be in progress. Please try "
+ "again after sometime.");
+ if (ret == -1)
+ *op_errstr = NULL;
+
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to acquire lock");
+
+ }
+ }
+
+ ret = args.op_ret;
+
+ gf_log (this->name, GF_LOG_DEBUG, "Sent lock op req for 'Volume %s' "
+ "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret);
+out:
+ return ret;
+}
+
+int
+gd_stage_op_phase (struct list_head *peers, glusterd_op_t op, dict_t *op_ctx,
+ dict_t *req_dict, char **op_errstr, int npeers)
{
- int ret = -1;
+ int ret = -1;
+ int peer_cnt = 0;
+ dict_t *rsp_dict = NULL;
+ char *hostname = NULL;
+ xlator_t *this = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ uuid_t tmp_uuid = {0};
+ char *errstr = NULL;
+ struct syncargs args = {0};
+
+ this = THIS;
+ rsp_dict = dict_new ();
+ if (!rsp_dict)
+ goto out;
- switch (op) {
- case GD_OP_REPLACE_BRICK:
- ret = glusterd_rb_use_rsp_dict (aggr, rsp);
- if (ret)
- goto out;
- break;
+ ret = glusterd_op_stage_validate (op, req_dict, op_errstr, rsp_dict);
+ if (ret) {
+ hostname = "localhost";
+ goto stage_done;
+ }
- case GD_OP_SYNC_VOLUME:
- ret = glusterd_sync_use_rsp_dict (aggr, rsp);
- if (ret)
+ if ((op == GD_OP_REPLACE_BRICK || op == GD_OP_QUOTA)) {
+ ret = glusterd_syncop_aggr_rsp_dict (op, op_ctx, rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ "Failed to aggregate response from node/brick");
goto out;
- break;
+ }
+ }
+ dict_unref (rsp_dict);
+ rsp_dict = NULL;
- case GD_OP_PROFILE_VOLUME:
- ret = glusterd_profile_volume_use_rsp_dict (aggr, rsp);
- if (ret)
- goto out;
- break;
+stage_done:
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, LOGSTR_STAGE_FAIL,
+ gd_op_list[op], hostname, (*op_errstr) ? ":" : " ",
+ (*op_errstr) ? *op_errstr : " ");
+ if (*op_errstr == NULL)
+ gf_asprintf (op_errstr, OPERRSTR_STAGE_FAIL, hostname);
+ goto out;
+ }
- case GD_OP_GSYNC_SET:
- ret = glusterd_gsync_use_rsp_dict (aggr, rsp, op_errstr);
- if (ret)
- goto out;
- break;
+ if (!npeers) {
+ ret = 0;
+ goto out;
+ }
- case GD_OP_STATUS_VOLUME:
- ret = glusterd_volume_status_copy_to_op_ctx_dict (aggr, rsp);
- if (ret)
- goto out;
- break;
+ gd_syncargs_init (&args, op_ctx);
+ synctask_barrier_init((&args));
+ peer_cnt = 0;
+ list_for_each_entry (peerinfo, peers, op_peers_list) {
+ ret = gd_syncop_mgmt_stage_op (peerinfo->rpc, &args,
+ MY_UUID, tmp_uuid,
+ op, req_dict, op_ctx);
+ peer_cnt++;
+ }
- case GD_OP_REBALANCE:
- case GD_OP_DEFRAG_BRICK_VOLUME:
- ret = glusterd_volume_rebalance_use_rsp_dict (aggr, rsp);
+ gf_log (this->name, GF_LOG_DEBUG, "Sent stage op req for 'Volume %s' "
+ "to %d peers", gd_op_list[op], peer_cnt);
+
+ gd_synctask_barrier_wait((&args), peer_cnt);
+
+ if (args.errstr)
+ *op_errstr = gf_strdup (args.errstr);
+ else if (dict_get_str (op_ctx, "errstr", &errstr) == 0)
+ *op_errstr = gf_strdup (errstr);
+
+ ret = args.op_ret;
+
+out:
+ if ((ret == 0) && (op == GD_OP_QUOTA)) {
+ ret = glusterd_validate_and_set_gfid (op_ctx, req_dict,
+ op_errstr);
if (ret)
goto out;
- break;
+ }
- case GD_OP_HEAL_VOLUME:
- ret = glusterd_volume_heal_use_rsp_dict (aggr, rsp);
- if (ret)
+ if (rsp_dict)
+ dict_unref (rsp_dict);
+ return ret;
+}
+
+int
+gd_commit_op_phase (struct list_head *peers, glusterd_op_t op, dict_t *op_ctx,
+ dict_t *req_dict, char **op_errstr, int npeers)
+{
+ dict_t *rsp_dict = NULL;
+ int peer_cnt = -1;
+ int ret = -1;
+ char *hostname = NULL;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ xlator_t *this = NULL;
+ uuid_t tmp_uuid = {0};
+ char *errstr = NULL;
+ struct syncargs args = {0};
+ int type = GF_QUOTA_OPTION_TYPE_NONE;
+
+ this = THIS;
+ rsp_dict = dict_new ();
+ if (!rsp_dict) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_op_commit_perform (op, req_dict, op_errstr, rsp_dict);
+ if (ret) {
+ hostname = "localhost";
+ goto commit_done;
+ }
+
+ if (op == GD_OP_QUOTA) {
+ ret = dict_get_int32 (op_ctx, "type", &type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "opcode");
goto out;
+ }
+ }
- break;
+ if (((op == GD_OP_QUOTA) && (type == GF_QUOTA_OPTION_TYPE_LIST)) ||
+ ((op != GD_OP_SYNC_VOLUME) && (op != GD_OP_QUOTA))) {
- default:
- break;
+ ret = glusterd_syncop_aggr_rsp_dict (op, op_ctx,
+ rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s", "Failed to aggregate "
+ "response from node/brick");
+ goto out;
+ }
+ }
+
+ dict_unref (rsp_dict);
+ rsp_dict = NULL;
+
+commit_done:
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, LOGSTR_COMMIT_FAIL,
+ gd_op_list[op], hostname, (*op_errstr) ? ":" : " ",
+ (*op_errstr) ? *op_errstr : " ");
+ if (*op_errstr == NULL)
+ gf_asprintf (op_errstr, OPERRSTR_COMMIT_FAIL,
+ hostname);
+ goto out;
}
+
+ if (!npeers) {
+ ret = 0;
+ goto out;
+ }
+
+ gd_syncargs_init (&args, op_ctx);
+ synctask_barrier_init((&args));
+ peer_cnt = 0;
+ list_for_each_entry (peerinfo, peers, op_peers_list) {
+ ret = gd_syncop_mgmt_commit_op (peerinfo->rpc, &args,
+ MY_UUID, tmp_uuid,
+ op, req_dict, op_ctx);
+ peer_cnt++;
+ }
+ gd_synctask_barrier_wait((&args), peer_cnt);
+ ret = args.op_ret;
+ if (args.errstr)
+ *op_errstr = gf_strdup (args.errstr);
+ else if (dict_get_str (op_ctx, "errstr", &errstr) == 0)
+ *op_errstr = gf_strdup (errstr);
+
+ gf_log (this->name, GF_LOG_DEBUG, "Sent commit op req for 'Volume %s' "
+ "to %d peers", gd_op_list[op], peer_cnt);
out:
+ if (!ret)
+ glusterd_op_modify_op_ctx (op, op_ctx);
+
+ if (rsp_dict)
+ dict_unref (rsp_dict);
+
+ GF_FREE (args.errstr);
+ args.errstr = NULL;
+
return ret;
}
int
-gd_sync_task_begin (dict_t *op_ctx, char **op_errstr)
+gd_unlock_op_phase (glusterd_conf_t *conf, glusterd_op_t op, int *op_ret,
+ rpcsvc_request_t *req, dict_t *op_ctx, char *op_errstr,
+ int npeers, char *volname, gf_boolean_t is_acquired,
+ uuid_t txn_id)
{
- int ret = -1;
- dict_t *req_dict = NULL;
- dict_t *rsp_dict = NULL;
- glusterd_peerinfo_t *peerinfo = NULL;
- glusterd_conf_t *conf = NULL;
- uuid_t tmp_uuid = {0,};
- glusterd_op_t op = 0;
- int32_t tmp_op = 0;
- gf_boolean_t local_locked = _gf_false;
-
- conf = THIS->private;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_peerinfo_t *tmp = NULL;
+ uuid_t tmp_uuid = {0};
+ int peer_cnt = 0;
+ int ret = -1;
+ xlator_t *this = NULL;
+ struct syncargs args = {0};
+ struct list_head *peers = NULL;
+
+ peers = &conf->xaction_peers;
+
+ if (!npeers) {
+ ret = 0;
+ goto out;
+ }
- ret = dict_get_int32 (op_ctx, GD_SYNC_OPCODE_KEY, &tmp_op);
- if (ret)
+ /* If the lock has not been held during this
+ * transaction, do not send unlock requests */
+ if (!is_acquired) {
+ ret = 0;
goto out;
+ }
- op = tmp_op;
+ this = THIS;
+ synctask_barrier_init((&args));
+ peer_cnt = 0;
+ if (conf->op_version < GD_OP_VERSION_4) {
+ list_for_each_entry_safe (peerinfo, tmp, peers, op_peers_list) {
+ /* Only unlock peers that were locked */
+ if (peerinfo->locked) {
+ gd_syncop_mgmt_unlock (peerinfo, &args,
+ MY_UUID, tmp_uuid);
+ peer_cnt++;
+ list_del_init (&peerinfo->op_peers_list);
+ }
+ }
+ } else {
+ if (volname) {
+ list_for_each_entry_safe (peerinfo, tmp,
+ peers, op_peers_list) {
+ gd_syncop_mgmt_v3_unlock (op_ctx, peerinfo,
+ &args, MY_UUID,
+ tmp_uuid, txn_id);
+ peer_cnt++;
+ list_del_init (&peerinfo->op_peers_list);
+ }
+ }
+ }
+ gd_synctask_barrier_wait((&args), peer_cnt);
- ret = -1;
+ ret = args.op_ret;
+
+ gf_log (this->name, GF_LOG_DEBUG, "Sent unlock op req for 'Volume %s' "
+ "to %d peers. Returning %d", gd_op_list[op], peer_cnt, ret);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to unlock "
+ "on some peer(s)");
+ }
+
+out:
+ /* If unlock failed, and op_ret was previously set
+ * priority is given to the op_ret. If op_ret was
+ * not set, and unlock failed, then set op_ret */
+ if (!*op_ret)
+ *op_ret = ret;
+
+ if (is_acquired) {
+ /* Based on the op-version,
+ * we release the cluster or mgmt_v3 lock
+ * and clear the op */
+
+ glusterd_op_clear_op (op);
+ if (conf->op_version < GD_OP_VERSION_4)
+ glusterd_unlock (MY_UUID);
+ else {
+ if (volname) {
+ ret = glusterd_mgmt_v3_unlock (volname, MY_UUID,
+ "vol");
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to release lock for %s",
+ volname);
+ }
+ }
+ }
+
+ if (!*op_ret)
+ *op_ret = ret;
+
+ return 0;
+}
+
+int
+gd_get_brick_count (struct list_head *bricks)
+{
+ glusterd_pending_node_t *pending_node = NULL;
+ int npeers = 0;
+ list_for_each_entry (pending_node, bricks, list) {
+ npeers++;
+ }
+ return npeers;
+}
+
+int
+gd_brick_op_phase (glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+ char **op_errstr)
+{
+ glusterd_pending_node_t *pending_node = NULL;
+ struct list_head selected = {0,};
+ xlator_t *this = NULL;
+ int brick_count = 0;
+ int ret = -1;
+ rpc_clnt_t *rpc = NULL;
+ dict_t *rsp_dict = NULL;
+ glusterd_conf_t *conf = NULL;
+
+ this = THIS;
+ conf = this->private;
rsp_dict = dict_new ();
- if (!rsp_dict)
+ if (!rsp_dict) {
+ ret = -1;
goto out;
+ }
- /* Lock everything */
- ret = glusterd_lock (MY_UUID);
- if (ret)
+ INIT_LIST_HEAD (&selected);
+ ret = glusterd_op_bricks_select (op, req_dict, op_errstr, &selected, rsp_dict);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ (*op_errstr)? *op_errstr: "Brick op failed. Check "
+ "glusterd log file for more details.");
goto out;
- /* successful lock in local node */
- local_locked = _gf_true;
+ }
- list_for_each_entry (peerinfo, &conf->peers, uuid_list) {
- ret = gd_syncop_mgmt_lock (peerinfo->rpc,
- MY_UUID, tmp_uuid);
+ if (op == GD_OP_HEAL_VOLUME) {
+ ret = glusterd_syncop_aggr_rsp_dict (op, op_ctx, rsp_dict);
if (ret)
goto out;
- /* TODO: Only on lock successful nodes it should unlock */
}
+ dict_unref (rsp_dict);
+ rsp_dict = NULL;
+
+ brick_count = 0;
+ list_for_each_entry (pending_node, &selected, list) {
+ rpc = glusterd_pending_node_get_rpc (pending_node);
+ if (!rpc) {
+ if (pending_node->type == GD_NODE_REBALANCE) {
+ ret = 0;
+ glusterd_defrag_volume_node_rsp (req_dict,
+ NULL, op_ctx);
+ goto out;
+ }
- ret = glusterd_op_build_payload (&req_dict, op_errstr, op_ctx);
- if (ret)
+ ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "Brick Op failed "
+ "due to rpc failure.");
+ goto out;
+ }
+ ret = gd_syncop_mgmt_brick_op (rpc, pending_node, op, req_dict,
+ op_ctx, op_errstr);
+ if (ret)
+ goto out;
+
+ brick_count++;
+ }
+
+ ret = 0;
+out:
+ if (rsp_dict)
+ dict_unref (rsp_dict);
+ gf_log (this->name, GF_LOG_DEBUG, "Sent op req to %d bricks",
+ brick_count);
+ return ret;
+}
+
+void
+gd_sync_task_begin (dict_t *op_ctx, rpcsvc_request_t * req)
+{
+ int ret = -1;
+ int op_ret = -1;
+ int npeers = 0;
+ dict_t *req_dict = NULL;
+ glusterd_conf_t *conf = NULL;
+ glusterd_op_t op = 0;
+ int32_t tmp_op = 0;
+ char *op_errstr = NULL;
+ char *tmp = NULL;
+ char *volname = NULL;
+ xlator_t *this = NULL;
+ gf_boolean_t is_acquired = _gf_false;
+ uuid_t *txn_id = NULL;
+ glusterd_op_info_t txn_opinfo;
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ ret = dict_get_int32 (op_ctx, GD_SYNC_OPCODE_KEY, &tmp_op);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get volume "
+ "operation");
goto out;
+ }
+ op = tmp_op;
- /* stage op */
- ret = glusterd_op_stage_validate (op, req_dict, op_errstr, rsp_dict);
+ /* Generate a transaction-id for this operation and
+ * save it in the dict */
+ ret = glusterd_generate_txn_id (op_ctx, &txn_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to generate transaction id");
+ goto out;
+ }
+
+ /* Save opinfo for this transaction with the transaction id */
+ glusterd_txn_opinfo_init (&txn_opinfo, NULL, &op, NULL, NULL);
+ ret = glusterd_set_txn_opinfo (txn_id, &txn_opinfo);
if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set transaction's opinfo");
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Transaction ID : %s", uuid_utoa (*txn_id));
+
+ opinfo = txn_opinfo;
+
+ /* Save the MY_UUID as the originator_uuid */
+ ret = glusterd_set_originator_uuid (op_ctx);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set originator_uuid.");
goto out;
+ }
- list_for_each_entry (peerinfo, &conf->peers, uuid_list) {
- ret = gd_syncop_mgmt_stage_op (peerinfo->rpc,
- MY_UUID, tmp_uuid,
- op, req_dict, &rsp_dict,
- op_errstr);
+ /* Based on the op_version, acquire a cluster or mgmt_v3 lock */
+ if (conf->op_version < GD_OP_VERSION_4) {
+ ret = glusterd_lock (MY_UUID);
if (ret) {
- if (*op_errstr)
- ret = dict_set_dynstr (req_dict, "error",
- *op_errstr);
-
- ret = -1;
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to acquire lock");
+ gf_asprintf (&op_errstr,
+ "Another transaction is in progress. "
+ "Please try again after sometime.");
goto out;
}
+ } else {
- if (op == GD_OP_REPLACE_BRICK)
- (void) glusterd_syncop_aggr_rsp_dict (op, op_ctx,
- rsp_dict,
- *op_errstr);
+ /* If no volname is given as a part of the command, locks will
+ * not be held */
+ ret = dict_get_str (op_ctx, "volname", &tmp);
+ if (ret) {
+ gf_log ("", GF_LOG_DEBUG, "Failed to get volume "
+ "name");
+ goto local_locking_done;
+ } else {
+ /* Use a copy of volname, as cli response will be
+ * sent before the unlock, and the volname in the
+ * dict, might be removed */
+ volname = gf_strdup (tmp);
+ if (!volname)
+ goto out;
+ }
- if (rsp_dict)
- dict_unref (rsp_dict);
+ ret = glusterd_mgmt_v3_lock (volname, MY_UUID, "vol");
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to acquire lock for %s", volname);
+ gf_asprintf (&op_errstr,
+ "Another transaction is in progress "
+ "for %s. Please try again after sometime.",
+ volname);
+ goto out;
+ }
}
- /* commit op */
- ret = glusterd_op_commit_perform (op, req_dict, op_errstr, rsp_dict);
- if (ret)
- goto out;
+ is_acquired = _gf_true;
- list_for_each_entry (peerinfo, &conf->peers, uuid_list) {
- ret = gd_syncop_mgmt_commit_op (peerinfo->rpc,
- MY_UUID, tmp_uuid,
- op, req_dict, &rsp_dict,
- op_errstr);
- if (ret) {
- if (*op_errstr)
- ret = dict_set_dynstr (req_dict, "error",
- *op_errstr);
+local_locking_done:
- ret = -1;
+ INIT_LIST_HEAD (&conf->xaction_peers);
+
+ npeers = gd_build_peers_list (&conf->peers, &conf->xaction_peers, op);
+
+ /* If no volname is given as a part of the command, locks will
+ * not be held */
+ if (volname || (conf->op_version < GD_OP_VERSION_4)) {
+ ret = gd_lock_op_phase (conf, op, op_ctx, &op_errstr,
+ npeers, *txn_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Locking Peers Failed.");
goto out;
}
- (void) glusterd_syncop_aggr_rsp_dict (op, op_ctx, rsp_dict,
- *op_errstr);
- if (rsp_dict)
- dict_unref (rsp_dict);
}
+ ret = glusterd_op_build_payload (&req_dict, &op_errstr, op_ctx);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, LOGSTR_BUILD_PAYLOAD,
+ gd_op_list[op]);
+ if (op_errstr == NULL)
+ gf_asprintf (&op_errstr, OPERRSTR_BUILD_PAYLOAD);
+ goto out;
+ }
+
+ ret = gd_stage_op_phase (&conf->xaction_peers, op, op_ctx, req_dict,
+ &op_errstr, npeers);
+ if (ret)
+ goto out;
+
+ ret = gd_brick_op_phase (op, op_ctx, req_dict, &op_errstr);
+ if (ret)
+ goto out;
+
+ ret = gd_commit_op_phase (&conf->xaction_peers, op, op_ctx, req_dict,
+ &op_errstr, npeers);
+ if (ret)
+ goto out;
+
ret = 0;
out:
- if (local_locked) {
- /* unlock everything as we help successful local lock */
- list_for_each_entry (peerinfo, &conf->peers, uuid_list) {
- /* No need to check the error code, as it is possible
- that if 'lock' on few nodes failed, it would come
- here, and unlock would fail on nodes where lock
- never was sent */
- gd_syncop_mgmt_unlock (peerinfo->rpc,
- MY_UUID, tmp_uuid);
- }
-
- /* Local node should be the one to be locked first,
- unlocked last to prevent races */
- glusterd_unlock (MY_UUID);
+ op_ret = ret;
+ if (txn_id) {
+ (void) gd_unlock_op_phase (conf, op, &op_ret, req,
+ op_ctx, op_errstr,
+ npeers, volname,
+ is_acquired, *txn_id);
+
+ /* Clearing the transaction opinfo */
+ ret = glusterd_clear_txn_opinfo (txn_id);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to clear transaction's "
+ "opinfo for transaction ID : %s",
+ uuid_utoa (*txn_id));
}
+ glusterd_op_send_cli_response (op, op_ret, 0, req, op_ctx, op_errstr);
+
+ if (volname)
+ GF_FREE (volname);
+
if (req_dict)
dict_unref (req_dict);
- if (rsp_dict)
- dict_unref (rsp_dict);
+ if (op_errstr) {
+ GF_FREE (op_errstr);
+ op_errstr = NULL;
+ }
- return ret;
+ return;
}
int32_t
@@ -599,8 +1683,6 @@ glusterd_op_begin_synctask (rpcsvc_request_t *req, glusterd_op_t op,
void *dict)
{
int ret = 0;
- int op_ret = 0;
- char *op_errstr = NULL;
ret = dict_set_int32 (dict, GD_SYNC_OPCODE_KEY, op);
if (ret) {
@@ -609,15 +1691,9 @@ glusterd_op_begin_synctask (rpcsvc_request_t *req, glusterd_op_t op,
goto out;
}
- op_ret = gd_sync_task_begin (dict, &op_errstr);
- glusterd_op_send_cli_response (op, op_ret, 0, req, NULL,
- op_errstr);
+ gd_sync_task_begin (dict, req);
ret = 0;
out:
- if (dict)
- dict_unref (dict);
- if (op_errstr)
- GF_FREE (op_errstr);
return ret;
}
diff --git a/xlators/mgmt/glusterd/src/glusterd-syncop.h b/xlators/mgmt/glusterd/src/glusterd-syncop.h
index 268f3bf23..e83ea2f4c 100644
--- a/xlators/mgmt/glusterd/src/glusterd-syncop.h
+++ b/xlators/mgmt/glusterd/src/glusterd-syncop.h
@@ -11,40 +11,61 @@
#define __RPC_SYNCOP_H
#include "syncop.h"
-
+#include "glusterd-sm.h"
+#include "glusterd.h"
#define GD_SYNC_OPCODE_KEY "sync-mgmt-operation"
/* gd_syncop_* */
-#define GD_SYNCOP(rpc, stb, cbk, req, prog, procnum, xdrproc) do { \
- int ret = 0; \
- struct synctask *task = NULL; \
- task = synctask_get (); \
- stb->task = task; \
- \
- ret = gd_syncop_submit_request (rpc, req, stb, \
- prog, procnum, cbk, \
- (xdrproc_t)xdrproc); \
- if (!ret) \
- synctask_yield (stb->task); \
+#define GD_SYNCOP(rpc, stb, cookie, cbk, req, prog, procnum, xdrproc) do { \
+ int ret = 0; \
+ struct synctask *task = NULL; \
+ glusterd_conf_t *conf= THIS->private; \
+ \
+ task = synctask_get (); \
+ stb->task = task; \
+ \
+ /*This is to ensure that the brick_op_cbk is able to \
+ * take the big lock*/ \
+ synclock_unlock (&conf->big_lock); \
+ ret = gd_syncop_submit_request (rpc, req, stb, cookie, \
+ prog, procnum, cbk, \
+ (xdrproc_t)xdrproc); \
+ if (!ret) \
+ synctask_yield (stb->task); \
+ synclock_lock (&conf->big_lock); \
} while (0)
-int gd_syncop_submit_request (struct rpc_clnt *rpc, void *req,
- void *cookie, rpc_clnt_prog_t *prog,
- int procnum, fop_cbk_fn_t cbkfn,
- xdrproc_t xdrproc);
+int gd_syncop_submit_request (struct rpc_clnt *rpc, void *req, void *local,
+ void *cookie, rpc_clnt_prog_t *prog, int procnum,
+ fop_cbk_fn_t cbkfn, xdrproc_t xdrproc);
+
+
+int gd_syncop_mgmt_lock (glusterd_peerinfo_t *peerinfo, struct syncargs *arg,
+ uuid_t my_uuid, uuid_t recv_uuid);
+int gd_syncop_mgmt_unlock (glusterd_peerinfo_t *peerinfo, struct syncargs *arg,
+ uuid_t my_uuid, uuid_t recv_uuid);
+int gd_syncop_mgmt_stage_op (struct rpc_clnt *rpc, struct syncargs *arg,
+ uuid_t my_uuid, uuid_t recv_uuid, int op,
+ dict_t *dict_out, dict_t *op_ctx);
+int gd_syncop_mgmt_commit_op (struct rpc_clnt *rpc, struct syncargs *arg,
+ uuid_t my_uuid, uuid_t recv_uuid, int op,
+ dict_t *dict_out, dict_t *op_ctx);
+
+void
+gd_synctask_barrier_wait (struct syncargs *args, int count);
+int
+gd_build_peers_list (struct list_head *peers, struct list_head *xact_peers,
+ glusterd_op_t op);
+int
+gd_brick_op_phase (glusterd_op_t op, dict_t *op_ctx, dict_t *req_dict,
+ char **op_errstr);
-int gd_syncop_mgmt_lock (struct rpc_clnt *rpc, uuid_t my_uuid,
- uuid_t recv_uuid);
-int gd_syncop_mgmt_unlock (struct rpc_clnt *rpc, uuid_t my_uuid,
- uuid_t recv_uuid);
-int gd_syncop_mgmt_stage_op (struct rpc_clnt *rpc, uuid_t my_uuid,
- uuid_t recv_uuid, int op, dict_t *dict_out,
- dict_t **dict_in, char **errstr);
-int gd_syncop_mgmt_commit_op (struct rpc_clnt *rpc, uuid_t my_uuid,
- uuid_t recv_uuid, int op, dict_t *dict_out,
- dict_t **dict_in, char **errstr);
+int
+glusterd_syncop_aggr_rsp_dict (glusterd_op_t op, dict_t *aggr, dict_t *rsp);
+void
+gd_syncargs_init (struct syncargs *args, dict_t *op_ctx);
#endif /* __RPC_SYNCOP_H */
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index d9f589b20..7883a98bf 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -11,9 +11,14 @@
#define _CONFIG_H
#include "config.h"
#endif
-#include <openssl/md5.h>
#include <inttypes.h>
+#if !defined(__NetBSD__) && !defined(GF_DARWIN_HOST_OS)
+#include <mntent.h>
+#else
+#include "mntent_compat.h"
+#endif
+
#include "globals.h"
#include "glusterfs.h"
#include "compat.h"
@@ -23,6 +28,7 @@
#include "timer.h"
#include "defaults.h"
#include "compat.h"
+#include "syncop.h"
#include "run.h"
#include "compat-errno.h"
#include "statedump.h"
@@ -35,22 +41,24 @@
#include "glusterd-store.h"
#include "glusterd-volgen.h"
#include "glusterd-pmap.h"
+#include "glusterfs-acl.h"
+#include "glusterd-syncop.h"
+#include "glusterd-locks.h"
#include "xdr-generic.h"
#include <sys/resource.h>
#include <inttypes.h>
#include <signal.h>
#include <sys/types.h>
-#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <rpc/pmap_clnt.h>
#include <unistd.h>
#include <fnmatch.h>
#include <sys/statvfs.h>
-
-#ifdef GF_LINUX_HOST_OS
-#include <mntent.h>
+#include <ifaddrs.h>
+#ifdef HAVE_BD_XLATOR
+#include <lvm2app.h>
#endif
#ifdef GF_SOLARIS_HOST_OS
@@ -68,20 +76,23 @@
#define NLMV4_VERSION 4
#define NLMV1_VERSION 1
-char *glusterd_sock_dir = "/var/run";
+#define CEILING_POS(X) (((X)-(int)(X)) > 0 ? (int)((X)+1) : (int)(X))
+
static glusterd_lock_t lock;
-static void
-md5_wrapper(const unsigned char *data, size_t len, char *md5)
+char*
+gd_peer_uuid_str (glusterd_peerinfo_t *peerinfo)
{
- unsigned short i = 0;
- unsigned short lim = MD5_DIGEST_LENGTH*2+1;
- unsigned char scratch[MD5_DIGEST_LENGTH] = {0,};
- MD5(data, len, scratch);
- for (; i < MD5_DIGEST_LENGTH; i++)
- snprintf(md5 + i * 2, lim-i*2, "%02x", scratch[i]);
+ if ((peerinfo == NULL) || uuid_is_null (peerinfo->uuid))
+ return NULL;
+
+ if (peerinfo->uuid_str[0] == '\0')
+ uuid_utoa_r (peerinfo->uuid, peerinfo->uuid_str);
+
+ return peerinfo->uuid_str;
}
+
int32_t
glusterd_get_lock_owner (uuid_t *uuid)
{
@@ -119,133 +130,6 @@ glusterd_is_fuse_available ()
return _gf_false;
}
-gf_boolean_t
-glusterd_is_loopback_localhost (const struct sockaddr *sa, char *hostname)
-{
- GF_ASSERT (sa);
-
- gf_boolean_t is_local = _gf_false;
- const struct in_addr *addr4 = NULL;
- const struct in6_addr *addr6 = NULL;
- uint8_t *ap = NULL;
- struct in6_addr loopbackaddr6 = IN6ADDR_LOOPBACK_INIT;
-
- switch (sa->sa_family) {
- case AF_INET:
- addr4 = &(((struct sockaddr_in *)sa)->sin_addr);
- ap = (uint8_t*)&addr4->s_addr;
- if (ap[0] == 127)
- is_local = _gf_true;
- break;
-
- case AF_INET6:
- addr6 = &(((struct sockaddr_in6 *)sa)->sin6_addr);
- if (memcmp (addr6, &loopbackaddr6,
- sizeof (loopbackaddr6)) == 0)
- is_local = _gf_true;
- break;
-
- default:
- if (hostname)
- gf_log ("glusterd", GF_LOG_ERROR,
- "unknown address family %d for %s",
- sa->sa_family, hostname);
- break;
- }
-
- return is_local;
-}
-
-char *
-get_ip_from_addrinfo (struct addrinfo *addr, char **ip)
-{
- char buf[64];
- void *in_addr = NULL;
- struct sockaddr_in *s4 = NULL;
- struct sockaddr_in6 *s6 = NULL;
-
- switch (addr->ai_family)
- {
- case AF_INET:
- s4 = (struct sockaddr_in *)addr->ai_addr;
- in_addr = &s4->sin_addr;
- break;
-
- case AF_INET6:
- s6 = (struct sockaddr_in6 *)addr->ai_addr;
- in_addr = &s6->sin6_addr;
- break;
-
- default:
- gf_log ("glusterd", GF_LOG_ERROR, "Invalid family");
- return NULL;
- }
-
- if (!inet_ntop(addr->ai_family, in_addr, buf, sizeof(buf))) {
- gf_log ("glusterd", GF_LOG_ERROR, "String conversion failed");
- return NULL;
- }
-
- *ip = strdup (buf);
- return *ip;
-}
-
-/*TODO:FIXME: The function is expected to return a "yes/no" result.
- change return type to bool.*/
-int32_t
-glusterd_is_local_addr (char *hostname)
-{
- int32_t ret = -1;
- struct addrinfo *result = NULL;
- struct addrinfo *res = NULL;
- int32_t found = 0;
- int sd = -1;
- char *ip = NULL;
- xlator_t *this = NULL;
-
- this = THIS;
- ret = getaddrinfo (hostname, NULL, NULL, &result);
-
- if (ret != 0) {
- gf_log (this->name, GF_LOG_ERROR, "error in getaddrinfo: %s\n",
- gai_strerror(ret));
- goto out;
- }
-
- for (res = result; res != NULL; res = res->ai_next) {
- found = glusterd_is_loopback_localhost (res->ai_addr, hostname);
- if (found)
- goto out;
- }
-
- for (res = result; res != NULL; res = res->ai_next) {
- gf_log (this->name, GF_LOG_DEBUG, "%s ",
- get_ip_from_addrinfo (res, &ip));
- sd = socket (res->ai_family, SOCK_DGRAM, 0);
- if (sd == -1)
- goto out;
- /*If bind succeeds then its a local address*/
- ret = bind (sd, res->ai_addr, res->ai_addrlen);
- if (ret == 0) {
- found = _gf_true;
- gf_log (this->name, GF_LOG_DEBUG, "%s is local",
- get_ip_from_addrinfo (res, &ip));
- close (sd);
- break;
- }
- close (sd);
- }
-
-out:
- if (result)
- freeaddrinfo (result);
-
- if (!found)
- gf_log (this->name, GF_LOG_DEBUG, "%s is not local", hostname);
-
- return !found;
-}
-
int32_t
glusterd_lock (uuid_t uuid)
{
@@ -254,13 +138,17 @@ glusterd_lock (uuid_t uuid)
char new_owner_str[50];
char owner_str[50];
int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (uuid);
glusterd_get_lock_owner (&owner);
if (!uuid_is_null (owner)) {
- gf_log ("glusterd", GF_LOG_ERROR, "Unable to get lock"
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get lock"
" for uuid: %s, lock held by: %s",
uuid_utoa_r (uuid, new_owner_str),
uuid_utoa_r (owner, owner_str));
@@ -270,7 +158,7 @@ glusterd_lock (uuid_t uuid)
ret = glusterd_set_lock_owner (uuid);
if (!ret) {
- gf_log ("glusterd", GF_LOG_INFO, "Cluster lock held by"
+ gf_log (this->name, GF_LOG_DEBUG, "Cluster lock held by"
" %s", uuid_utoa (uuid));
}
@@ -286,21 +174,25 @@ glusterd_unlock (uuid_t uuid)
char new_owner_str[50];
char owner_str[50];
int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (uuid);
glusterd_get_lock_owner (&owner);
if (uuid_is_null (owner)) {
- gf_log ("glusterd", GF_LOG_ERROR, "Cluster lock not held!");
+ gf_log (this->name, GF_LOG_ERROR, "Cluster lock not held!");
goto out;
}
ret = uuid_compare (uuid, owner);
if (ret) {
- gf_log ("glusterd", GF_LOG_ERROR, "Cluster lock held by %s"
- " ,unlock req from %s!", uuid_utoa_r (owner ,owner_str)
+ gf_log (this->name, GF_LOG_ERROR, "Cluster lock held by %s ,"
+ "unlock req from %s!", uuid_utoa_r (owner ,owner_str)
, uuid_utoa_r (uuid, new_owner_str));
goto out;
}
@@ -308,7 +200,7 @@ glusterd_unlock (uuid_t uuid)
ret = glusterd_unset_lock_owner (uuid);
if (ret) {
- gf_log ("glusterd", GF_LOG_ERROR, "Unable to clear cluster "
+ gf_log (this->name, GF_LOG_ERROR, "Unable to clear cluster "
"lock");
goto out;
}
@@ -335,10 +227,11 @@ glusterd_get_uuid (uuid_t *uuid)
}
int
-glusterd_submit_request (struct rpc_clnt *rpc, void *req,
- call_frame_t *frame, rpc_clnt_prog_t *prog,
- int procnum, struct iobref *iobref,
- xlator_t *this, fop_cbk_fn_t cbkfn, xdrproc_t xdrproc)
+glusterd_submit_request_unlocked (struct rpc_clnt *rpc, void *req,
+ call_frame_t *frame, rpc_clnt_prog_t *prog,
+ int procnum, struct iobref *iobref,
+ xlator_t *this, fop_cbk_fn_t cbkfn,
+ xdrproc_t xdrproc)
{
int ret = -1;
struct iobuf *iobuf = NULL;
@@ -409,6 +302,28 @@ out:
return ret;
}
+
+int
+glusterd_submit_request (struct rpc_clnt *rpc, void *req,
+ call_frame_t *frame, rpc_clnt_prog_t *prog,
+ int procnum, struct iobref *iobref,
+ xlator_t *this, fop_cbk_fn_t cbkfn, xdrproc_t xdrproc)
+{
+ glusterd_conf_t *priv = THIS->private;
+ int ret = -1;
+
+ synclock_unlock (&priv->big_lock);
+ {
+ ret = glusterd_submit_request_unlocked (rpc, req, frame, prog,
+ procnum, iobref, this,
+ cbkfn, xdrproc);
+ }
+ synclock_lock (&priv->big_lock);
+
+ return ret;
+}
+
+
struct iobuf *
glusterd_serialize_reply (rpcsvc_request_t *req, void *arg,
struct iovec *outmsg, xdrproc_t xdrproc)
@@ -522,7 +437,7 @@ glusterd_check_volume_exists (char *volname)
ret = stat (pathname, &stbuf);
if (ret) {
- gf_log ("", GF_LOG_DEBUG, "Volume %s does not exist."
+ gf_log (THIS->name, GF_LOG_DEBUG, "Volume %s does not exist."
"stat failed with errno : %d on path: %s",
volname, errno, pathname);
return _gf_false;
@@ -531,6 +446,37 @@ glusterd_check_volume_exists (char *volname)
return _gf_true;
}
+glusterd_volinfo_t *
+glusterd_volinfo_unref (glusterd_volinfo_t *volinfo)
+{
+ int refcnt = -1;
+
+ pthread_mutex_lock (&volinfo->reflock);
+ {
+ refcnt = --volinfo->refcnt;
+ }
+ pthread_mutex_unlock (&volinfo->reflock);
+
+ if (!refcnt) {
+ glusterd_volinfo_delete (volinfo);
+ return NULL;
+ }
+
+ return volinfo;
+}
+
+glusterd_volinfo_t *
+glusterd_volinfo_ref (glusterd_volinfo_t *volinfo)
+{
+ pthread_mutex_lock (&volinfo->reflock);
+ {
+ ++volinfo->refcnt;
+ }
+ pthread_mutex_unlock (&volinfo->reflock);
+
+ return volinfo;
+}
+
int32_t
glusterd_volinfo_new (glusterd_volinfo_t **volinfo)
{
@@ -545,8 +491,11 @@ glusterd_volinfo_new (glusterd_volinfo_t **volinfo)
if (!new_volinfo)
goto out;
+ LOCK_INIT (&new_volinfo->lock);
INIT_LIST_HEAD (&new_volinfo->vol_list);
+ INIT_LIST_HEAD (&new_volinfo->snapvol_list);
INIT_LIST_HEAD (&new_volinfo->bricks);
+ INIT_LIST_HEAD (&new_volinfo->snap_volumes);
new_volinfo->dict = dict_new ();
if (!new_volinfo->dict) {
@@ -562,14 +511,233 @@ glusterd_volinfo_new (glusterd_volinfo_t **volinfo)
goto out;
}
+ snprintf (new_volinfo->parent_volname, GLUSTERD_MAX_VOLUME_NAME, "N/A");
+
+ new_volinfo->snap_max_hard_limit = GLUSTERD_SNAPS_MAX_HARD_LIMIT;
+
new_volinfo->xl = THIS;
- *volinfo = new_volinfo;
+ pthread_mutex_init (&new_volinfo->reflock, NULL);
+ *volinfo = glusterd_volinfo_ref (new_volinfo);
ret = 0;
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+/* This function will create a new volinfo and then
+ * dup the entries from volinfo to the new_volinfo.
+ *
+ * @param volinfo volinfo which will be duplicated
+ * @param dup_volinfo new volinfo which will be created
+ * @param set_userauth if this true then auth info is also set
+ *
+ * @return 0 on success else -1
+ */
+int32_t
+glusterd_volinfo_dup (glusterd_volinfo_t *volinfo,
+ glusterd_volinfo_t **dup_volinfo,
+ gf_boolean_t set_userauth)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_volinfo_t *new_volinfo = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_VALIDATE_OR_GOTO (this->name, volinfo, out);
+ GF_VALIDATE_OR_GOTO (this->name, dup_volinfo, out);
+
+ ret = glusterd_volinfo_new (&new_volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "not able to create the "
+ "duplicate volinfo for the volume %s",
+ volinfo->volname);
+ goto out;
+ }
+
+ new_volinfo->type = volinfo->type;
+ new_volinfo->replica_count = volinfo->replica_count;
+ new_volinfo->stripe_count = volinfo->stripe_count;
+ new_volinfo->dist_leaf_count = volinfo->dist_leaf_count;
+ new_volinfo->sub_count = volinfo->sub_count;
+ new_volinfo->transport_type = volinfo->transport_type;
+ new_volinfo->nfs_transport_type = volinfo->nfs_transport_type;
+ new_volinfo->brick_count = volinfo->brick_count;
+
+ dict_copy (volinfo->dict, new_volinfo->dict);
+ gd_update_volume_op_versions (new_volinfo);
+
+ if (set_userauth) {
+ glusterd_auth_set_username (new_volinfo,
+ volinfo->auth.username);
+ glusterd_auth_set_password (new_volinfo,
+ volinfo->auth.password);
+ }
+
+ *dup_volinfo = new_volinfo;
+ ret = 0;
+out:
+ if (ret && (NULL != new_volinfo)) {
+ (void) glusterd_volinfo_delete (new_volinfo);
+ }
+ return ret;
+}
+
+/* This function will duplicate brickinfo
+ *
+ * @param brickinfo Source brickinfo
+ * @param dup_brickinfo Destination brickinfo
+ *
+ * @return 0 on success else -1
+ */
+int32_t
+glusterd_brickinfo_dup (glusterd_brickinfo_t *brickinfo,
+ glusterd_brickinfo_t *dup_brickinfo)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ GF_VALIDATE_OR_GOTO (this->name, brickinfo, out);
+ GF_VALIDATE_OR_GOTO (this->name, dup_brickinfo, out);
+
+ strcpy (dup_brickinfo->hostname, brickinfo->hostname);
+ strcpy (dup_brickinfo->path, brickinfo->path);
+ strcpy (dup_brickinfo->device_path, brickinfo->device_path);
+ ret = gf_canonicalize_path (dup_brickinfo->path);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to canonicalize "
+ "brick path");
+ goto out;
+ }
+ uuid_copy (dup_brickinfo->uuid, brickinfo->uuid);
+
+ dup_brickinfo->port = brickinfo->port;
+ dup_brickinfo->rdma_port = brickinfo->rdma_port;
+ if (NULL != brickinfo->logfile) {
+ dup_brickinfo->logfile = gf_strdup (brickinfo->logfile);
+ if (NULL == dup_brickinfo->logfile) {
+ ret = -1;
+ goto out;
+ }
+ }
+ strcpy (dup_brickinfo->brick_id, brickinfo->brick_id);
+ dup_brickinfo->status = brickinfo->status;
+ dup_brickinfo->snap_status = brickinfo->snap_status;
+out:
+ return ret;
+}
+
+/* This function will copy snap volinfo to the new
+ * passed volinfo and regenerate backend store files
+ * for the restored snap.
+ *
+ * @param new_volinfo new volinfo
+ * @param snap_volinfo volinfo of snap volume
+ *
+ * @return 0 on success and -1 on failure
+ *
+ * TODO: Duplicate all members of volinfo, e.g. geo-rep sync slaves
+ */
+int32_t
+glusterd_snap_volinfo_restore (dict_t *rsp_dict,
+ glusterd_volinfo_t *new_volinfo,
+ glusterd_volinfo_t *snap_volinfo)
+{
+ int32_t brick_count = -1;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_brickinfo_t *new_brickinfo = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (rsp_dict);
+
+ GF_VALIDATE_OR_GOTO (this->name, new_volinfo, out);
+ GF_VALIDATE_OR_GOTO (this->name, snap_volinfo, out);
+
+ brick_count = 0;
+ list_for_each_entry (brickinfo, &snap_volinfo->bricks, brick_list) {
+ ret = glusterd_brickinfo_new (&new_brickinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to create "
+ "new brickinfo");
+ goto out;
+ }
+
+ /* Duplicate brickinfo */
+ ret = glusterd_brickinfo_dup (brickinfo, new_brickinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to dup "
+ "brickinfo");
+ goto out;
+ }
+
+ /* If the brick is not of this peer, or snapshot is missed *
+ * for the brick do not replace the xattr for it */
+ if ((!uuid_compare (brickinfo->uuid, MY_UUID)) &&
+ (brickinfo->snap_status != -1)) {
+ /* We need to replace the volume id of all the bricks
+ * to the volume id of the origin volume. new_volinfo
+ * has the origin volume's volume id*/
+ ret = sys_lsetxattr (new_brickinfo->path,
+ GF_XATTR_VOL_ID_KEY,
+ new_volinfo->volume_id,
+ sizeof (new_volinfo->volume_id),
+ XATTR_REPLACE);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "set extended attribute %s on %s. "
+ "Reason: %s, snap: %s",
+ GF_XATTR_VOL_ID_KEY,
+ new_brickinfo->path, strerror (errno),
+ new_volinfo->volname);
+ goto out;
+ }
+ }
+
+ /* If a snapshot is pending for this brick then
+ * restore should also be pending
+ */
+ if (brickinfo->snap_status == -1) {
+ /* Adding missed delete to the dict */
+ ret = glusterd_add_missed_snaps_to_dict
+ (rsp_dict,
+ snap_volinfo,
+ brickinfo,
+ brick_count + 1,
+ GF_SNAP_OPTION_TYPE_RESTORE);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add missed snapshot info "
+ "for %s:%s in the rsp_dict",
+ brickinfo->hostname,
+ brickinfo->path);
+ goto out;
+ }
+ }
+
+ list_add_tail (&new_brickinfo->brick_list,
+ &new_volinfo->bricks);
+ /* ownership of new_brickinfo is passed to new_volinfo */
+ new_brickinfo = NULL;
+ brick_count++;
+ }
+
+ /* Regenerate all volfiles */
+ ret = glusterd_create_volfiles_and_notify_services (new_volinfo);
+
+out:
+ if (ret && (NULL != new_brickinfo)) {
+ (void) glusterd_brickinfo_delete (new_brickinfo);
+ }
+
return ret;
}
@@ -653,10 +821,18 @@ glusterd_volume_brickinfos_delete (glusterd_volinfo_t *volinfo)
}
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
+int
+glusterd_volinfo_remove (glusterd_volinfo_t *volinfo)
+{
+ list_del_init (&volinfo->vol_list);
+ glusterd_volinfo_unref (volinfo);
+ return 0;
+}
+
int32_t
glusterd_volinfo_delete (glusterd_volinfo_t *volinfo)
{
@@ -665,6 +841,7 @@ glusterd_volinfo_delete (glusterd_volinfo_t *volinfo)
GF_ASSERT (volinfo);
list_del_init (&volinfo->vol_list);
+ list_del_init (&volinfo->snapvol_list);
ret = glusterd_volume_brickinfos_delete (volinfo);
if (ret)
@@ -674,18 +851,22 @@ glusterd_volinfo_delete (glusterd_volinfo_t *volinfo)
if (volinfo->gsync_slaves)
dict_unref (volinfo->gsync_slaves);
GF_FREE (volinfo->logdir);
+ if (volinfo->rebal.dict)
+ dict_unref (volinfo->rebal.dict);
+
+ gf_store_handle_destroy (volinfo->quota_conf_shandle);
glusterd_auth_cleanup (volinfo);
+ pthread_mutex_destroy (&volinfo->reflock);
GF_FREE (volinfo);
ret = 0;
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-
int32_t
glusterd_brickinfo_new (glusterd_brickinfo_t **brickinfo)
{
@@ -707,19 +888,47 @@ glusterd_brickinfo_new (glusterd_brickinfo_t **brickinfo)
ret = 0;
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
+int
+glusterd_get_next_available_brickid (glusterd_volinfo_t *volinfo)
+{
+ glusterd_brickinfo_t *brickinfo = NULL;
+ char *token = NULL;
+ int brickid = 0;
+ int max_brickid = -1;
+ int ret = -1;
+
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ token = strrchr (brickinfo->brick_id, '-');
+ ret = gf_string2int32 (++token, &brickid);
+ if (ret < 0) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Unable to generate brick ID");
+ return ret;
+ }
+ if (brickid > max_brickid)
+ max_brickid = brickid;
+ }
+
+ return max_brickid + 1 ;
+}
+
int32_t
glusterd_resolve_brick (glusterd_brickinfo_t *brickinfo)
{
int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (brickinfo);
ret = glusterd_hostname_to_uuid (brickinfo->hostname, brickinfo->uuid);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -733,6 +942,7 @@ glusterd_brickinfo_new_from_brick (char *brick,
char *path = NULL;
char *tmp_host = NULL;
char *tmp_path = NULL;
+ char *vg = NULL;
GF_ASSERT (brick);
GF_ASSERT (brickinfo);
@@ -751,6 +961,17 @@ glusterd_brickinfo_new_from_brick (char *brick,
if (ret)
goto out;
+#ifdef HAVE_BD_XLATOR
+ vg = strchr (path, '?');
+ /* ? is used as a delimiter for vg */
+ if (vg) {
+ strncpy (new_brickinfo->vg, vg + 1, PATH_MAX - 1);
+ *vg = '\0';
+ }
+ new_brickinfo->caps = CAPS_BD;
+#else
+ vg = NULL; /* Avoid compiler warnings when BD not enabled */
+#endif
ret = gf_canonicalize_path (path);
if (ret)
goto out;
@@ -765,7 +986,7 @@ out:
GF_FREE (tmp_host);
if (tmp_host)
GF_FREE (tmp_path);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -824,8 +1045,13 @@ glusterd_is_brickpath_available (uuid_t uuid, char *path)
strncpy (tmp_path, path, PATH_MAX);
/* path may not yet exist */
- if (!realpath (path, tmp_path) && (errno != ENOENT))
- goto out;
+ if (!realpath (path, tmp_path)) {
+ if (errno != ENOENT) {
+ goto out;
+ }
+ /* When realpath(3) fails, tmp_path is undefined. */
+ strncpy(tmp_path,path,PATH_MAX);
+ }
list_for_each_entry (volinfo, &priv->volumes, vol_list) {
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
@@ -849,6 +1075,169 @@ out:
return available;
}
+#ifdef HAVE_BD_XLATOR
+/*
+ * Sets the tag of the format "trusted.glusterfs.volume-id:<uuid>" in
+ * the brick VG. It is used to avoid using same VG for another brick.
+ * @volume-id - gfid, @brick - brick info, @msg - Error message returned
+ * to the caller
+ */
+int
+glusterd_bd_set_vg_tag (unsigned char *volume_id, glusterd_brickinfo_t *brick,
+ char *msg, int msg_size)
+{
+ lvm_t handle = NULL;
+ vg_t vg = NULL;
+ char *uuid = NULL;
+ int ret = -1;
+
+ gf_asprintf (&uuid, "%s:%s", GF_XATTR_VOL_ID_KEY,
+ uuid_utoa (volume_id));
+ if (!uuid) {
+ snprintf (msg, sizeof(*msg), "Could not allocate memory "
+ "for tag");
+ return -1;
+ }
+
+ handle = lvm_init (NULL);
+ if (!handle) {
+ snprintf (msg, sizeof(*msg), "lvm_init failed");
+ goto out;
+ }
+
+ vg = lvm_vg_open (handle, brick->vg, "w", 0);
+ if (!vg) {
+ snprintf (msg, sizeof(*msg), "Could not open VG %s",
+ brick->vg);
+ goto out;
+ }
+
+ if (lvm_vg_add_tag (vg, uuid) < 0) {
+ snprintf (msg, sizeof(*msg), "Could not set tag %s for "
+ "VG %s", uuid, brick->vg);
+ goto out;
+ }
+ lvm_vg_write (vg);
+ ret = 0;
+out:
+ GF_FREE (uuid);
+
+ if (vg)
+ lvm_vg_close (vg);
+ if (handle)
+ lvm_quit (handle);
+
+ return ret;
+}
+#endif
+
+int
+glusterd_validate_and_create_brickpath (glusterd_brickinfo_t *brickinfo,
+ uuid_t volume_id, char **op_errstr,
+ gf_boolean_t is_force)
+{
+ int ret = -1;
+ char parentdir[PATH_MAX] = {0,};
+ struct stat parent_st = {0,};
+ struct stat brick_st = {0,};
+ struct stat root_st = {0,};
+ char msg[2048] = {0,};
+ gf_boolean_t is_created = _gf_false;
+
+ ret = mkdir (brickinfo->path, 0777);
+ if (ret) {
+ if (errno != EEXIST) {
+ snprintf (msg, sizeof (msg), "Failed to create brick "
+ "directory for brick %s:%s. Reason : %s ",
+ brickinfo->hostname, brickinfo->path,
+ strerror (errno));
+ goto out;
+ }
+ } else {
+ is_created = _gf_true;
+ }
+
+ ret = lstat (brickinfo->path, &brick_st);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "lstat failed on %s. Reason : %s",
+ brickinfo->path, strerror (errno));
+ goto out;
+ }
+
+ if ((!is_created) && (!S_ISDIR (brick_st.st_mode))) {
+ snprintf (msg, sizeof (msg), "The provided path %s which is "
+ "already present, is not a directory",
+ brickinfo->path);
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (parentdir, sizeof (parentdir), "%s/..", brickinfo->path);
+
+ ret = lstat ("/", &root_st);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "lstat failed on /. Reason : %s",
+ strerror (errno));
+ goto out;
+ }
+
+ ret = lstat (parentdir, &parent_st);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "lstat failed on %s. Reason : %s",
+ parentdir, strerror (errno));
+ goto out;
+ }
+
+ if (!is_force) {
+ if (brick_st.st_dev != parent_st.st_dev) {
+ snprintf (msg, sizeof (msg), "The brick %s:%s is a "
+ "mount point. Please create a sub-directory "
+ "under the mount point and use that as the "
+ "brick directory. Or use 'force' at the end "
+ "of the command if you want to override this "
+ "behavior.", brickinfo->hostname,
+ brickinfo->path);
+ ret = -1;
+ goto out;
+ }
+ else if (parent_st.st_dev == root_st.st_dev) {
+ snprintf (msg, sizeof (msg), "The brick %s:%s "
+ "is being created in the root partition. It "
+ "is recommended that you don't use the "
+ "system's root partition for storage backend."
+ " Or use 'force' at the end of the command if"
+ " you want to override this behavior.",
+ brickinfo->hostname, brickinfo->path);
+ ret = -1;
+ goto out;
+ }
+ }
+
+#ifdef HAVE_BD_XLATOR
+ if (brickinfo->vg[0]) {
+ ret = glusterd_bd_set_vg_tag (volume_id, brickinfo, msg,
+ sizeof(msg));
+ if (ret)
+ goto out;
+ }
+#endif
+ ret = glusterd_check_and_set_brick_xattr (brickinfo->hostname,
+ brickinfo->path, volume_id,
+ op_errstr, is_force);
+ if (ret)
+ goto out;
+
+ ret = 0;
+
+out:
+ if (ret && is_created)
+ rmdir (brickinfo->path);
+ if (ret && !*op_errstr && msg[0] != '\0')
+ *op_errstr = gf_strdup (msg);
+
+ return ret;
+}
+
int32_t
glusterd_volume_brickinfo_get (uuid_t uuid, char *hostname, char *path,
glusterd_volinfo_t *volinfo,
@@ -857,6 +1246,9 @@ glusterd_volume_brickinfo_get (uuid_t uuid, char *hostname, char *path,
glusterd_brickinfo_t *brickiter = NULL;
uuid_t peer_uuid = {0};
int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
if (uuid) {
uuid_copy (peer_uuid, uuid);
@@ -875,7 +1267,9 @@ glusterd_volume_brickinfo_get (uuid_t uuid, char *hostname, char *path,
continue;
if (strcmp (brickiter->path, path) == 0) {
- gf_log (THIS->name, GF_LOG_INFO, "Found brick");
+ gf_log (this->name, GF_LOG_DEBUG, LOGSTR_FOUND_BRICK,
+ brickiter->hostname, brickiter->path,
+ volinfo->volname);
ret = 0;
if (brickinfo)
*brickinfo = brickiter;
@@ -884,7 +1278,7 @@ glusterd_volume_brickinfo_get (uuid_t uuid, char *hostname, char *path,
}
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -934,14 +1328,20 @@ glusterd_friend_cleanup (glusterd_peerinfo_t *peerinfo)
{
GF_ASSERT (peerinfo);
glusterd_peerctx_t *peerctx = NULL;
+ gf_boolean_t quorum_action = _gf_false;
+ glusterd_conf_t *priv = THIS->private;
+ if (peerinfo->quorum_contrib != QUORUM_NONE)
+ quorum_action = _gf_true;
if (peerinfo->rpc) {
/* cleanup the saved-frames before last unref */
+ synclock_unlock (&priv->big_lock);
rpc_clnt_connection_cleanup (&peerinfo->rpc->conn);
+ synclock_lock (&priv->big_lock);
peerctx = peerinfo->rpc->mydata;
peerinfo->rpc->mydata = NULL;
- peerinfo->rpc = rpc_clnt_unref (peerinfo->rpc);
+ peerinfo->rpc = glusterd_rpc_clnt_unref (priv, peerinfo->rpc);
peerinfo->rpc = NULL;
if (peerctx) {
GF_FREE (peerctx->errstr);
@@ -950,9 +1350,73 @@ glusterd_friend_cleanup (glusterd_peerinfo_t *peerinfo)
}
glusterd_peer_destroy (peerinfo);
+ if (quorum_action)
+ glusterd_do_quorum_action ();
return 0;
}
+int
+glusterd_volinfo_find_by_volume_id (uuid_t volume_id, glusterd_volinfo_t **volinfo)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_volinfo_t *voliter = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ if (!volume_id)
+ return -1;
+
+ this = THIS;
+ priv = this->private;
+
+ list_for_each_entry (voliter, &priv->volumes, vol_list) {
+ if (uuid_compare (volume_id, voliter->volume_id))
+ continue;
+ *volinfo = voliter;
+ ret = 0;
+ gf_log (this->name, GF_LOG_DEBUG, "Volume %s found",
+ voliter->volname);
+ break;
+ }
+ return ret;
+}
+
+int
+glusterd_snap_volinfo_find_by_volume_id (uuid_t volume_id,
+ glusterd_volinfo_t **volinfo)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_volinfo_t *voliter = NULL;
+ glusterd_snap_t *snap = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (volinfo);
+
+ if (uuid_is_null(volume_id)) {
+ gf_log (this->name, GF_LOG_WARNING, "Volume UUID is NULL");
+ goto out;
+ }
+
+ list_for_each_entry (snap, &priv->snapshots, snap_list) {
+ list_for_each_entry (voliter, &snap->volumes, vol_list) {
+ if (uuid_compare (volume_id, voliter->volume_id))
+ continue;
+ *volinfo = voliter;
+ ret = 0;
+ goto out;
+ }
+ }
+
+ gf_log (this->name, GF_LOG_WARNING, "Snap volume not found");
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
int32_t
glusterd_volinfo_find (char *volname, glusterd_volinfo_t **volinfo)
{
@@ -962,116 +1426,135 @@ glusterd_volinfo_find (char *volname, glusterd_volinfo_t **volinfo)
glusterd_conf_t *priv = NULL;
GF_ASSERT (volname);
-
this = THIS;
GF_ASSERT (this);
priv = this->private;
+ GF_ASSERT (priv);
list_for_each_entry (tmp_volinfo, &priv->volumes, vol_list) {
if (!strcmp (tmp_volinfo->volname, volname)) {
- gf_log ("", GF_LOG_DEBUG, "Volume %s found", volname);
+ gf_log (this->name, GF_LOG_DEBUG, "Volume %s found",
+ volname);
ret = 0;
*volinfo = tmp_volinfo;
break;
}
}
-
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-
int32_t
-glusterd_service_stop (const char *service, char *pidfile, int sig,
- gf_boolean_t force_kill)
+glusterd_snap_volinfo_find (char *snap_volname, glusterd_snap_t *snap,
+ glusterd_volinfo_t **volinfo)
{
- int32_t ret = -1;
- pid_t pid = -1;
- FILE *file = NULL;
- gf_boolean_t is_locked = _gf_false;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_volinfo_t *snap_vol = NULL;
+ glusterd_conf_t *priv = NULL;
- file = fopen (pidfile, "r+");
+ this = THIS;
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (snap);
+ GF_ASSERT (snap_volname);
- if (!file) {
- gf_log ("", GF_LOG_ERROR, "Unable to open pidfile: %s",
- pidfile);
- if (errno == ENOENT) {
- gf_log ("",GF_LOG_TRACE, "%s may not be running",
- service);
+ list_for_each_entry (snap_vol, &snap->volumes, vol_list) {
+ if (!strcmp (snap_vol->volname, snap_volname)) {
ret = 0;
+ *volinfo = snap_vol;
goto out;
}
- ret = -1;
- goto out;
}
- ret = lockf (fileno (file), F_TLOCK, 0);
- if (!ret) {
- is_locked = _gf_true;
- ret = unlink (pidfile);
- if (ret && (ENOENT != errno)) {
- gf_log ("", GF_LOG_ERROR, "Unable to "
- "unlink stale pidfile: %s", pidfile);
- } else if (ret && (ENOENT == errno)){
+
+ gf_log (this->name, GF_LOG_WARNING, "Snap volume %s not found",
+ snap_volname);
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_snap_volinfo_find_from_parent_volname (char *origin_volname,
+ glusterd_snap_t *snap,
+ glusterd_volinfo_t **volinfo)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_volinfo_t *snap_vol = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (snap);
+ GF_ASSERT (origin_volname);
+
+ list_for_each_entry (snap_vol, &snap->volumes, vol_list) {
+ if (!strcmp (snap_vol->parent_volname, origin_volname)) {
ret = 0;
- gf_log ("", GF_LOG_INFO, "Brick already stopped");
+ *volinfo = snap_vol;
+ goto out;
}
- goto out;
}
+ gf_log (this->name, GF_LOG_DEBUG, "Snap volume not found(snap: %s, "
+ "origin-volume: %s", snap->snapname, origin_volname);
- ret = fscanf (file, "%d", &pid);
- if (ret <= 0) {
- gf_log ("", GF_LOG_ERROR, "Unable to read pidfile: %s",
- pidfile);
- ret = -1;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_service_stop (const char *service, char *pidfile, int sig,
+ gf_boolean_t force_kill)
+{
+ int32_t ret = -1;
+ pid_t pid = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ if (!gf_is_service_running (pidfile, &pid)) {
+ ret = 0;
+ gf_log (this->name, GF_LOG_INFO, "%s already stopped", service);
goto out;
}
- fclose (file);
- file = NULL;
-
- gf_log ("", GF_LOG_INFO, "Stopping gluster %s running in pid: %d",
- service, pid);
+ gf_log (this->name, GF_LOG_DEBUG, "Stopping gluster %s running in pid: "
+ "%d", service, pid);
ret = kill (pid, sig);
-
- if (force_kill) {
- sleep (1);
- file = fopen (pidfile, "r+");
- if (!file) {
+ if (ret) {
+ switch (errno) {
+ case ESRCH:
+ gf_log (this->name, GF_LOG_DEBUG, "%s is already stopped",
+ service);
ret = 0;
goto out;
+ default:
+ gf_log (this->name, GF_LOG_ERROR, "Failed to kill %s: %s",
+ service, strerror (errno));
}
- ret = lockf (fileno (file), F_TLOCK, 0);
- if (ret && ((EAGAIN == errno) || (EACCES == errno))) {
- ret = kill (pid, SIGKILL);
- if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to "
- "kill pid %d reason: %s", pid,
- strerror(errno));
- goto out;
- }
+ }
+ if (!force_kill)
+ goto out;
- } else if (0 == ret){
- is_locked = _gf_true;
- }
- ret = unlink (pidfile);
- if (ret && (ENOENT != errno)) {
- gf_log ("", GF_LOG_ERROR, "Unable to "
- "unlink pidfile: %s", pidfile);
+ sleep (1);
+ if (gf_is_service_running (pidfile, NULL)) {
+ ret = kill (pid, SIGKILL);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to "
+ "kill pid %d reason: %s", pid,
+ strerror(errno));
goto out;
}
}
ret = 0;
out:
- if (is_locked && file)
- if (lockf (fileno (file), F_ULOCK, 0) < 0)
- gf_log ("", GF_LOG_WARNING, "Cannot unlock pidfile: %s"
- " reason: %s", pidfile, strerror(errno));
- if (file)
- fclose (file);
return ret;
}
@@ -1081,7 +1564,7 @@ glusterd_set_socket_filepath (char *sock_filepath, char *sockpath, size_t len)
char md5_sum[MD5_DIGEST_LENGTH*2+1] = {0,};
md5_wrapper ((unsigned char *) sock_filepath, strlen(sock_filepath), md5_sum);
- snprintf (sockpath, len, "%s/%s.socket", glusterd_sock_dir, md5_sum);
+ snprintf (sockpath, len, "%s/%s.socket", GLUSTERD_SOCK_DIR, md5_sum);
}
void
@@ -1096,7 +1579,7 @@ glusterd_set_brick_socket_filepath (glusterd_volinfo_t *volinfo,
glusterd_conf_t *priv = NULL;
int expected_file_len = 0;
- expected_file_len = strlen (glusterd_sock_dir) + strlen ("/") +
+ expected_file_len = strlen (GLUSTERD_SOCK_DIR) + strlen ("/") +
MD5_DIGEST_LENGTH*2 + strlen (".socket") + 1;
GF_ASSERT (len >= expected_file_len);
this = THIS;
@@ -1117,42 +1600,72 @@ glusterd_set_brick_socket_filepath (glusterd_volinfo_t *volinfo,
*/
int32_t
glusterd_brick_connect (glusterd_volinfo_t *volinfo,
- glusterd_brickinfo_t *brickinfo)
+ glusterd_brickinfo_t *brickinfo, char *socketpath)
{
int ret = 0;
- char socketpath[PATH_MAX] = {0};
+ char volume_id_str[64];
+ char *brickid = NULL;
dict_t *options = NULL;
struct rpc_clnt *rpc = NULL;
+ glusterd_conf_t *priv = THIS->private;
GF_ASSERT (volinfo);
GF_ASSERT (brickinfo);
+ GF_ASSERT (socketpath);
if (brickinfo->rpc == NULL) {
- glusterd_set_brick_socket_filepath (volinfo, brickinfo,
- socketpath,
- sizeof (socketpath));
-
/* Setting frame-timeout to 10mins (600seconds).
* Unix domain sockets ensures that the connection is reliable.
* The default timeout of 30mins used for unreliable network
* connections is too long for unix domain socket connections.
*/
- ret = rpc_clnt_transport_unix_options_build (&options,
- socketpath, 600);
+ ret = rpc_transport_unix_options_build (&options, socketpath,
+ 600);
if (ret)
goto out;
+
+ uuid_utoa_r (volinfo->volume_id, volume_id_str);
+ ret = gf_asprintf (&brickid, "%s:%s:%s", volume_id_str,
+ brickinfo->hostname, brickinfo->path);
+ if (ret < 0)
+ goto out;
+
+ synclock_unlock (&priv->big_lock);
ret = glusterd_rpc_create (&rpc, options,
glusterd_brick_rpc_notify,
- brickinfo);
- if (ret)
+ brickid);
+ synclock_lock (&priv->big_lock);
+ if (ret) {
+ GF_FREE (brickid);
goto out;
+ }
brickinfo->rpc = rpc;
}
out:
+
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
+static int
+_mk_rundir_p (glusterd_volinfo_t *volinfo)
+{
+ char voldir[PATH_MAX] = {0,};
+ char rundir[PATH_MAX] = {0,};
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+ int ret = -1;
+
+ this = THIS;
+ priv = this->private;
+ GLUSTERD_GET_VOLUME_DIR (voldir, volinfo, priv);
+ snprintf (rundir, sizeof (rundir)-1, "%s/run", voldir);
+ ret = mkdir_p (rundir, 0777, _gf_true);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Failed to create rundir");
+ return ret;
+}
+
int32_t
glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t *brickinfo,
@@ -1161,22 +1674,17 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo,
int32_t ret = -1;
xlator_t *this = NULL;
glusterd_conf_t *priv = NULL;
- char pidfile[PATH_MAX] = {0,};
+ char pidfile[PATH_MAX+1] = {0,};
char volfile[PATH_MAX] = {0,};
- char path[PATH_MAX] = {0,};
runner_t runner = {0,};
- char rundir[PATH_MAX] = {0,};
char exp_path[PATH_MAX] = {0,};
char logfile[PATH_MAX] = {0,};
int port = 0;
int rdma_port = 0;
- FILE *file = NULL;
- gf_boolean_t is_locked = _gf_false;
char socketpath[PATH_MAX] = {0};
char glusterd_uuid[1024] = {0,};
-#ifdef DEBUG
char valgrind_logfile[PATH_MAX] = {0};
-#endif
+
GF_ASSERT (volinfo);
GF_ASSERT (brickinfo);
@@ -1184,109 +1692,91 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo,
GF_ASSERT (this);
priv = this->private;
+ GF_ASSERT (priv);
- GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv);
- snprintf (rundir, PATH_MAX, "%s/run", path);
- ret = mkdir (rundir, 0777);
-
- if ((ret == -1) && (EEXIST != errno)) {
- gf_log ("", GF_LOG_ERROR, "Unable to create rundir %s",
- rundir);
+ if (brickinfo->snap_status == -1) {
+ gf_log (this->name, GF_LOG_INFO,
+ "Snapshot is pending on %s:%s. "
+ "Hence not starting the brick",
+ brickinfo->hostname,
+ brickinfo->path);
+ ret = 0;
goto out;
}
+ ret = _mk_rundir_p (volinfo);
+ if (ret)
+ goto out;
+
glusterd_set_brick_socket_filepath (volinfo, brickinfo, socketpath,
sizeof (socketpath));
- GLUSTERD_GET_BRICK_PIDFILE (pidfile, path, brickinfo->hostname,
- brickinfo->path);
-
- file = fopen (pidfile, "r+");
- if (file) {
- ret = lockf (fileno (file), F_TLOCK, 0);
- if (ret && ((EAGAIN == errno) || (EACCES == errno))) {
- ret = 0;
- gf_log ("", GF_LOG_INFO, "brick %s:%s "
- "already started", brickinfo->hostname,
- brickinfo->path);
- goto connect;
- }
- }
-
- ret = pmap_registry_search (this, brickinfo->path,
- GF_PMAP_PORT_BRICKSERVER);
- if (ret) {
- ret = 0;
- file = fopen (pidfile, "r+");
- if (file) {
- ret = lockf (fileno (file), F_TLOCK, 0);
- if (ret && ((EAGAIN == errno) || (EACCES == errno))) {
- ret = 0;
- gf_log ("", GF_LOG_INFO, "brick %s:%s "
- "already started", brickinfo->hostname,
- brickinfo->path);
- goto connect;
- } else if (0 == ret) {
- is_locked = _gf_true;
- }
- }
- /* This means, pmap has the entry, remove it */
- ret = pmap_registry_remove (this, 0, brickinfo->path,
- GF_PMAP_PORT_BRICKSERVER, NULL);
- }
- unlink (pidfile);
- gf_log ("", GF_LOG_INFO, "About to start glusterfs"
- " for brick %s:%s", brickinfo->hostname,
- brickinfo->path);
- GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, exp_path);
- snprintf (volfile, PATH_MAX, "%s.%s.%s", volinfo->volname,
- brickinfo->hostname, exp_path);
-
- if (!brickinfo->logfile && volinfo->logdir) {
- snprintf (logfile, PATH_MAX, "%s/%s.log", volinfo->logdir,
- exp_path);
- brickinfo->logfile = gf_strdup (logfile);
- } else if (!brickinfo->logfile) {
- snprintf (logfile, PATH_MAX, "%s/bricks/%s.log",
- DEFAULT_LOG_FILE_DIRECTORY, exp_path);
- brickinfo->logfile = gf_strdup (logfile);
- }
+ GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, priv);
+ if (gf_is_service_running (pidfile, NULL))
+ goto connect;
port = brickinfo->port;
if (!port)
port = pmap_registry_alloc (THIS);
+ /* Build the exp_path, before starting the glusterfsd even in
+ valgrind mode. Otherwise all the glusterfsd processes start
+ writing the valgrind log to the same file.
+ */
+ GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, exp_path);
runinit (&runner);
-#ifdef DEBUG
if (priv->valgrind) {
+ /* Run bricks with valgrind */
if (volinfo->logdir) {
snprintf (valgrind_logfile, PATH_MAX,
- "%s/valgrind-%s-%s.log", volinfo->logdir,
+ "%s/valgrind-%s-%s.log",
+ volinfo->logdir,
volinfo->volname, exp_path);
} else {
- snprintf (valgrind_logfile, PATH_MAX,
- "%s/bricks/valgrind-%s-%s.log",
- DEFAULT_LOG_FILE_DIRECTORY,
- volinfo->volname, exp_path);
+ snprintf (valgrind_logfile, PATH_MAX,
+ "%s/bricks/valgrind-%s-%s.log",
+ DEFAULT_LOG_FILE_DIRECTORY,
+ volinfo->volname, exp_path);
}
- /* Run bricks with valgrind */
+
runner_add_args (&runner, "valgrind", "--leak-check=full",
- "--trace-children=yes", NULL);
+ "--trace-children=yes", "--track-origins=yes",
+ NULL);
runner_argprintf (&runner, "--log-file=%s", valgrind_logfile);
- }
-#endif
+ }
+
+ if (volinfo->is_snap_volume) {
+ snprintf (volfile, PATH_MAX,"/%s/%s/%s.%s.%s",
+ GLUSTERD_VOL_SNAP_DIR_PREFIX,
+ volinfo->snapshot->snapname, volinfo->volname,
+ brickinfo->hostname, exp_path);
+ } else {
+ snprintf (volfile, PATH_MAX, "%s.%s.%s", volinfo->volname,
+ brickinfo->hostname, exp_path);
+ }
+
+ if (volinfo->logdir) {
+ snprintf (logfile, PATH_MAX, "%s/%s.log",
+ volinfo->logdir, exp_path);
+ } else {
+ snprintf (logfile, PATH_MAX, "%s/bricks/%s.log",
+ DEFAULT_LOG_FILE_DIRECTORY, exp_path);
+ }
+ if (!brickinfo->logfile)
+ brickinfo->logfile = gf_strdup (logfile);
+
(void) snprintf (glusterd_uuid, 1024, "*-posix.glusterd-uuid=%s",
uuid_utoa (MY_UUID));
- runner_add_args (&runner, SBIN_DIR"/glusterfsd",
- "-s", "localhost", "--volfile-id", volfile,
+ runner_add_args (&runner, SBIN_DIR"/glusterfsd",
+ "-s", brickinfo->hostname, "--volfile-id", volfile,
"-p", pidfile, "-S", socketpath,
"--brick-name", brickinfo->path,
"-l", brickinfo->logfile,
"--xlator-option", glusterd_uuid,
NULL);
- runner_add_arg (&runner, "--brick-port");
+ runner_add_arg (&runner, "--brick-port");
if (volinfo->transport_type != GF_TRANSPORT_BOTH_TCP_RDMA) {
runner_argprintf (&runner, "%d", port);
} else {
@@ -1307,29 +1797,30 @@ glusterd_volume_start_glusterfs (glusterd_volinfo_t *volinfo,
runner_add_arg (&runner, "--mem-accounting");
runner_log (&runner, "", GF_LOG_DEBUG, "Starting GlusterFS");
- if (wait)
+ if (wait) {
+ synclock_unlock (&priv->big_lock);
ret = runner_run (&runner);
- else
+ synclock_lock (&priv->big_lock);
+
+ } else {
ret = runner_run_nowait (&runner);
+ }
if (ret)
goto out;
- //pmap_registry_bind (THIS, port, brickinfo->path);
brickinfo->port = port;
brickinfo->rdma_port = rdma_port;
connect:
- ret = glusterd_brick_connect (volinfo, brickinfo);
- if (ret)
+ ret = glusterd_brick_connect (volinfo, brickinfo, socketpath);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to connect to brick %s:%s on %s",
+ brickinfo->hostname, brickinfo->path, socketpath);
goto out;
+ }
out:
- if (is_locked && file)
- if (lockf (fileno (file), F_ULOCK, 0) < 0)
- gf_log ("", GF_LOG_WARNING, "Cannot unlock pidfile: %s"
- " reason: %s", pidfile, strerror(errno));
- if (file)
- fclose (file);
return ret;
}
@@ -1357,7 +1848,7 @@ glusterd_brick_unlink_socket_file (glusterd_volinfo_t *volinfo,
if (ret && (ENOENT == errno)) {
ret = 0;
} else {
- gf_log ("glusterd", GF_LOG_ERROR, "Failed to remove %s"
+ gf_log (this->name, GF_LOG_ERROR, "Failed to remove %s"
" error: %s", socketpath, strerror (errno));
}
@@ -1367,15 +1858,23 @@ glusterd_brick_unlink_socket_file (glusterd_volinfo_t *volinfo,
int32_t
glusterd_brick_disconnect (glusterd_brickinfo_t *brickinfo)
{
+ rpc_clnt_t *rpc = NULL;
+ glusterd_conf_t *priv = THIS->private;
+
GF_ASSERT (brickinfo);
- if (brickinfo->rpc) {
- /* cleanup the saved-frames before last unref */
- rpc_clnt_connection_cleanup (&brickinfo->rpc->conn);
+ if (!brickinfo) {
+ gf_log_callingfn ("glusterd", GF_LOG_WARNING, "!brickinfo");
+ return -1;
+ }
- rpc_clnt_unref (brickinfo->rpc);
- brickinfo->rpc = NULL;
+ rpc = brickinfo->rpc;
+ brickinfo->rpc = NULL;
+
+ if (rpc) {
+ glusterd_rpc_clnt_unref (priv, rpc);
}
+
return 0;
}
@@ -1384,11 +1883,10 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t *brickinfo,
gf_boolean_t del_brick)
{
- xlator_t *this = NULL;
- glusterd_conf_t *priv = NULL;
- char pidfile[PATH_MAX] = {0,};
- char path[PATH_MAX] = {0,};
- int ret = 0;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ char pidfile[PATH_MAX] = {0,};
+ int ret = 0;
GF_ASSERT (volinfo);
GF_ASSERT (brickinfo);
@@ -1400,20 +1898,21 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
if (del_brick)
list_del_init (&brickinfo->brick_list);
- (void) glusterd_brick_disconnect (brickinfo);
-
- GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv);
- GLUSTERD_GET_BRICK_PIDFILE (pidfile, path, brickinfo->hostname,
- brickinfo->path);
-
- ret = glusterd_service_stop ("brick", pidfile, SIGTERM, _gf_false);
- if (ret == 0) {
- glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
- (void) glusterd_brick_unlink_socket_file (volinfo, brickinfo);
+ if (GLUSTERD_STATUS_STARTED == volinfo->status) {
+ (void) glusterd_brick_disconnect (brickinfo);
+ GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, priv);
+ ret = glusterd_service_stop ("brick", pidfile, SIGTERM, _gf_false);
+ if (ret == 0) {
+ glusterd_set_brick_status (brickinfo, GF_BRICK_STOPPED);
+ (void) glusterd_brick_unlink_socket_file (volinfo, brickinfo);
+ GLUSTERD_GET_BRICK_RECON_PIDFILE (pidfile, volinfo, brickinfo, priv);
+ ret = glusterd_service_stop ("recon", pidfile, SIGTERM, _gf_false);
+ }
}
if (del_brick)
glusterd_delete_brick (volinfo, brickinfo);
+
return ret;
}
@@ -1552,89 +2051,85 @@ glusterd_sort_and_redirect (const char *src_filepath, int dest_fd)
}
int
-glusterd_volume_compute_cksum (glusterd_volinfo_t *volinfo)
-{
- int32_t ret = -1;
- glusterd_conf_t *priv = NULL;
- char path[PATH_MAX] = {0,};
- char cksum_path[PATH_MAX] = {0,};
- char filepath[PATH_MAX] = {0,};
- int fd = -1;
- uint32_t cksum = 0;
- char buf[4096] = {0,};
+glusterd_volume_compute_cksum (glusterd_volinfo_t *volinfo, char *cksum_path,
+ char *filepath, gf_boolean_t is_quota_conf,
+ uint32_t *cs)
+{
+ int32_t ret = -1;
+ uint32_t cksum = 0;
+ int fd = -1;
+ int sort_fd = 0;
char sort_filepath[PATH_MAX] = {0};
- gf_boolean_t unlink_sortfile = _gf_false;
- int sort_fd = 0;
- xlator_t *this = NULL;
+ char *cksum_path_final = NULL;
+ char buf[4096] = {0,};
+ gf_boolean_t unlink_sortfile = _gf_false;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
GF_ASSERT (volinfo);
this = THIS;
priv = THIS->private;
GF_ASSERT (priv);
- GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv);
-
- snprintf (cksum_path, sizeof (cksum_path), "%s/%s",
- path, GLUSTERD_CKSUM_FILE);
-
fd = open (cksum_path, O_RDWR | O_APPEND | O_CREAT| O_TRUNC, 0600);
if (-1 == fd) {
- gf_log (this->name, GF_LOG_ERROR, "Unable to open %s, errno: %d",
- cksum_path, errno);
+ gf_log (this->name, GF_LOG_ERROR, "Unable to open %s,"
+ " errno: %d", cksum_path, errno);
ret = -1;
goto out;
}
- snprintf (filepath, sizeof (filepath), "%s/%s", path,
- GLUSTERD_VOLUME_INFO_FILE);
- snprintf (sort_filepath, sizeof (sort_filepath), "/tmp/%s.XXXXXX",
- volinfo->volname);
+ if (!is_quota_conf) {
+ snprintf (sort_filepath, sizeof (sort_filepath),
+ "/tmp/%s.XXXXXX", volinfo->volname);
- sort_fd = mkstemp (sort_filepath);
- if (sort_fd < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Could not generate temp file, "
- "reason: %s for volume: %s", strerror (errno),
- volinfo->volname);
- goto out;
- } else {
- unlink_sortfile = _gf_true;
- }
+ sort_fd = mkstemp (sort_filepath);
+ if (sort_fd < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not generate "
+ "temp file, reason: %s for volume: %s",
+ strerror (errno), volinfo->volname);
+ goto out;
+ } else {
+ unlink_sortfile = _gf_true;
+ }
- /* sort the info file, result in sort_filepath */
+ /* sort the info file, result in sort_filepath */
- ret = glusterd_sort_and_redirect (filepath, sort_fd);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "sorting info file failed");
- goto out;
- }
+ ret = glusterd_sort_and_redirect (filepath, sort_fd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "sorting info file "
+ "failed");
+ goto out;
+ }
- ret = close (sort_fd);
- if (ret)
- goto out;
+ ret = close (sort_fd);
+ if (ret)
+ goto out;
+ }
- ret = get_checksum_for_path (sort_filepath, &cksum);
+ cksum_path_final = is_quota_conf ? filepath : sort_filepath;
+ ret = get_checksum_for_path (cksum_path_final, &cksum);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Unable to get checksum"
- " for path: %s", sort_filepath);
+ gf_log (this->name, GF_LOG_ERROR, "unable to get "
+ "checksum for path: %s", cksum_path_final);
goto out;
}
-
- snprintf (buf, sizeof (buf), "%s=%u\n", "info", cksum);
- ret = write (fd, buf, strlen (buf));
-
- if (ret <= 0) {
- ret = -1;
- goto out;
+ if (!is_quota_conf) {
+ snprintf (buf, sizeof (buf), "%s=%u\n", "info", cksum);
+ ret = write (fd, buf, strlen (buf));
+ if (ret <= 0) {
+ ret = -1;
+ goto out;
+ }
}
ret = get_checksum_for_file (fd, &cksum);
-
if (ret)
goto out;
- volinfo->cksum = cksum;
+ *cs = cksum;
out:
if (fd > 0)
@@ -1646,26 +2141,74 @@ out:
return ret;
}
+int glusterd_compute_cksum (glusterd_volinfo_t *volinfo,
+ gf_boolean_t is_quota_conf)
+{
+ int ret = -1;
+ uint32_t cs = 0;
+ char cksum_path[PATH_MAX] = {0,};
+ char path[PATH_MAX] = {0,};
+ char filepath[PATH_MAX] = {0,};
+ glusterd_conf_t *conf = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ GLUSTERD_GET_VOLUME_DIR (path, volinfo, conf);
+
+ if (is_quota_conf) {
+ snprintf (cksum_path, sizeof (cksum_path), "%s/%s", path,
+ GLUSTERD_VOL_QUOTA_CKSUM_FILE);
+ snprintf (filepath, sizeof (filepath), "%s/%s", path,
+ GLUSTERD_VOLUME_QUOTA_CONFIG);
+ } else {
+ snprintf (cksum_path, sizeof (cksum_path), "%s/%s", path,
+ GLUSTERD_CKSUM_FILE);
+ snprintf (filepath, sizeof (filepath), "%s/%s", path,
+ GLUSTERD_VOLUME_INFO_FILE);
+ }
+
+ ret = glusterd_volume_compute_cksum (volinfo, cksum_path, filepath,
+ is_quota_conf, &cs);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to compute checksum "
+ "for volume %s", volinfo->volname);
+ goto out;
+ }
+
+ if (is_quota_conf)
+ volinfo->quota_conf_cksum = cs;
+ else
+ volinfo->cksum = cs;
+
+ ret = 0;
+out:
+ return ret;
+}
+
int
-_add_volinfo_dict_to_prdict (dict_t *this, char *key, data_t *value, void *data)
+_add_dict_to_prdict (dict_t *this, char *key, data_t *value, void *data)
{
- glusterd_voldict_ctx_t *ctx = NULL;
+ glusterd_dict_ctx_t *ctx = NULL;
char optkey[512] = {0,};
int ret = -1;
ctx = data;
- snprintf (optkey, sizeof (optkey), "volume%d.%s%d", ctx->count,
+ snprintf (optkey, sizeof (optkey), "%s.%s%d", ctx->prefix,
ctx->key_name, ctx->opt_count);
ret = dict_set_str (ctx->dict, optkey, key);
if (ret)
gf_log ("", GF_LOG_ERROR, "option add for %s%d %s",
- ctx->key_name, ctx->count, key);
- snprintf (optkey, sizeof (optkey), "volume%d.%s%d", ctx->count,
+ ctx->key_name, ctx->opt_count, key);
+ snprintf (optkey, sizeof (optkey), "%s.%s%d", ctx->prefix,
ctx->val_name, ctx->opt_count);
ret = dict_set_str (ctx->dict, optkey, value->data);
if (ret)
gf_log ("", GF_LOG_ERROR, "option add for %s%d %s",
- ctx->val_name, ctx->count, value->data);
+ ctx->val_name, ctx->opt_count, value->data);
ctx->opt_count++;
return ret;
@@ -1698,100 +2241,148 @@ out:
return ret;
}
+/* The prefix represents the type of volume to be added.
+ * It will be "volume" for normal volumes, and snap# like
+ * snap1, snap2, for snapshot volumes
+ */
int32_t
glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo,
- dict_t *dict, int32_t count)
-{
- int32_t ret = -1;
- char key[512] = {0,};
- glusterd_brickinfo_t *brickinfo = NULL;
- int32_t i = 1;
- char *volume_id_str = NULL;
- char *src_brick = NULL;
- char *dst_brick = NULL;
- char *str = NULL;
- glusterd_voldict_ctx_t ctx = {0};
+ dict_t *dict, int32_t count,
+ char *prefix)
+{
+ int32_t ret = -1;
+ char pfx[512] = {0,};
+ char key[512] = {0,};
+ glusterd_brickinfo_t *brickinfo = NULL;
+ int32_t i = 1;
+ char *volume_id_str = NULL;
+ char *src_brick = NULL;
+ char *dst_brick = NULL;
+ char *str = NULL;
+ glusterd_dict_ctx_t ctx = {0};
+ char *rebalance_id_str = NULL;
+ char *rb_id_str = NULL;
+ xlator_t *this = NULL;
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (dict);
GF_ASSERT (volinfo);
+ GF_ASSERT (prefix);
- snprintf (key, sizeof (key), "volume%d.name", count);
+ snprintf (key, sizeof (key), "%s%d.name", prefix, count);
ret = dict_set_str (dict, key, volinfo->volname);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.type", count);
+ snprintf (key, sizeof (key), "%s%d.type", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->type);
if (ret)
goto out;
+ snprintf (key, sizeof (key), "volume%d.restored_from_snap", count);
+ ret = dict_set_dynstr_with_alloc
+ (dict, key,
+ uuid_utoa (volinfo->restored_from_snap));
+ if (ret)
+ goto out;
+
+ if (strlen (volinfo->parent_volname) > 0) {
+ snprintf (key, sizeof (key), "%s%d.parent_volname",
+ prefix, count);
+ ret = dict_set_dynstr_with_alloc (dict, key,
+ volinfo->parent_volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set parent_volname for %s",
+ volinfo->volname);
+ goto out;
+ }
+ }
+
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.brick_count", count);
+ snprintf (key, sizeof (key), "%s%d.brick_count", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->brick_count);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.version", count);
+ snprintf (key, sizeof (key), "%s%d.version", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->version);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.status", count);
+ snprintf (key, sizeof (key), "%s%d.status", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->status);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.sub_count", count);
+ snprintf (key, sizeof (key), "%s%d.sub_count", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->sub_count);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.stripe_count", count);
+ snprintf (key, sizeof (key), "%s%d.stripe_count", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->stripe_count);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.replica_count", count);
+ snprintf (key, sizeof (key), "%s%d.replica_count", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->replica_count);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.dist_count", count);
+ snprintf (key, sizeof (key), "%s%d.dist_count", prefix, count);
ret = dict_set_int32 (dict, key, volinfo->dist_leaf_count);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.ckusm", count);
+ snprintf (key, sizeof (key), "%s%d.ckusm", prefix, count);
ret = dict_set_int64 (dict, key, volinfo->cksum);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.transport_type", count);
+ snprintf (key, sizeof (key), "%s%d.transport_type", prefix, count);
ret = dict_set_uint32 (dict, key, volinfo->transport_type);
if (ret)
goto out;
- volume_id_str = gf_strdup (uuid_utoa (volinfo->volume_id));
- if (!volume_id_str)
+ snprintf (key, sizeof (key), "%s%d.is_snap_volume", prefix, count);
+ ret = dict_set_uint32 (dict, key, volinfo->is_snap_volume);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Unable to set %s", key);
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "%s%d.snap-max-hard-limit", prefix, count);
+ ret = dict_set_uint64 (dict, key, volinfo->snap_max_hard_limit);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Unable to set %s", key);
goto out;
+ }
+ volume_id_str = gf_strdup (uuid_utoa (volinfo->volume_id));
+ if (!volume_id_str) {
+ ret = -1;
+ goto out;
+ }
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.volume_id", count);
+ snprintf (key, sizeof (key), "%s%d.volume_id", prefix, count);
ret = dict_set_dynstr (dict, key, volume_id_str);
if (ret)
goto out;
+ volume_id_str = NULL;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.username", count);
+ snprintf (key, sizeof (key), "%s%d.username", prefix, count);
str = glusterd_auth_get_username (volinfo);
if (str) {
ret = dict_set_dynstr (dict, key, gf_strdup (str));
@@ -1800,7 +2391,7 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.password", count);
+ snprintf (key, sizeof (key), "%s%d.password", prefix, count);
str = glusterd_auth_get_password (volinfo);
if (str) {
ret = dict_set_dynstr (dict, key, gf_strdup (str));
@@ -1809,134 +2400,586 @@ glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo,
}
memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d."GLUSTERD_STORE_KEY_RB_STATUS, count);
- ret = dict_set_int32 (dict, key, volinfo->rb_status);
+ snprintf (key, 256, "%s%d.rebalance", prefix, count);
+ ret = dict_set_int32 (dict, key, volinfo->rebal.defrag_cmd);
+ if (ret)
+ goto out;
+
+ rebalance_id_str = gf_strdup (uuid_utoa
+ (volinfo->rebal.rebalance_id));
+ if (!rebalance_id_str) {
+ ret = -1;
+ goto out;
+ }
+ memset (key, 0, sizeof (key));
+ snprintf (key, 256, "%s%d.rebalance-id", prefix, count);
+ ret = dict_set_dynstr (dict, key, rebalance_id_str);
+ if (ret)
+ goto out;
+ rebalance_id_str = NULL;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.rebalance-op", prefix, count);
+ ret = dict_set_uint32 (dict, key, volinfo->rebal.op);
if (ret)
goto out;
+ if (volinfo->rebal.dict) {
+ snprintf (pfx, sizeof (pfx), "%s%d", prefix, count);
+ ctx.dict = dict;
+ ctx.prefix = pfx;
+ ctx.opt_count = 1;
+ ctx.key_name = "rebal-dict-key";
+ ctx.val_name = "rebal-dict-value";
+
+ dict_foreach (volinfo->rebal.dict, _add_dict_to_prdict, &ctx);
+ ctx.opt_count--;
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.rebal-dict-count", count);
+ ret = dict_set_int32 (dict, key, ctx.opt_count);
+ if (ret)
+ goto out;
+ }
+
memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d.rebalance", count);
- ret = dict_set_int32 (dict, key, volinfo->defrag_cmd);
+ snprintf (key, 256, "%s%d."GLUSTERD_STORE_KEY_RB_STATUS, prefix, count);
+ ret = dict_set_int32 (dict, key, volinfo->rep_brick.rb_status);
if (ret)
goto out;
- if (volinfo->rb_status > GF_RB_STATUS_NONE) {
+
+ if (volinfo->rep_brick.rb_status > GF_RB_STATUS_NONE) {
memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d."GLUSTERD_STORE_KEY_RB_SRC_BRICK,
- count);
+ snprintf (key, 256, "%s%d."GLUSTERD_STORE_KEY_RB_SRC_BRICK,
+ prefix, count);
gf_asprintf (&src_brick, "%s:%s",
- volinfo->src_brick->hostname,
- volinfo->src_brick->path);
+ volinfo->rep_brick.src_brick->hostname,
+ volinfo->rep_brick.src_brick->path);
ret = dict_set_dynstr (dict, key, src_brick);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d."GLUSTERD_STORE_KEY_RB_DST_BRICK,
- count);
+ snprintf (key, 256, "%s%d."GLUSTERD_STORE_KEY_RB_DST_BRICK,
+ prefix, count);
gf_asprintf (&dst_brick, "%s:%s",
- volinfo->dst_brick->hostname,
- volinfo->dst_brick->path);
+ volinfo->rep_brick.dst_brick->hostname,
+ volinfo->rep_brick.dst_brick->path);
ret = dict_set_dynstr (dict, key, dst_brick);
if (ret)
goto out;
+
+ rb_id_str = gf_strdup (uuid_utoa (volinfo->rep_brick.rb_id));
+ if (!rb_id_str) {
+ ret = -1;
+ goto out;
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.rb_id", prefix, count);
+ ret = dict_set_dynstr (dict, key, rb_id_str);
+ if (ret)
+ goto out;
+ rb_id_str = NULL;
}
+ snprintf (pfx, sizeof (pfx), "%s%d", prefix, count);
ctx.dict = dict;
- ctx.count = count;
+ ctx.prefix = pfx;
ctx.opt_count = 1;
ctx.key_name = "key";
ctx.val_name = "value";
GF_ASSERT (volinfo->dict);
- dict_foreach (volinfo->dict, _add_volinfo_dict_to_prdict, &ctx);
+ dict_foreach (volinfo->dict, _add_dict_to_prdict, &ctx);
ctx.opt_count--;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.opt-count", count);
+ snprintf (key, sizeof (key), "%s%d.opt-count", prefix, count);
ret = dict_set_int32 (dict, key, ctx.opt_count);
if (ret)
goto out;
ctx.dict = dict;
- ctx.count = count;
+ ctx.prefix = pfx;
ctx.opt_count = 1;
ctx.key_name = "slave-num";
ctx.val_name = "slave-val";
GF_ASSERT (volinfo->gsync_slaves);
- dict_foreach (volinfo->gsync_slaves, _add_volinfo_dict_to_prdict, &ctx);
+ dict_foreach (volinfo->gsync_slaves, _add_dict_to_prdict, &ctx);
ctx.opt_count--;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.gsync-count", count);
+ snprintf (key, sizeof (key), "%s%d.gsync-count", prefix, count);
ret = dict_set_int32 (dict, key, ctx.opt_count);
if (ret)
goto out;
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.brick%d.hostname",
- count, i);
+ snprintf (key, sizeof (key), "%s%d.brick%d.hostname",
+ prefix, count, i);
ret = dict_set_str (dict, key, brickinfo->hostname);
if (ret)
goto out;
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.brick%d.path",
- count, i);
+ snprintf (key, sizeof (key), "%s%d.brick%d.path",
+ prefix, count, i);
ret = dict_set_str (dict, key, brickinfo->path);
if (ret)
goto out;
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.brick%d.decommissioned",
+ prefix, count, i);
+ ret = dict_set_int32 (dict, key, brickinfo->decommissioned);
+ if (ret)
+ goto out;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.brick%d.brick_id",
+ prefix, count, i);
+ ret = dict_set_str (dict, key, brickinfo->brick_id);
+ if (ret)
+ goto out;
+
+ snprintf (key, sizeof (key), "%s%d.brick%d.snap_status",
+ prefix, count, i);
+ ret = dict_set_int32 (dict, key, brickinfo->snap_status);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set snap_status for %s:%s",
+ brickinfo->hostname,
+ brickinfo->path);
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "%s%d.brick%d.device_path",
+ prefix, count, i);
+ ret = dict_set_str (dict, key, brickinfo->device_path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set snap_device for %s:%s",
+ brickinfo->hostname,
+ brickinfo->path);
+ goto out;
+ }
+
i++;
}
+ /* Add volume op-versions to dict. This prevents volume inconsistencies
+ * in the cluster
+ */
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.op-version", prefix, count);
+ ret = dict_set_int32 (dict, key, volinfo->op_version);
+ if (ret)
+ goto out;
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.client-op-version", prefix, count);
+ ret = dict_set_int32 (dict, key, volinfo->client_op_version);
+ if (ret)
+ goto out;
+
+ /*Add volume Capability (BD Xlator) to dict*/
+ memset (key, 0 ,sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.caps", prefix, count);
+ ret = dict_set_int32 (dict, key, volinfo->caps);
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
+ GF_FREE (volume_id_str);
+ GF_FREE (rebalance_id_str);
+ GF_FREE (rb_id_str);
+
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
+ return ret;
+}
+
+/* The prefix represents the type of volume to be added.
+ * It will be "volume" for normal volumes, and snap# like
+ * snap1, snap2, for snapshot volumes
+ */
+int
+glusterd_vol_add_quota_conf_to_dict (glusterd_volinfo_t *volinfo, dict_t* load,
+ int vol_idx, char *prefix)
+{
+ int fd = -1;
+ char *gfid_str = NULL;
+ unsigned char buf[16] = {0};
+ char key[PATH_MAX] = {0};
+ int gfid_idx = 0;
+ int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (prefix);
+ ret = glusterd_store_create_quota_conf_sh_on_absence (volinfo);
+ if (ret)
+ goto out;
+
+ fd = open (volinfo->quota_conf_shandle->path, O_RDONLY);
+ if (fd == -1) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_store_quota_conf_skip_header (this, fd);
+ if (ret)
+ goto out;
+
+ for (gfid_idx=0; ; gfid_idx++) {
+
+ ret = read (fd, (void*)&buf, 16) ;
+ if (ret <= 0) {
+ //Finished reading all entries in the conf file
+ break;
+ }
+ if (ret != 16) {
+ //This should never happen. We must have a multiple of
+ //entry_sz bytes in our configuration file.
+ gf_log (this->name, GF_LOG_CRITICAL, "Quota "
+ "configuration store may be corrupt.");
+ goto out;
+ }
+
+ gfid_str = gf_strdup (uuid_utoa (buf));
+ if (!gfid_str) {
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (key, sizeof(key)-1, "%s%d.gfid%d", prefix,
+ vol_idx, gfid_idx);
+ key[sizeof(key)-1] = '\0';
+ ret = dict_set_dynstr (load, key, gfid_str);
+ if (ret) {
+ goto out;
+ }
+
+ gfid_str = NULL;
+ }
+
+ snprintf (key, sizeof(key)-1, "%s%d.gfid-count", prefix, vol_idx);
+ key[sizeof(key)-1] = '\0';
+ ret = dict_set_int32 (load, key, gfid_idx);
+ if (ret)
+ goto out;
+
+ snprintf (key, sizeof(key)-1, "%s%d.quota-cksum", prefix, vol_idx);
+ key[sizeof(key)-1] = '\0';
+ ret = dict_set_uint32 (load, key, volinfo->quota_conf_cksum);
+ if (ret)
+ goto out;
+
+ snprintf (key, sizeof(key)-1, "%s%d.quota-version", prefix, vol_idx);
+ key[sizeof(key)-1] = '\0';
+ ret = dict_set_uint32 (load, key, volinfo->quota_conf_version);
+ if (ret)
+ goto out;
+
+ ret = 0;
+out:
+ if (fd != -1)
+ close (fd);
+ GF_FREE (gfid_str);
return ret;
}
int32_t
-glusterd_build_volume_dict (dict_t **vols)
+glusterd_add_missed_snaps_to_export_dict (dict_t *peer_data)
+{
+ char name_buf[PATH_MAX] = "";
+ char value[PATH_MAX] = "";
+ int32_t missed_snap_count = 0;
+ int32_t ret = -1;
+ glusterd_conf_t *priv = NULL;
+ glusterd_missed_snap_info *missed_snapinfo = NULL;
+ glusterd_snap_op_t *snap_opinfo = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (peer_data);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ /* Add the missed_entries in the dict */
+ list_for_each_entry (missed_snapinfo, &priv->missed_snaps_list,
+ missed_snaps) {
+ list_for_each_entry (snap_opinfo,
+ &missed_snapinfo->snap_ops,
+ snap_ops_list) {
+ snprintf (name_buf, sizeof(name_buf),
+ "missed_snaps_%d", missed_snap_count);
+ snprintf (value, sizeof(value), "%s:%s=%s:%d:%s:%d:%d",
+ missed_snapinfo->node_uuid,
+ missed_snapinfo->snap_uuid,
+ snap_opinfo->snap_vol_id,
+ snap_opinfo->brick_num,
+ snap_opinfo->brick_path,
+ snap_opinfo->op,
+ snap_opinfo->status);
+
+ ret = dict_set_dynstr_with_alloc (peer_data, name_buf,
+ value);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set %s",
+ name_buf);
+ goto out;
+ }
+ missed_snap_count++;
+ }
+ }
+
+ ret = dict_set_int32 (peer_data, "missed_snap_count",
+ missed_snap_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set missed_snap_count");
+ goto out;
+ }
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_add_snap_to_dict (glusterd_snap_t *snap, dict_t *peer_data,
+ int32_t snap_count)
+{
+ char buf[NAME_MAX] = "";
+ char prefix[NAME_MAX] = "";
+ int32_t ret = -1;
+ int32_t volcount = 0;
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ gf_boolean_t host_bricks = _gf_false;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (snap);
+ GF_ASSERT (peer_data);
+
+ snprintf (prefix, sizeof(prefix), "snap%d", snap_count);
+
+ list_for_each_entry (volinfo, &snap->volumes, vol_list) {
+ volcount++;
+ ret = glusterd_add_volume_to_dict (volinfo, peer_data,
+ volcount, prefix);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add snap:%s volume:%s "
+ "to peer_data dict for handshake",
+ snap->snapname, volinfo->volname);
+ goto out;
+ }
+
+ ret = glusterd_vol_add_quota_conf_to_dict (volinfo, peer_data,
+ volcount, prefix);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add quota conf for "
+ "snap:%s volume:%s to peer_data "
+ "dict for handshake", snap->snapname,
+ volinfo->volname);
+ goto out;
+ }
+
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ if (!uuid_compare (brickinfo->uuid, MY_UUID)) {
+ host_bricks = _gf_true;
+ break;
+ }
+ }
+ }
+
+ snprintf (buf, sizeof(buf), "%s.host_bricks", prefix);
+ ret = dict_set_int8 (peer_data, buf, (int8_t) host_bricks);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set host_bricks for snap %s",
+ snap->snapname);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf), "%s.volcount", prefix);
+ ret = dict_set_int32 (peer_data, buf, volcount);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set volcount for snap %s",
+ snap->snapname);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf), "%s.snapname", prefix);
+ ret = dict_set_dynstr_with_alloc (peer_data, buf, snap->snapname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set snapname for snap %s",
+ snap->snapname);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf), "%s.snap_id", prefix);
+ ret = dict_set_dynstr_with_alloc (peer_data, buf,
+ uuid_utoa (snap->snap_id));
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set snap_id for snap %s",
+ snap->snapname);
+ goto out;
+ }
+
+ if (snap->description) {
+ snprintf (buf, sizeof(buf), "%s.snapid", prefix);
+ ret = dict_set_dynstr_with_alloc (peer_data, buf,
+ snap->description);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set description for snap %s",
+ snap->snapname);
+ goto out;
+ }
+ }
+
+ snprintf (buf, sizeof(buf), "%s.time_stamp", prefix);
+ ret = dict_set_int64 (peer_data, buf, (int64_t)snap->time_stamp);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set time_stamp for snap %s",
+ snap->snapname);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf), "%s.snap_restored", prefix);
+ ret = dict_set_int8 (peer_data, buf, snap->snap_restored);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set snap_restored for snap %s",
+ snap->snapname);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf), "%s.snap_status", prefix);
+ ret = dict_set_int32 (peer_data, buf, snap->snap_status);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set snap_status for snap %s",
+ snap->snapname);
+ goto out;
+ }
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_add_snapshots_to_export_dict (dict_t *peer_data)
+{
+ int32_t snap_count = 0;
+ int32_t ret = -1;
+ glusterd_conf_t *priv = NULL;
+ glusterd_snap_t *snap = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (peer_data);
+
+ list_for_each_entry (snap, &priv->snapshots, snap_list) {
+ snap_count++;
+ ret = glusterd_add_snap_to_dict (snap, peer_data, snap_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add snap(%s) to the "
+ " peer_data dict for handshake",
+ snap->snapname);
+ goto out;
+ }
+ }
+
+ ret = dict_set_int32 (peer_data, "snap_count", snap_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set snap_count");
+ goto out;
+ }
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_add_volumes_to_export_dict (dict_t **peer_data)
{
int32_t ret = -1;
dict_t *dict = NULL;
glusterd_conf_t *priv = NULL;
glusterd_volinfo_t *volinfo = NULL;
int32_t count = 0;
+ glusterd_dict_ctx_t ctx = {0};
+ xlator_t *this = NULL;
- priv = THIS->private;
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
dict = dict_new ();
-
if (!dict)
goto out;
list_for_each_entry (volinfo, &priv->volumes, vol_list) {
count++;
- ret = glusterd_add_volume_to_dict (volinfo, dict, count);
+ ret = glusterd_add_volume_to_dict (volinfo, dict, count,
+ "volume");
+ if (ret)
+ goto out;
+ if (!glusterd_is_volume_quota_enabled (volinfo))
+ continue;
+ ret = glusterd_vol_add_quota_conf_to_dict (volinfo, dict,
+ count, "volume");
if (ret)
goto out;
}
-
ret = dict_set_int32 (dict, "count", count);
if (ret)
goto out;
- *vols = dict;
+ ctx.dict = dict;
+ ctx.prefix = "global";
+ ctx.opt_count = 1;
+ ctx.key_name = "key";
+ ctx.val_name = "val";
+ dict_foreach (priv->opts, _add_dict_to_prdict, &ctx);
+ ctx.opt_count--;
+ ret = dict_set_int32 (dict, "global-opt-count", ctx.opt_count);
+ if (ret)
+ goto out;
+
+ *peer_data = dict;
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with %d", ret);
if (ret)
dict_unref (dict);
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
return ret;
}
int32_t
-glusterd_compare_friend_volume (dict_t *vols, int32_t count, int32_t *status)
+glusterd_compare_friend_volume (dict_t *peer_data, int32_t count,
+ int32_t *status, char *hostname)
{
int32_t ret = -1;
@@ -1944,13 +2987,19 @@ glusterd_compare_friend_volume (dict_t *vols, int32_t count, int32_t *status)
glusterd_volinfo_t *volinfo = NULL;
char *volname = NULL;
uint32_t cksum = 0;
+ uint32_t quota_cksum = 0;
+ uint32_t quota_version = 0;
int32_t version = 0;
+ xlator_t *this = NULL;
- GF_ASSERT (vols);
+ GF_ASSERT (peer_data);
GF_ASSERT (status);
+ this = THIS;
+ GF_ASSERT (this);
+
snprintf (key, sizeof (key), "volume%d.name", count);
- ret = dict_get_str (vols, key, &volname);
+ ret = dict_get_str (peer_data, key, &volname);
if (ret)
goto out;
@@ -1964,16 +3013,16 @@ glusterd_compare_friend_volume (dict_t *vols, int32_t count, int32_t *status)
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "volume%d.version", count);
- ret = dict_get_int32 (vols, key, &version);
+ ret = dict_get_int32 (peer_data, key, &version);
if (ret)
goto out;
if (version > volinfo->version) {
//Mismatch detected
ret = 0;
- gf_log ("", GF_LOG_ERROR, "Version of volume %s differ."
- "local version = %d, remote version = %d",
- volinfo->volname, volinfo->version, version);
+ gf_log (this->name, GF_LOG_ERROR, "Version of volume %s differ."
+ "local version = %d, remote version = %d on peer %s",
+ volinfo->volname, volinfo->version, version, hostname);
*status = GLUSTERD_VOL_COMP_UPDATE_REQ;
goto out;
} else if (version < volinfo->version) {
@@ -1985,30 +3034,79 @@ glusterd_compare_friend_volume (dict_t *vols, int32_t count, int32_t *status)
//
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "volume%d.ckusm", count);
- ret = dict_get_uint32 (vols, key, &cksum);
+ ret = dict_get_uint32 (peer_data, key, &cksum);
if (ret)
goto out;
if (cksum != volinfo->cksum) {
ret = 0;
- gf_log ("", GF_LOG_ERROR, "Cksums of volume %s differ."
- " local cksum = %d, remote cksum = %d",
- volinfo->volname, volinfo->cksum, cksum);
+ gf_log (this->name, GF_LOG_ERROR, "Cksums of volume %s differ."
+ " local cksum = %u, remote cksum = %u on peer %s",
+ volinfo->volname, volinfo->cksum, cksum, hostname);
*status = GLUSTERD_VOL_COMP_RJT;
goto out;
}
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.quota-version", count);
+ ret = dict_get_uint32 (peer_data, key, &quota_version);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "quota-version key absent for"
+ " volume %s in peer %s's response", volinfo->volname,
+ hostname);
+ ret = 0;
+ } else {
+ if (quota_version > volinfo->quota_conf_version) {
+ //Mismatch detected
+ ret = 0;
+ gf_log (this->name, GF_LOG_ERROR, "Quota configuration "
+ "versions of volume %s differ. "
+ "local version = %d, remote version = %d "
+ "on peer %s", volinfo->volname,
+ volinfo->quota_conf_version, quota_version,
+ hostname);
+ *status = GLUSTERD_VOL_COMP_UPDATE_REQ;
+ goto out;
+ } else if (quota_version < volinfo->quota_conf_version) {
+ *status = GLUSTERD_VOL_COMP_SCS;
+ goto out;
+ }
+ }
+
+ //Now, versions are same, compare cksums.
+ //
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.quota-cksum", count);
+ ret = dict_get_uint32 (peer_data, key, &quota_cksum);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "quota checksum absent for "
+ "volume %s in peer %s's response", volinfo->volname,
+ hostname);
+ ret = 0;
+ } else {
+ if (quota_cksum != volinfo->quota_conf_cksum) {
+ ret = 0;
+ gf_log (this->name, GF_LOG_ERROR, "Cksums of quota "
+ "configurations of volume %s differ. "
+ "local cksum = %u, remote cksum = %u on "
+ "peer %s", volinfo->volname,
+ volinfo->quota_conf_cksum, quota_cksum,
+ hostname);
+ *status = GLUSTERD_VOL_COMP_RJT;
+ goto out;
+ }
+ }
*status = GLUSTERD_VOL_COMP_SCS;
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with ret: %d, status: %d",
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with ret: %d, status: %d",
ret, *status);
return ret;
}
static int32_t
-import_prdict_volinfo_dict (dict_t *vols, dict_t *dst_dict, char *key_prefix,
- char *value_prefix, int opt_count, int count)
+import_prdict_dict (dict_t *peer_data, dict_t *dst_dict, char *key_prefix,
+ char *value_prefix, int opt_count, char *prefix)
{
char key[512] = {0,};
int32_t ret = 0;
@@ -2020,9 +3118,9 @@ import_prdict_volinfo_dict (dict_t *vols, dict_t *dst_dict, char *key_prefix,
while (i <= opt_count) {
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.%s%d",
- count, key_prefix, i);
- ret = dict_get_str (vols, key, &opt_key);
+ snprintf (key, sizeof (key), "%s.%s%d",
+ prefix, key_prefix, i);
+ ret = dict_get_str (peer_data, key, &opt_key);
if (ret) {
snprintf (msg, sizeof (msg), "Volume dict key not "
"specified");
@@ -2030,9 +3128,9 @@ import_prdict_volinfo_dict (dict_t *vols, dict_t *dst_dict, char *key_prefix,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.%s%d",
- count, value_prefix, i);
- ret = dict_get_str (vols, key, &opt_val);
+ snprintf (key, sizeof (key), "%s.%s%d",
+ prefix, value_prefix, i);
+ ret = dict_get_str (peer_data, key, &opt_val);
if (ret) {
snprintf (msg, sizeof (msg), "Volume dict value not "
"specified");
@@ -2060,26 +3158,290 @@ out:
}
+gf_boolean_t
+glusterd_is_quorum_option (char *option)
+{
+ gf_boolean_t res = _gf_false;
+ int i = 0;
+ char *keys[] = {GLUSTERD_QUORUM_TYPE_KEY,
+ GLUSTERD_QUORUM_RATIO_KEY, NULL};
+
+ for (i = 0; keys[i]; i++) {
+ if (strcmp (option, keys[i]) == 0) {
+ res = _gf_true;
+ break;
+ }
+ }
+ return res;
+}
+
+gf_boolean_t
+glusterd_is_quorum_changed (dict_t *options, char *option, char *value)
+{
+ int ret = 0;
+ gf_boolean_t reconfigured = _gf_false;
+ gf_boolean_t all = _gf_false;
+ char *oldquorum = NULL;
+ char *newquorum = NULL;
+ char *oldratio = NULL;
+ char *newratio = NULL;
+
+ if ((strcmp ("all", option) != 0) &&
+ !glusterd_is_quorum_option (option))
+ goto out;
+
+ if (strcmp ("all", option) == 0)
+ all = _gf_true;
+
+ if (all || (strcmp (GLUSTERD_QUORUM_TYPE_KEY, option) == 0)) {
+ newquorum = value;
+ ret = dict_get_str (options, GLUSTERD_QUORUM_TYPE_KEY,
+ &oldquorum);
+ }
+
+ if (all || (strcmp (GLUSTERD_QUORUM_RATIO_KEY, option) == 0)) {
+ newratio = value;
+ ret = dict_get_str (options, GLUSTERD_QUORUM_RATIO_KEY,
+ &oldratio);
+ }
+
+ reconfigured = _gf_true;
+
+ if (oldquorum && newquorum && (strcmp (oldquorum, newquorum) == 0))
+ reconfigured = _gf_false;
+ if (oldratio && newratio && (strcmp (oldratio, newratio) == 0))
+ reconfigured = _gf_false;
+
+ if ((oldratio == NULL) && (newratio == NULL) && (oldquorum == NULL) &&
+ (newquorum == NULL))
+ reconfigured = _gf_false;
+out:
+ return reconfigured;
+}
+
+static inline gf_boolean_t
+_is_contributing_to_quorum (gd_quorum_contrib_t contrib)
+{
+ if ((contrib == QUORUM_UP) || (contrib == QUORUM_DOWN))
+ return _gf_true;
+ return _gf_false;
+}
+
+static inline gf_boolean_t
+_does_quorum_meet (int active_count, int quorum_count)
+{
+ return (active_count >= quorum_count);
+}
+
+int
+glusterd_get_quorum_cluster_counts (xlator_t *this, int *active_count,
+ int *quorum_count)
+{
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_conf_t *conf = NULL;
+ int ret = -1;
+ int inquorum_count = 0;
+ char *val = NULL;
+ double quorum_percentage = 0.0;
+ gf_boolean_t ratio = _gf_false;
+ int count = 0;
+
+ conf = this->private;
+ //Start with counting self
+ inquorum_count = 1;
+ if (active_count)
+ *active_count = 1;
+ list_for_each_entry (peerinfo, &conf->peers, uuid_list) {
+ if (peerinfo->quorum_contrib == QUORUM_WAITING)
+ goto out;
+
+ if (_is_contributing_to_quorum (peerinfo->quorum_contrib))
+ inquorum_count = inquorum_count + 1;
+
+ if (active_count && (peerinfo->quorum_contrib == QUORUM_UP))
+ *active_count = *active_count + 1;
+ }
+
+ ret = dict_get_str (conf->opts, GLUSTERD_QUORUM_RATIO_KEY, &val);
+ if (ret == 0) {
+ ratio = _gf_true;
+ ret = gf_string2percent (val, &quorum_percentage);
+ if (!ret)
+ ratio = _gf_true;
+ }
+ if (ratio)
+ count = CEILING_POS (inquorum_count *
+ quorum_percentage / 100.0);
+ else
+ count = (inquorum_count * 50 / 100) + 1;
+
+ *quorum_count = count;
+ ret = 0;
+out:
+ return ret;
+}
+
+gf_boolean_t
+glusterd_is_volume_in_server_quorum (glusterd_volinfo_t *volinfo)
+{
+ gf_boolean_t res = _gf_false;
+ char *quorum_type = NULL;
+ int ret = 0;
+
+ ret = dict_get_str (volinfo->dict, GLUSTERD_QUORUM_TYPE_KEY,
+ &quorum_type);
+ if (ret)
+ goto out;
+
+ if (strcmp (quorum_type, GLUSTERD_SERVER_QUORUM) == 0)
+ res = _gf_true;
+out:
+ return res;
+}
+
+gf_boolean_t
+glusterd_is_any_volume_in_server_quorum (xlator_t *this)
+{
+ glusterd_conf_t *conf = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+
+ conf = this->private;
+ list_for_each_entry (volinfo, &conf->volumes, vol_list) {
+ if (glusterd_is_volume_in_server_quorum (volinfo)) {
+ return _gf_true;
+ }
+ }
+ return _gf_false;
+}
+
+gf_boolean_t
+does_gd_meet_server_quorum (xlator_t *this)
+{
+ int quorum_count = 0;
+ int active_count = 0;
+ gf_boolean_t in = _gf_false;
+ glusterd_conf_t *conf = NULL;
+ int ret = -1;
+
+ conf = this->private;
+ ret = glusterd_get_quorum_cluster_counts (this, &active_count,
+ &quorum_count);
+ if (ret)
+ goto out;
+
+ if (!_does_quorum_meet (active_count, quorum_count)) {
+ goto out;
+ }
+
+ in = _gf_true;
+out:
+ return in;
+}
+
+int
+glusterd_spawn_daemons (void *opaque)
+{
+ glusterd_conf_t *conf = THIS->private;
+ gf_boolean_t start_bricks = !conf->restart_done;
+
+ if (start_bricks) {
+ glusterd_restart_bricks (conf);
+ conf->restart_done = _gf_true;
+ }
+ glusterd_restart_gsyncds (conf);
+ glusterd_restart_rebalance (conf);
+ return 0;
+}
+
+void
+glusterd_do_volume_quorum_action (xlator_t *this, glusterd_volinfo_t *volinfo,
+ gf_boolean_t meets_quorum)
+{
+ glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (volinfo->status != GLUSTERD_STATUS_STARTED)
+ goto out;
+
+ if (!glusterd_is_volume_in_server_quorum (volinfo))
+ meets_quorum = _gf_true;
+
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ if (!glusterd_is_local_brick (this, volinfo, brickinfo))
+ continue;
+ if (meets_quorum)
+ glusterd_brick_start (volinfo, brickinfo, _gf_false);
+ else
+ glusterd_brick_stop (volinfo, brickinfo, _gf_false);
+ }
+out:
+ return;
+}
+
+int
+glusterd_do_quorum_action ()
+{
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ int ret = 0;
+ int active_count = 0;
+ int quorum_count = 0;
+ gf_boolean_t meets = _gf_false;
+
+ this = THIS;
+ conf = this->private;
+
+ conf->pending_quorum_action = _gf_true;
+ ret = glusterd_lock (conf->uuid);
+ if (ret)
+ goto out;
+
+ {
+ ret = glusterd_get_quorum_cluster_counts (this, &active_count,
+ &quorum_count);
+ if (ret)
+ goto unlock;
+
+ if (_does_quorum_meet (active_count, quorum_count))
+ meets = _gf_true;
+ list_for_each_entry (volinfo, &conf->volumes, vol_list) {
+ glusterd_do_volume_quorum_action (this, volinfo, meets);
+ }
+ }
+unlock:
+ (void)glusterd_unlock (conf->uuid);
+ conf->pending_quorum_action = _gf_false;
+out:
+ return ret;
+}
+
int32_t
-glusterd_import_friend_volume_opts (dict_t *vols, int count,
+glusterd_import_friend_volume_opts (dict_t *peer_data, int count,
glusterd_volinfo_t *volinfo)
{
char key[512] = {0,};
int32_t ret = -1;
int opt_count = 0;
char msg[2048] = {0};
+ char volume_prefix[1024] = {0};
+
+ GF_ASSERT (peer_data);
+ GF_ASSERT (volinfo);
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "volume%d.opt-count", count);
- ret = dict_get_int32 (vols, key, &opt_count);
+ ret = dict_get_int32 (peer_data, key, &opt_count);
if (ret) {
snprintf (msg, sizeof (msg), "Volume option count not "
"specified for %s", volinfo->volname);
goto out;
}
- ret = import_prdict_volinfo_dict (vols, volinfo->dict, "key",
- "value", opt_count, count);
+ snprintf (volume_prefix, sizeof (volume_prefix), "volume%d", count);
+ ret = import_prdict_dict (peer_data, volinfo->dict, "key", "value",
+ opt_count, volume_prefix);
if (ret) {
snprintf (msg, sizeof (msg), "Unable to import options dict "
"specified for %s", volinfo->volname);
@@ -2088,16 +3450,15 @@ glusterd_import_friend_volume_opts (dict_t *vols, int count,
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "volume%d.gsync-count", count);
- ret = dict_get_int32 (vols, key, &opt_count);
+ ret = dict_get_int32 (peer_data, key, &opt_count);
if (ret) {
snprintf (msg, sizeof (msg), "Gsync count not "
"specified for %s", volinfo->volname);
goto out;
}
- ret = import_prdict_volinfo_dict (vols, volinfo->gsync_slaves,
- "slave-num", "slave-val", opt_count,
- count);
+ ret = import_prdict_dict (peer_data, volinfo->gsync_slaves, "slave-num",
+ "slave-val", opt_count, volume_prefix);
if (ret) {
snprintf (msg, sizeof (msg), "Unable to import gsync sessions "
"specified for %s", volinfo->volname);
@@ -2111,35 +3472,75 @@ out:
return ret;
}
+/* The prefix represents the type of volume to be added.
+ * It will be "volume" for normal volumes, and snap# like
+ * snap1, snap2, for snapshot volumes
+ */
int32_t
-glusterd_import_new_brick (dict_t *vols, int32_t vol_count,
+glusterd_import_new_brick (dict_t *peer_data, int32_t vol_count,
int32_t brick_count,
- glusterd_brickinfo_t **brickinfo)
+ glusterd_brickinfo_t **brickinfo,
+ char *prefix)
{
char key[512] = {0,};
int ret = -1;
+ int32_t snap_status = 0;
+ char *snap_device = NULL;
char *hostname = NULL;
char *path = NULL;
+ char *brick_id = NULL;
+ int decommissioned = 0;
glusterd_brickinfo_t *new_brickinfo = NULL;
char msg[2048] = {0};
- GF_ASSERT (vols);
+ GF_ASSERT (peer_data);
GF_ASSERT (vol_count >= 0);
GF_ASSERT (brickinfo);
+ GF_ASSERT (prefix);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.brick%d.hostname",
+ prefix, vol_count, brick_count);
+ ret = dict_get_str (peer_data, key, &hostname);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "%s missing in payload", key);
+ goto out;
+ }
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.brick%d.hostname",
- vol_count, brick_count);
- ret = dict_get_str (vols, key, &hostname);
+ snprintf (key, sizeof (key), "%s%d.brick%d.path",
+ prefix, vol_count, brick_count);
+ ret = dict_get_str (peer_data, key, &path);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload", key);
goto out;
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.brick%d.path",
- vol_count, brick_count);
- ret = dict_get_str (vols, key, &path);
+ snprintf (key, sizeof (key), "%s%d.brick%d.brick_id",
+ prefix, vol_count, brick_count);
+ ret = dict_get_str (peer_data, key, &brick_id);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.brick%d.decommissioned",
+ prefix, vol_count, brick_count);
+ ret = dict_get_int32 (peer_data, key, &decommissioned);
+ if (ret) {
+ /* For backward compatibility */
+ ret = 0;
+ }
+
+ snprintf (key, sizeof (key), "%s%d.brick%d.snap_status",
+ prefix, vol_count, brick_count);
+ ret = dict_get_int32 (peer_data, key, &snap_status);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "%s missing in payload", key);
+ goto out;
+ }
+
+ snprintf (key, sizeof (key), "%s%d.brick%d.device_path",
+ prefix, vol_count, brick_count);
+ ret = dict_get_str (peer_data, key, &snap_device);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload", key);
goto out;
@@ -2151,6 +3552,11 @@ glusterd_import_new_brick (dict_t *vols, int32_t vol_count,
strcpy (new_brickinfo->path, path);
strcpy (new_brickinfo->hostname, hostname);
+ strcpy (new_brickinfo->device_path, snap_device);
+ new_brickinfo->snap_status = snap_status;
+ new_brickinfo->decommissioned = decommissioned;
+ if (brick_id)
+ strcpy (new_brickinfo->brick_id, brick_id);
//peerinfo might not be added yet
(void) glusterd_resolve_brick (new_brickinfo);
ret = 0;
@@ -2162,23 +3568,36 @@ out:
return ret;
}
+/* The prefix represents the type of volume to be added.
+ * It will be "volume" for normal volumes, and snap# like
+ * snap1, snap2, for snapshot volumes
+ */
int32_t
-glusterd_import_bricks (dict_t *vols, int32_t vol_count,
- glusterd_volinfo_t *new_volinfo)
+glusterd_import_bricks (dict_t *peer_data, int32_t vol_count,
+ glusterd_volinfo_t *new_volinfo, char *prefix)
{
int ret = -1;
int brick_count = 1;
+ int brickid = 0;
glusterd_brickinfo_t *new_brickinfo = NULL;
- GF_ASSERT (vols);
+ GF_ASSERT (peer_data);
GF_ASSERT (vol_count >= 0);
GF_ASSERT (new_volinfo);
+ GF_ASSERT (prefix);
while (brick_count <= new_volinfo->brick_count) {
- ret = glusterd_import_new_brick (vols, vol_count, brick_count,
- &new_brickinfo);
+ ret = glusterd_import_new_brick (peer_data, vol_count,
+ brick_count,
+ &new_brickinfo, prefix);
if (ret)
goto out;
+ if (new_brickinfo->brick_id[0] == '\0')
+ /*We were probed from a peer having op-version
+ less than GD_OP_VER_PERSISTENT_AFR_XATTRS*/
+ GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO (new_brickinfo,
+ new_volinfo,
+ brickid++);
list_add_tail (&new_brickinfo->brick_list, &new_volinfo->bricks);
brick_count++;
}
@@ -2188,49 +3607,228 @@ out:
return ret;
}
+/* The prefix represents the type of volume to be added.
+ * It will be "volume" for normal volumes, and snap# like
+ * snap1, snap2, for snapshot volumes
+ */
+static int
+glusterd_import_quota_conf (dict_t *peer_data, int vol_idx,
+ glusterd_volinfo_t *new_volinfo,
+ char *prefix)
+{
+ int gfid_idx = 0;
+ int gfid_count = 0;
+ int ret = -1;
+ int fd = -1;
+ char key[PATH_MAX] = {0};
+ char *gfid_str = NULL;
+ uuid_t gfid = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (peer_data);
+ GF_ASSERT (prefix);
+
+ if (!glusterd_is_volume_quota_enabled (new_volinfo)) {
+ (void) glusterd_clean_up_quota_store (new_volinfo);
+ return 0;
+ }
+
+ ret = glusterd_store_create_quota_conf_sh_on_absence (new_volinfo);
+ if (ret)
+ goto out;
+
+ fd = gf_store_mkstemp (new_volinfo->quota_conf_shandle);
+ if (fd < 0) {
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (key, sizeof (key)-1, "%s%d.quota-cksum", prefix, vol_idx);
+ key[sizeof(key)-1] = '\0';
+ ret = dict_get_uint32 (peer_data, key, &new_volinfo->quota_conf_cksum);
+ if (ret)
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to get quota cksum");
+
+ snprintf (key, sizeof (key)-1, "%s%d.quota-version", prefix, vol_idx);
+ key[sizeof(key)-1] = '\0';
+ ret = dict_get_uint32 (peer_data, key,
+ &new_volinfo->quota_conf_version);
+ if (ret)
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to get quota "
+ "version");
+
+ snprintf (key, sizeof (key)-1, "%s%d.gfid-count", prefix, vol_idx);
+ key[sizeof(key)-1] = '\0';
+ ret = dict_get_int32 (peer_data, key, &gfid_count);
+ if (ret)
+ goto out;
+
+ ret = glusterd_store_quota_conf_stamp_header (this, fd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to add header to tmp "
+ "file");
+ goto out;
+ }
+
+ gfid_idx = 0;
+ for (gfid_idx = 0; gfid_idx < gfid_count; gfid_idx++) {
+
+ snprintf (key, sizeof (key)-1, "%s%d.gfid%d",
+ prefix, vol_idx, gfid_idx);
+ key[sizeof(key)-1] = '\0';
+ ret = dict_get_str (peer_data, key, &gfid_str);
+ if (ret)
+ goto out;
+
+ uuid_parse (gfid_str, gfid);
+ ret = write (fd, (void*)gfid, 16);
+ if (ret != 16) {
+ gf_log (this->name, GF_LOG_CRITICAL, "Unable to write "
+ "gfid %s into quota.conf for %s", gfid_str,
+ new_volinfo->volname);
+ ret = -1;
+ goto out;
+ }
+
+ }
+
+ ret = gf_store_rename_tmppath (new_volinfo->quota_conf_shandle);
+
+ ret = 0;
+
+out:
+ if (fd != -1)
+ close (fd);
+
+ if (!ret) {
+ ret = glusterd_compute_cksum (new_volinfo, _gf_true);
+ if (ret)
+ goto out;
+
+ ret = glusterd_store_save_quota_version_and_cksum (new_volinfo);
+ if (ret)
+ goto out;
+ }
+
+ if (ret && (fd > 0)) {
+ gf_store_unlink_tmppath (new_volinfo->quota_conf_shandle);
+ (void) gf_store_handle_destroy
+ (new_volinfo->quota_conf_shandle);
+ new_volinfo->quota_conf_shandle = NULL;
+ }
+
+ return ret;
+}
+
+int
+gd_import_friend_volume_rebal_dict (dict_t *dict, int count,
+ glusterd_volinfo_t *volinfo)
+{
+ int ret = -1;
+ char key[256] = {0,};
+ int dict_count = 0;
+ char prefix[64] = {0};
+
+ GF_ASSERT (dict);
+ GF_ASSERT (volinfo);
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "volume%d.rebal-dict-count", count);
+ ret = dict_get_int32 (dict, key, &dict_count);
+ if (ret) {
+ /* Older peers will not have this dict */
+ ret = 0;
+ goto out;
+ }
+
+ volinfo->rebal.dict = dict_new ();
+ if(!volinfo->rebal.dict) {
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (prefix, sizeof (prefix), "volume%d", count);
+ ret = import_prdict_dict (dict, volinfo->rebal.dict, "rebal-dict-key",
+ "rebal-dict-value", dict_count, prefix);
+out:
+ if (ret && volinfo->rebal.dict)
+ dict_unref (volinfo->rebal.dict);
+ gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
+ return ret;
+}
+
+/* The prefix represents the type of volume to be added.
+ * It will be "volume" for normal volumes, and snap# like
+ * snap1, snap2, for snapshot volumes
+ */
int32_t
-glusterd_import_volinfo (dict_t *vols, int count,
- glusterd_volinfo_t **volinfo)
+glusterd_import_volinfo (dict_t *peer_data, int count,
+ glusterd_volinfo_t **volinfo,
+ char *prefix)
{
int ret = -1;
char key[256] = {0};
+ char *parent_volname = NULL;
char *volname = NULL;
glusterd_volinfo_t *new_volinfo = NULL;
char *volume_id_str = NULL;
+ char *restored_snap = NULL;
char msg[2048] = {0};
char *src_brick = NULL;
char *dst_brick = NULL;
char *str = NULL;
int rb_status = 0;
+ char *rebalance_id_str = NULL;
+ char *rb_id_str = NULL;
+ int op_version = 0;
+ int client_op_version = 0;
+ uint32_t is_snap_volume = 0;
- GF_ASSERT (vols);
+ GF_ASSERT (peer_data);
GF_ASSERT (volinfo);
+ GF_ASSERT (prefix);
- snprintf (key, sizeof (key), "volume%d.name", count);
- ret = dict_get_str (vols, key, &volname);
+ snprintf (key, sizeof (key), "%s%d.name", prefix, count);
+ ret = dict_get_str (peer_data, key, &volname);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload", key);
goto out;
}
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.is_snap_volume", prefix, count);
+ ret = dict_get_uint32 (peer_data, key, &is_snap_volume);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "%s missing in payload for %s",
+ key, volname);
+ goto out;
+ }
+
ret = glusterd_volinfo_new (&new_volinfo);
if (ret)
goto out;
strncpy (new_volinfo->volname, volname, sizeof (new_volinfo->volname));
-
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.type", count);
- ret = dict_get_int32 (vols, key, &new_volinfo->type);
+ snprintf (key, sizeof (key), "%s%d.type", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->type);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
goto out;
}
+ snprintf (key, sizeof (key), "%s%d.parent_volname", prefix, count);
+ ret = dict_get_str (peer_data, key, &parent_volname);
+ if (!ret)
+ strncpy (new_volinfo->parent_volname, parent_volname,
+ sizeof(new_volinfo->parent_volname));
+
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.brick_count", count);
- ret = dict_get_int32 (vols, key, &new_volinfo->brick_count);
+ snprintf (key, sizeof (key), "%s%d.brick_count", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->brick_count);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
@@ -2238,8 +3836,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.version", count);
- ret = dict_get_int32 (vols, key, &new_volinfo->version);
+ snprintf (key, sizeof (key), "%s%d.version", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->version);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
@@ -2247,8 +3845,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.status", count);
- ret = dict_get_int32 (vols, key, (int32_t *)&new_volinfo->status);
+ snprintf (key, sizeof (key), "%s%d.status", prefix, count);
+ ret = dict_get_int32 (peer_data, key, (int32_t *)&new_volinfo->status);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
@@ -2256,8 +3854,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.sub_count", count);
- ret = dict_get_int32 (vols, key, &new_volinfo->sub_count);
+ snprintf (key, sizeof (key), "%s%d.sub_count", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->sub_count);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
@@ -2267,8 +3865,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
/* not having a 'stripe_count' key is not a error
(as peer may be of old version) */
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.stripe_count", count);
- ret = dict_get_int32 (vols, key, &new_volinfo->stripe_count);
+ snprintf (key, sizeof (key), "%s%d.stripe_count", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->stripe_count);
if (ret)
gf_log (THIS->name, GF_LOG_INFO,
"peer is possibly old version");
@@ -2276,8 +3874,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
/* not having a 'replica_count' key is not a error
(as peer may be of old version) */
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.replica_count", count);
- ret = dict_get_int32 (vols, key, &new_volinfo->replica_count);
+ snprintf (key, sizeof (key), "%s%d.replica_count", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->replica_count);
if (ret)
gf_log (THIS->name, GF_LOG_INFO,
"peer is possibly old version");
@@ -2285,15 +3883,16 @@ glusterd_import_volinfo (dict_t *vols, int count,
/* not having a 'dist_count' key is not a error
(as peer may be of old version) */
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.dist_count", count);
- ret = dict_get_int32 (vols, key, &new_volinfo->dist_leaf_count);
+ snprintf (key, sizeof (key), "%s%d.dist_count", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->dist_leaf_count);
if (ret)
gf_log (THIS->name, GF_LOG_INFO,
"peer is possibly old version");
-
+ new_volinfo->subvol_count = new_volinfo->brick_count/
+ glusterd_get_dist_leaf_count (new_volinfo);
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.ckusm", count);
- ret = dict_get_uint32 (vols, key, &new_volinfo->cksum);
+ snprintf (key, sizeof (key), "%s%d.ckusm", prefix, count);
+ ret = dict_get_uint32 (peer_data, key, &new_volinfo->cksum);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
@@ -2301,17 +3900,19 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.volume_id", count);
- ret = dict_get_str (vols, key, &volume_id_str);
+ snprintf (key, sizeof (key), "%s%d.volume_id", prefix, count);
+ ret = dict_get_str (peer_data, key, &volume_id_str);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
goto out;
}
+ uuid_parse (volume_id_str, new_volinfo->volume_id);
+
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.username", count);
- ret = dict_get_str (vols, key, &str);
+ snprintf (key, sizeof (key), "%s%d.username", prefix, count);
+ ret = dict_get_str (peer_data, key, &str);
if (!ret) {
ret = glusterd_auth_set_username (new_volinfo, str);
if (ret)
@@ -2319,8 +3920,8 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.password", count);
- ret = dict_get_str (vols, key, &str);
+ snprintf (key, sizeof (key), "%s%d.password", prefix, count);
+ ret = dict_get_str (peer_data, key, &str);
if (!ret) {
ret = glusterd_auth_set_password (new_volinfo, str);
if (ret)
@@ -2328,8 +3929,29 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.transport_type", count);
- ret = dict_get_uint32 (vols, key, &new_volinfo->transport_type);
+ snprintf (key, sizeof (key), "%s%d.transport_type", prefix, count);
+ ret = dict_get_uint32 (peer_data, key, &new_volinfo->transport_type);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "%s missing in payload for %s",
+ key, volname);
+ goto out;
+ }
+
+ new_volinfo->is_snap_volume = is_snap_volume;
+
+ snprintf (key, sizeof (key), "%s%d.restored_from_snap", prefix, count);
+ ret = dict_get_str (peer_data, key, &restored_snap);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "%s missing in payload for %s",
+ key, volname);
+ goto out;
+ }
+
+ uuid_parse (restored_snap, new_volinfo->restored_from_snap);
+
+ snprintf (key, sizeof (key), "%s%d.snap-max-hard-limit", prefix, count);
+ ret = dict_get_uint64 (peer_data, key,
+ &new_volinfo->snap_max_hard_limit);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
@@ -2337,34 +3959,62 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "volume%d.rebalance", count);
- ret = dict_get_uint32 (vols, key, &new_volinfo->defrag_cmd);
+ snprintf (key, sizeof (key), "%s%d.rebalance", prefix, count);
+ ret = dict_get_uint32 (peer_data, key, &new_volinfo->rebal.defrag_cmd);
if (ret) {
snprintf (msg, sizeof (msg), "%s missing in payload for %s",
key, volname);
goto out;
}
- uuid_parse (volume_id_str, new_volinfo->volume_id);
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.rebalance-id", prefix, count);
+ ret = dict_get_str (peer_data, key, &rebalance_id_str);
+ if (ret) {
+ /* This is not present in older glusterfs versions,
+ * so don't error out
+ */
+ ret = 0;
+ } else {
+ uuid_parse (rebalance_id_str, new_volinfo->rebal.rebalance_id);
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.rebalance-op", prefix, count);
+ ret = dict_get_uint32 (peer_data, key,
+ (uint32_t *) &new_volinfo->rebal.op);
+ if (ret) {
+ /* This is not present in older glusterfs versions,
+ * so don't error out
+ */
+ ret = 0;
+ }
+ ret = gd_import_friend_volume_rebal_dict (peer_data, count,
+ new_volinfo);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "Failed to import rebalance dict "
+ "for volume.");
+ goto out;
+ }
memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d."GLUSTERD_STORE_KEY_RB_STATUS, count);
- ret = dict_get_int32 (vols, key, &rb_status);
+ snprintf (key, 256, "%s%d."GLUSTERD_STORE_KEY_RB_STATUS, prefix, count);
+ ret = dict_get_int32 (peer_data, key, &rb_status);
if (ret)
goto out;
- new_volinfo->rb_status = rb_status;
+ new_volinfo->rep_brick.rb_status = rb_status;
- if (new_volinfo->rb_status > GF_RB_STATUS_NONE) {
+ if (new_volinfo->rep_brick.rb_status > GF_RB_STATUS_NONE) {
memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d."GLUSTERD_STORE_KEY_RB_SRC_BRICK,
- count);
- ret = dict_get_str (vols, key, &src_brick);
+ snprintf (key, 256, "%s%d."GLUSTERD_STORE_KEY_RB_SRC_BRICK,
+ prefix, count);
+ ret = dict_get_str (peer_data, key, &src_brick);
if (ret)
goto out;
ret = glusterd_brickinfo_new_from_brick (src_brick,
- &new_volinfo->src_brick);
+ &new_volinfo->rep_brick.src_brick);
if (ret) {
gf_log ("", GF_LOG_ERROR, "Unable to create"
" src brickinfo");
@@ -2372,28 +4022,81 @@ glusterd_import_volinfo (dict_t *vols, int count,
}
memset (key, 0, sizeof (key));
- snprintf (key, 256, "volume%d."GLUSTERD_STORE_KEY_RB_DST_BRICK,
- count);
- ret = dict_get_str (vols, key, &dst_brick);
+ snprintf (key, 256, "%s%d."GLUSTERD_STORE_KEY_RB_DST_BRICK,
+ prefix, count);
+ ret = dict_get_str (peer_data, key, &dst_brick);
if (ret)
goto out;
ret = glusterd_brickinfo_new_from_brick (dst_brick,
- &new_volinfo->dst_brick);
+ &new_volinfo->rep_brick.dst_brick);
if (ret) {
gf_log ("", GF_LOG_ERROR, "Unable to create"
" dst brickinfo");
goto out;
}
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.rb_id", prefix, count);
+ ret = dict_get_str (peer_data, key, &rb_id_str);
+ if (ret) {
+ /* This is not present in older glusterfs versions,
+ * so don't error out
+ */
+ ret = 0;
+ } else {
+ uuid_parse (rb_id_str, new_volinfo->rep_brick.rb_id);
+ }
}
- ret = glusterd_import_friend_volume_opts (vols, count, new_volinfo);
+ ret = glusterd_import_friend_volume_opts (peer_data, count,
+ new_volinfo);
+ if (ret)
+ goto out;
+
+ /* Import the volume's op-versions if available else set it to 1.
+ * Not having op-versions implies this informtation was obtained from a
+ * op-version 1 friend (gluster-3.3), ergo the cluster is at op-version
+ * 1 and all volumes are at op-versions 1.
+ *
+ * Either both the volume op-versions should be absent or both should be
+ * present. Only one being present is a failure
+ */
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.op-version", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &op_version);
+ if (ret)
+ ret = 0;
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.client-op-version", prefix, count);
+ ret = dict_get_int32 (peer_data, key, &client_op_version);
if (ret)
+ ret = 0;
+
+ if (op_version && client_op_version) {
+ new_volinfo->op_version = op_version;
+ new_volinfo->client_op_version = client_op_version;
+ } else if (((op_version == 0) && (client_op_version != 0)) ||
+ ((op_version != 0) && (client_op_version == 0))) {
+ ret = -1;
+ gf_log ("glusterd", GF_LOG_ERROR,
+ "Only one volume op-version found");
goto out;
- ret = glusterd_import_bricks (vols, count, new_volinfo);
+ } else {
+ new_volinfo->op_version = 1;
+ new_volinfo->client_op_version = 1;
+ }
+
+ memset (key, 0 ,sizeof (key));
+ snprintf (key, sizeof (key), "%s%d.caps", prefix, count);
+ /*This is not present in older glusterfs versions, so ignore ret value*/
+ ret = dict_get_int32 (peer_data, key, &new_volinfo->caps);
+
+ ret = glusterd_import_bricks (peer_data, count, new_volinfo, prefix);
if (ret)
goto out;
+
*volinfo = new_volinfo;
out:
if (msg[0])
@@ -2467,7 +4170,11 @@ glusterd_volinfo_stop_stale_bricks (glusterd_volinfo_t *new_volinfo,
old_brickinfo->hostname,
old_brickinfo->path,
new_volinfo, &new_brickinfo);
- if (ret) {
+ /* If the brick is stale, i.e it's not a part of the new volume
+ * or if it's part of the new volume and is pending a snap,
+ * then stop the brick process
+ */
+ if (ret || (new_brickinfo->snap_status == -1)) {
/*TODO: may need to switch to 'atomic' flavour of
* brick_stop, once we make peer rpc program also
* synctask enabled*/
@@ -2489,9 +4196,34 @@ int32_t
glusterd_delete_stale_volume (glusterd_volinfo_t *stale_volinfo,
glusterd_volinfo_t *valid_volinfo)
{
+ int32_t ret = -1;
+ glusterd_volinfo_t *temp_volinfo = NULL;
+ glusterd_volinfo_t *voliter = NULL;
+ xlator_t *this = NULL;
+
GF_ASSERT (stale_volinfo);
GF_ASSERT (valid_volinfo);
+ /* Copy snap_volumes list from stale_volinfo to valid_volinfo */
+ valid_volinfo->snap_count = 0;
+ list_for_each_entry_safe (voliter, temp_volinfo,
+ &stale_volinfo->snap_volumes, snapvol_list) {
+ list_add_tail (&voliter->snapvol_list,
+ &valid_volinfo->snap_volumes);
+ valid_volinfo->snap_count++;
+ }
+
+ if ((!uuid_is_null (stale_volinfo->restored_from_snap)) &&
+ (uuid_compare (stale_volinfo->restored_from_snap,
+ valid_volinfo->restored_from_snap))) {
+ ret = glusterd_lvm_snapshot_remove (NULL, stale_volinfo);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "Failed to remove lvm snapshot for "
+ "restored volume %s", stale_volinfo->volname);
+ }
+ }
+
/* If stale volume is in started state, copy the port numbers of the
* local bricks if they exist in the valid volume information.
* stop stale bricks. Stale volume information is going to be deleted.
@@ -2522,15 +4254,63 @@ glusterd_delete_stale_volume (glusterd_volinfo_t *stale_volinfo,
(void) glusterd_delete_all_bricks (stale_volinfo);
if (stale_volinfo->shandle) {
unlink (stale_volinfo->shandle->path);
- (void) glusterd_store_handle_destroy (stale_volinfo->shandle);
+ (void) gf_store_handle_destroy (stale_volinfo->shandle);
stale_volinfo->shandle = NULL;
}
- (void) glusterd_volinfo_delete (stale_volinfo);
+ (void) glusterd_volinfo_remove (stale_volinfo);
return 0;
}
+/* This function updates the rebalance information of the new volinfo using the
+ * information from the old volinfo.
+ */
+int
+gd_check_and_update_rebalance_info (glusterd_volinfo_t *old_volinfo,
+ glusterd_volinfo_t *new_volinfo)
+{
+ int ret = -1;
+ glusterd_rebalance_t *old = NULL;
+ glusterd_rebalance_t *new = NULL;
+
+ GF_ASSERT (old_volinfo);
+ GF_ASSERT (new_volinfo);
+
+ old = &(old_volinfo->rebal);
+ new = &(new_volinfo->rebal);
+
+ //Disconnect from rebalance process
+ if (old->defrag && old->defrag->rpc) {
+ rpc_transport_disconnect (old->defrag->rpc->conn.trans);
+ }
+
+ if (!uuid_is_null (old->rebalance_id) &&
+ uuid_compare (old->rebalance_id, new->rebalance_id)) {
+ (void)gd_stop_rebalance_process (old_volinfo);
+ goto out;
+ }
+
+ /* If the tasks match, copy the status and other information of the
+ * rebalance process from old_volinfo to new_volinfo
+ */
+ new->defrag_status = old->defrag_status;
+ new->rebalance_files = old->rebalance_files;
+ new->rebalance_data = old->rebalance_data;
+ new->lookedup_files = old->lookedup_files;
+ new->skipped_files = old->skipped_files;
+ new->rebalance_failures = old->rebalance_failures;
+ new->rebalance_time = old->rebalance_time;
+ new->dict = (old->dict ? dict_ref (old->dict) : NULL);
+
+ /* glusterd_rebalance_t.{op, id, defrag_cmd} are copied during volume
+ * import
+ * a new defrag object should come to life with rebalance being restarted
+ */
+out:
+ return ret;
+}
+
int32_t
-glusterd_import_friend_volume (dict_t *vols, size_t count)
+glusterd_import_friend_volume (dict_t *peer_data, size_t count)
{
int32_t ret = -1;
@@ -2539,18 +4319,27 @@ glusterd_import_friend_volume (dict_t *vols, size_t count)
glusterd_volinfo_t *old_volinfo = NULL;
glusterd_volinfo_t *new_volinfo = NULL;
- GF_ASSERT (vols);
+ GF_ASSERT (peer_data);
this = THIS;
GF_ASSERT (this);
priv = this->private;
GF_ASSERT (priv);
- ret = glusterd_import_volinfo (vols, count, &new_volinfo);
+ ret = glusterd_import_volinfo (peer_data, count,
+ &new_volinfo, "volume");
if (ret)
goto out;
+ if (!new_volinfo) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Not importing snap volume");
+ goto out;
+ }
+
ret = glusterd_volinfo_find (new_volinfo->volname, &old_volinfo);
if (0 == ret) {
+ (void) gd_check_and_update_rebalance_info (old_volinfo,
+ new_volinfo);
(void) glusterd_delete_stale_volume (old_volinfo, new_volinfo);
}
@@ -2563,27 +4352,33 @@ glusterd_import_friend_volume (dict_t *vols, size_t count)
if (ret)
goto out;
- list_add_tail (&new_volinfo->vol_list, &priv->volumes);
+ ret = glusterd_import_quota_conf (peer_data, count,
+ new_volinfo, "volume");
+ if (ret)
+ goto out;
+
+ list_add_order (&new_volinfo->vol_list, &priv->volumes,
+ glusterd_compare_volume_name);
out:
gf_log ("", GF_LOG_DEBUG, "Returning with ret: %d", ret);
return ret;
}
int32_t
-glusterd_import_friend_volumes (dict_t *vols)
+glusterd_import_friend_volumes (dict_t *peer_data)
{
int32_t ret = -1;
int32_t count = 0;
int i = 1;
- GF_ASSERT (vols);
+ GF_ASSERT (peer_data);
- ret = dict_get_int32 (vols, "count", &count);
+ ret = dict_get_int32 (peer_data, "count", &count);
if (ret)
goto out;
while (i <= count) {
- ret = glusterd_import_friend_volume (vols, i);
+ ret = glusterd_import_friend_volume (peer_data, i);
if (ret)
goto out;
i++;
@@ -2594,25 +4389,931 @@ out:
return ret;
}
+int
+glusterd_get_global_opt_version (dict_t *opts, uint32_t *version)
+{
+ int ret = -1;
+ char *version_str = NULL;
+
+ ret = dict_get_str (opts, GLUSTERD_GLOBAL_OPT_VERSION, &version_str);
+ if (ret)
+ goto out;
+
+ ret = gf_string2uint (version_str, version);
+ if (ret)
+ goto out;
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+glusterd_get_next_global_opt_version_str (dict_t *opts, char **version_str)
+{
+ int ret = -1;
+ char version_string[64] = {0};
+ uint32_t version = 0;
+
+ ret = glusterd_get_global_opt_version (opts, &version);
+ if (ret)
+ goto out;
+ version++;
+ snprintf (version_string, sizeof (version_string), "%"PRIu32, version);
+ *version_str = gf_strdup (version_string);
+ if (*version_str)
+ ret = 0;
+out:
+ return ret;
+}
+
int32_t
-glusterd_compare_friend_data (dict_t *vols, int32_t *status)
+glusterd_import_global_opts (dict_t *friend_data)
{
- int32_t ret = -1;
- int32_t count = 0;
- int i = 1;
- gf_boolean_t update = _gf_false;
- gf_boolean_t stale_nfs = _gf_false;
- gf_boolean_t stale_shd = _gf_false;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ int ret = -1;
+ dict_t *import_options = NULL;
+ int count = 0;
+ uint32_t local_version = 0;
+ uint32_t remote_version = 0;
+
+ this = THIS;
+ conf = this->private;
+
+ ret = dict_get_int32 (friend_data, "global-opt-count", &count);
+ if (ret) {
+ //old version peer
+ ret = 0;
+ goto out;
+ }
+
+ import_options = dict_new ();
+ if (!import_options)
+ goto out;
+ ret = import_prdict_dict (friend_data, import_options, "key", "val",
+ count, "global");
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to import"
+ " global options");
+ goto out;
+ }
+
+ ret = glusterd_get_global_opt_version (conf->opts, &local_version);
+ if (ret)
+ goto out;
+ ret = glusterd_get_global_opt_version (import_options, &remote_version);
+ if (ret)
+ goto out;
+ if (remote_version > local_version) {
+ ret = glusterd_store_options (this, import_options);
+ if (ret)
+ goto out;
+ dict_unref (conf->opts);
+ conf->opts = dict_ref (import_options);
+ }
+ ret = 0;
+out:
+ if (import_options)
+ dict_unref (import_options);
+ return ret;
+}
+
+int32_t
+glusterd_perform_missed_op (glusterd_snap_t *snap, int32_t op)
+{
+ dict_t *dict = NULL;
+ int32_t ret = -1;
+ glusterd_conf_t *priv = NULL;
+ glusterd_volinfo_t *snap_volinfo = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ xlator_t *this = NULL;
+ uuid_t null_uuid = {0};
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (snap);
+
+ dict = dict_new();
+ if (!dict) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to create dict");
+ ret = -1;
+ goto out;
+ }
+
+ switch (op) {
+ case GF_SNAP_OPTION_TYPE_DELETE:
+ ret = glusterd_snap_remove (dict, snap, _gf_true, _gf_false);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to remove snap");
+ goto out;
+ }
+
+ break;
+ case GF_SNAP_OPTION_TYPE_RESTORE:
+ /* TODO : As of now there is only volume in snapshot.
+ * Change this when multiple volume snapshot is introduced
+ */
+ snap_volinfo = list_entry (snap->volumes.next,
+ glusterd_volinfo_t, vol_list);
+
+ /* Find the parent volinfo */
+ ret = glusterd_volinfo_find (snap_volinfo->parent_volname,
+ &volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not get volinfo of %s",
+ snap_volinfo->parent_volname);
+ goto out;
+ }
+
+ /* Bump down the original volinfo's version, coz it would have
+ * incremented already due to volume handshake
+ */
+ volinfo->version--;
+ uuid_copy (volinfo->restored_from_snap, null_uuid);
+
+ /* Perform the restore */
+ ret = gd_restore_snap_volume (dict, volinfo, snap_volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to restore "
+ "snap for %s", snap->snapname);
+ volinfo->version++;
+ goto out;
+ }
+
+ break;
+ default:
+ /* The entry must be a create, delete, or
+ * restore entry
+ */
+ gf_log (this->name, GF_LOG_ERROR, "Invalid missed snap entry");
+ ret = -1;
+ goto out;
+ }
+
+out:
+ dict_unref (dict);
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Perform missed deletes and restores on this node */
+int32_t
+glusterd_perform_missed_snap_ops ()
+{
+ int32_t ret = -1;
+ int32_t op_status = -1;
+ glusterd_conf_t *priv = NULL;
+ glusterd_missed_snap_info *missed_snapinfo = NULL;
+ glusterd_snap_op_t *snap_opinfo = NULL;
+ glusterd_snap_t *snap = NULL;
+ uuid_t snap_uuid = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ list_for_each_entry (missed_snapinfo, &priv->missed_snaps_list,
+ missed_snaps) {
+ /* If the pending snap_op is not for this node then continue */
+ if (strcmp (missed_snapinfo->node_uuid, uuid_utoa (MY_UUID)))
+ continue;
+
+ /* Find the snap id */
+ uuid_parse (missed_snapinfo->snap_uuid, snap_uuid);
+ snap = NULL;
+ snap = glusterd_find_snap_by_id (snap_uuid);
+ if (!snap) {
+ /* If the snap is not found, then a delete or a
+ * restore can't be pending on that snap_uuid.
+ */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Not a pending delete or restore op");
+ continue;
+ }
+
+ op_status = GD_MISSED_SNAP_PENDING;
+ list_for_each_entry (snap_opinfo, &missed_snapinfo->snap_ops,
+ snap_ops_list) {
+ /* If the snap_op is create or its status is
+ * GD_MISSED_SNAP_DONE then continue
+ */
+ if ((snap_opinfo->status == GD_MISSED_SNAP_DONE) ||
+ (snap_opinfo->op == GF_SNAP_OPTION_TYPE_CREATE))
+ continue;
+
+ /* Perform the actual op for the first time for
+ * this snap, and mark the snap_status as
+ * GD_MISSED_SNAP_DONE. For other entries for the same
+ * snap, just mark the entry as done.
+ */
+ if (op_status == GD_MISSED_SNAP_PENDING) {
+ ret = glusterd_perform_missed_op
+ (snap,
+ snap_opinfo->op);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to perform missed snap op");
+ goto out;
+ }
+ op_status = GD_MISSED_SNAP_DONE;
+ }
+
+ snap_opinfo->status = GD_MISSED_SNAP_DONE;
+ }
+ }
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Import friend volumes missed_snap_list and update *
+ * missed_snap_list if need be */
+int32_t
+glusterd_import_friend_missed_snap_list (dict_t *peer_data)
+{
+ int32_t missed_snap_count = -1;
+ int32_t ret = -1;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (peer_data);
+
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ /* Add the friends missed_snaps entries to the in-memory list */
+ ret = dict_get_int32 (peer_data, "missed_snap_count",
+ &missed_snap_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_INFO,
+ "No missed snaps");
+ ret = 0;
+ goto out;
+ }
+
+ ret = glusterd_add_missed_snaps_to_list (peer_data,
+ missed_snap_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to add missed snaps to list");
+ goto out;
+ }
+
+ ret = glusterd_perform_missed_snap_ops ();
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to perform snap operations");
+ /* Not going to out at this point coz some *
+ * missed ops might have been performed. We *
+ * need to persist the current list *
+ */
+ }
+
+ ret = glusterd_store_update_missed_snaps ();
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to update missed_snaps_list");
+ goto out;
+ }
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Check for the peer_snap_name in the list of existing snapshots.
+ * If a snap exists with the same name and a different snap_id, then
+ * there is a conflict. Set conflict as _gf_true, and snap to the
+ * conflicting snap object. If a snap exists with the same name, and the
+ * same snap_id, then there is no conflict. Set conflict as _gf_false
+ * and snap to the existing snap object. If no snap exists with the
+ * peer_snap_name, then there is no conflict. Set conflict as _gf_false
+ * and snap to NULL.
+ */
+void
+glusterd_is_peer_snap_conflicting (char *peer_snap_name, char *peer_snap_id,
+ gf_boolean_t *conflict,
+ glusterd_snap_t **snap, char *hostname)
+{
+ uuid_t peer_snap_uuid = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (peer_snap_name);
+ GF_ASSERT (peer_snap_id);
+ GF_ASSERT (conflict);
+ GF_ASSERT (snap);
+ GF_ASSERT (hostname);
+
+ *snap = glusterd_find_snap_by_name (peer_snap_name);
+ if (*snap) {
+ uuid_parse (peer_snap_id, peer_snap_uuid);
+ if (!uuid_compare (peer_snap_uuid, (*snap)->snap_id)) {
+ /* Current node contains the same snap having
+ * the same snapname and snap_id
+ */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Snapshot %s from peer %s present in "
+ "localhost", peer_snap_name, hostname);
+ *conflict = _gf_false;
+ } else {
+ /* Current node contains the same snap having
+ * the same snapname but different snap_id
+ */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Snapshot %s from peer %s conflicts with "
+ "snapshot in localhost", peer_snap_name,
+ hostname);
+ *conflict = _gf_true;
+ }
+ } else {
+ /* Peer contains snapshots missing on the current node */
+ gf_log (this->name, GF_LOG_INFO,
+ "Snapshot %s from peer %s missing on localhost",
+ peer_snap_name, hostname);
+ *conflict = _gf_false;
+ }
+}
+
+/* Check if the local node is hosting any bricks for the given snapshot */
+gf_boolean_t
+glusterd_are_snap_bricks_local (glusterd_snap_t *snap)
+{
+ gf_boolean_t is_local = _gf_false;
+ glusterd_volinfo_t *volinfo = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (snap);
+
+ list_for_each_entry (volinfo, &snap->volumes, vol_list) {
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ if (!uuid_compare (brickinfo->uuid, MY_UUID)) {
+ is_local = _gf_true;
+ goto out;
+ }
+ }
+ }
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", is_local);
+ return is_local;
+}
+
+/* Check if the peer has missed any snap delete for the given snap_id */
+gf_boolean_t
+glusterd_peer_has_missed_snap_delete (glusterd_peerinfo_t *peerinfo,
+ char *peer_snap_id)
+{
+ char *peer_uuid = NULL;
+ gf_boolean_t missed_delete = _gf_false;
+ glusterd_conf_t *priv = NULL;
+ glusterd_missed_snap_info *missed_snapinfo = NULL;
+ glusterd_snap_op_t *snap_opinfo = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (peerinfo);
+ GF_ASSERT (peer_snap_id);
+
+ peer_uuid = uuid_utoa (peerinfo->uuid);
+
+ list_for_each_entry (missed_snapinfo, &priv->missed_snaps_list,
+ missed_snaps) {
+ /* Look for missed snap for the same peer, and
+ * the same snap_id
+ */
+ if ((!strcmp (peer_uuid, missed_snapinfo->node_uuid)) &&
+ (!strcmp (peer_snap_id, missed_snapinfo->snap_uuid))) {
+ /* Check if the missed snap's op is delete and the
+ * status is pending
+ */
+ list_for_each_entry (snap_opinfo,
+ &missed_snapinfo->snap_ops,
+ snap_ops_list) {
+ if ((snap_opinfo->op ==
+ GF_SNAP_OPTION_TYPE_DELETE) &&
+ (snap_opinfo->status ==
+ GD_MISSED_SNAP_PENDING)) {
+ missed_delete = _gf_true;
+ goto out;
+ }
+ }
+ }
+ }
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", missed_delete);
+ return missed_delete;
+}
+
+/* Genrate and store snap volfiles for imported snap object */
+int32_t
+glusterd_gen_snap_volfiles (glusterd_volinfo_t *snap_vol, char *peer_snap_name)
+{
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+ glusterd_volinfo_t *parent_volinfo = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (snap_vol);
+ GF_ASSERT (peer_snap_name);
+
+ ret = glusterd_store_volinfo (snap_vol, GLUSTERD_VOLINFO_VER_AC_NONE);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to store snapshot "
+ "volinfo (%s) for snap %s", snap_vol->volname,
+ peer_snap_name);
+ goto out;
+ }
+
+ ret = generate_brick_volfiles (snap_vol);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "generating the brick volfiles for the "
+ "snap %s failed", peer_snap_name);
+ goto out;
+ }
+
+ ret = generate_client_volfiles (snap_vol, GF_CLIENT_TRUSTED);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "generating the trusted client volfiles for "
+ "the snap %s failed", peer_snap_name);
+ goto out;
+ }
+
+ ret = generate_client_volfiles (snap_vol, GF_CLIENT_OTHER);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "generating the client volfiles for the "
+ "snap %s failed", peer_snap_name);
+ goto out;
+ }
+
+ ret = glusterd_volinfo_find (snap_vol->parent_volname,
+ &parent_volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Parent volinfo "
+ "not found for %s volume of snap %s",
+ snap_vol->volname, peer_snap_name);
+ goto out;
+ }
+
+ glusterd_list_add_snapvol (parent_volinfo, snap_vol);
+
+ list_for_each_entry (brickinfo, &snap_vol->bricks, brick_list) {
+ if (uuid_compare (brickinfo->uuid, MY_UUID))
+ continue;
+
+ if (brickinfo->snap_status == -1) {
+ gf_log (this->name, GF_LOG_INFO,
+ "not starting snap brick %s:%s for "
+ "for the snap %s (volume: %s)",
+ brickinfo->hostname, brickinfo->path,
+ peer_snap_name, parent_volinfo->volname);
+ continue;
+ }
+
+ ret = glusterd_brick_start (snap_vol, brickinfo, _gf_true);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "starting the "
+ "brick %s:%s for the snap %s (volume: %s) "
+ "failed", brickinfo->hostname, brickinfo->path,
+ peer_snap_name, parent_volinfo->volname);
+ goto out;
+ }
+ }
+
+ snap_vol->status = GLUSTERD_STATUS_STARTED;
+
+ ret = glusterd_store_volinfo (snap_vol, GLUSTERD_VOLINFO_VER_AC_NONE);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to store snap volinfo");
+ goto out;
+ }
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Import snapshot info from peer_data and add it to priv */
+int32_t
+glusterd_import_friend_snap (dict_t *peer_data, int32_t snap_count,
+ char *peer_snap_name, char *peer_snap_id)
+{
+ char buf[NAME_MAX] = "";
+ char prefix[NAME_MAX] = "";
+ dict_t *dict = NULL;
+ glusterd_snap_t *snap = NULL;
+ glusterd_volinfo_t *snap_vol = NULL;
+ glusterd_conf_t *priv = NULL;
+ int32_t ret = -1;
+ int32_t volcount = -1;
+ int32_t i = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (peer_data);
+ GF_ASSERT (peer_snap_name);
+ GF_ASSERT (peer_snap_id);
+
+ snprintf (prefix, sizeof(prefix), "snap%d", snap_count);
+
+ snap = glusterd_new_snap_object ();
+ if (!snap) {
+ gf_log (this->name, GF_LOG_ERROR, "Could not create "
+ "the snap object for snap %s", peer_snap_name);
+ goto out;
+ }
+
+ strcpy (snap->snapname, peer_snap_name);
+ uuid_parse (peer_snap_id, snap->snap_id);
+
+ snprintf (buf, sizeof(buf), "%s.snapid", prefix);
+ ret = dict_get_str (peer_data, buf, &snap->description);
+
+ snprintf (buf, sizeof(buf), "%s.time_stamp", prefix);
+ ret = dict_get_int64 (peer_data, buf, &snap->time_stamp);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get time_stamp for snap %s",
+ peer_snap_name);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf), "%s.snap_restored", prefix);
+ ret = dict_get_int8 (peer_data, buf, (int8_t *) &snap->snap_restored);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get snap_restored for snap %s",
+ peer_snap_name);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf), "%s.snap_status", prefix);
+ ret = dict_get_int32 (peer_data, buf, (int32_t *) &snap->snap_status);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get snap_status for snap %s",
+ peer_snap_name);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf), "%s.volcount", prefix);
+ ret = dict_get_int32 (peer_data, buf, &volcount);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to get volcount for snap %s",
+ peer_snap_name);
+ goto out;
+ }
+
+ ret = glusterd_store_create_snap_dir (snap);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to create snap dir");
+ goto out;
+ }
+
+ list_add_order (&snap->snap_list, &priv->snapshots,
+ glusterd_compare_snap_time);
+
+ for (i = 1; i <= volcount; i++) {
+ ret = glusterd_import_volinfo (peer_data, i,
+ &snap_vol, prefix);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to import snap volinfo for "
+ "snap %s", peer_snap_name);
+ goto out;
+ }
+
+ snap_vol->snapshot = snap;
+
+ ret = glusterd_gen_snap_volfiles (snap_vol, peer_snap_name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to generate snap vol files "
+ "for snap %s", peer_snap_name);
+ goto out;
+ }
+
+ ret = glusterd_import_quota_conf (peer_data, i,
+ snap_vol, prefix);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to import quota conf "
+ "for snap %s", peer_snap_name);
+ goto out;
+ }
+
+ snap_vol = NULL;
+ }
+
+ ret = glusterd_store_snap (snap);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "Could not store snap"
+ "object %s", peer_snap_name);
+ goto out;
+ }
+
+out:
+ if (ret)
+ glusterd_snap_remove (dict, snap,
+ _gf_true, _gf_true);
+
+ if (dict)
+ dict_unref (dict);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* During a peer-handshake, after the volumes have synced, and the list of
+ * missed snapshots have synced, the node will perform the pending deletes
+ * and restores on this list. At this point, the current snapshot list in
+ * the node will be updated, and hence in case of conflicts arising during
+ * snapshot handshake, the peer hosting the bricks will be given precedence
+ * Likewise, if there will be a conflict, and both peers will be in the same
+ * state, i.e either both would be hosting bricks or both would not be hosting
+ * bricks, then a decision can't be taken and a peer-reject will happen.
+ *
+ * glusterd_compare_and_update_snap() implements the following algorithm to
+ * perform the above task:
+ * Step 1: Start.
+ * Step 2: Check if the peer is missing a delete on the said snap.
+ * If yes, goto step 6.
+ * Step 3: Check if there is a conflict between the peer's data and the
+ * local snap. If no, goto step 5.
+ * Step 4: As there is a conflict, check if both the peer and the local nodes
+ * are hosting bricks. Based on the results perform the following:
+ * Peer Hosts Bricks Local Node Hosts Bricks Action
+ * Yes Yes Goto Step 7
+ * No No Goto Step 7
+ * Yes No Goto Step 8
+ * No Yes Goto Step 6
+ * Step 5: Check if the local node is missing the peer's data.
+ * If yes, goto step 9.
+ * Step 6: It's a no-op. Goto step 10
+ * Step 7: Peer Reject. Goto step 10
+ * Step 8: Delete local node's data.
+ * Step 9: Accept Peer Data.
+ * Step 10: Stop
+ *
+ */
+int32_t
+glusterd_compare_and_update_snap (dict_t *peer_data, int32_t snap_count,
+ glusterd_peerinfo_t *peerinfo)
+{
+ char buf[NAME_MAX] = "";
+ char prefix[NAME_MAX] = "";
+ char *peer_snap_name = NULL;
+ char *peer_snap_id = NULL;
+ dict_t *dict = NULL;
+ glusterd_snap_t *snap = NULL;
+ gf_boolean_t conflict = _gf_false;
+ gf_boolean_t is_local = _gf_false;
+ gf_boolean_t is_hosted = _gf_false;
+ gf_boolean_t missed_delete = _gf_false;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (peer_data);
+ GF_ASSERT (peerinfo);
+
+ snprintf (prefix, sizeof(prefix), "snap%d", snap_count);
+
+ /* Fetch the peer's snapname */
+ snprintf (buf, sizeof(buf), "%s.snapname", prefix);
+ ret = dict_get_str (peer_data, buf, &peer_snap_name);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to fetch snapname from peer: %s",
+ peerinfo->hostname);
+ goto out;
+ }
+
+ /* Fetch the peer's snap_id */
+ snprintf (buf, sizeof(buf), "%s.snap_id", prefix);
+ ret = dict_get_str (peer_data, buf, &peer_snap_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to fetch snap_id from peer: %s",
+ peerinfo->hostname);
+ goto out;
+ }
+
+ /* Check if the peer has missed a snap delete for the
+ * snap in question
+ */
+ missed_delete = glusterd_peer_has_missed_snap_delete (peerinfo,
+ peer_snap_id);
+ if (missed_delete == _gf_true) {
+ /* Peer has missed delete on the missing/conflicting snap_id */
+ gf_log (this->name, GF_LOG_INFO, "Peer %s has missed a delete "
+ "on snap %s", peerinfo->hostname, peer_snap_name);
+ ret = 0;
+ goto out;
+ }
+
+ /* Check if there is a conflict, and if the
+ * peer data is already present
+ */
+ glusterd_is_peer_snap_conflicting (peer_snap_name, peer_snap_id,
+ &conflict, &snap,
+ peerinfo->hostname);
+ if (conflict == _gf_false) {
+ if (snap) {
+ /* Peer has snap with the same snapname
+ * and snap_id. No need to accept peer data
+ */
+ ret = 0;
+ goto out;
+ } else {
+ /* Peer has snap with the same snapname
+ * and snap_id, which local node doesn't have.
+ */
+ goto accept_peer_data;
+ }
+ }
+
+ /* There is a conflict. Check if the current node is
+ * hosting bricks for the conflicted snap.
+ */
+ is_local = glusterd_are_snap_bricks_local (snap);
+
+ /* Check if the peer is hosting any bricks for the
+ * conflicting snap
+ */
+ snprintf (buf, sizeof(buf), "%s.host_bricks", prefix);
+ ret = dict_get_int8 (peer_data, buf, (int8_t *) &is_hosted);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to fetch host_bricks from peer: %s "
+ "for %s", peerinfo->hostname, peer_snap_name);
+ goto out;
+ }
+
+ /* As there is a conflict at this point of time, the data of the
+ * node that hosts a brick takes precedence. If both the local
+ * node and the peer are in the same state, i.e if both of them
+ * are either hosting or not hosting the bricks, for the snap,
+ * then it's a peer reject
+ */
+ if (is_hosted == is_local) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Conflict in snapshot %s with peer %s",
+ peer_snap_name, peerinfo->hostname);
+ ret = -1;
+ goto out;
+ }
- GF_ASSERT (vols);
+ if (is_hosted == _gf_false) {
+ /* If there was a conflict, and the peer is not hosting
+ * any brick, then don't accept peer data
+ */
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Peer doesn't hosts bricks for conflicting "
+ "snap(%s). Not accepting peer data.",
+ peer_snap_name);
+ ret = 0;
+ goto out;
+ }
+
+ /* The peer is hosting a brick in case of conflict
+ * And local node isn't. Hence remove local node's
+ * data and accept peer data
+ */
+
+ gf_log (this->name, GF_LOG_DEBUG, "Peer hosts bricks for conflicting "
+ "snap(%s). Removing local data. Accepting peer data.",
+ peer_snap_name);
+
+ dict = dict_new();
+ if (!dict) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to create dict");
+ ret = -1;
+ goto out;
+ }
+
+ ret = glusterd_snap_remove (dict, snap, _gf_true, _gf_false);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to remove snap %s", snap->snapname);
+ goto out;
+ }
+
+accept_peer_data:
+
+ /* Accept Peer Data */
+ ret = glusterd_import_friend_snap (peer_data, snap_count,
+ peer_snap_name, peer_snap_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to import snap %s from peer %s",
+ peer_snap_name, peerinfo->hostname);
+ goto out;
+ }
+
+out:
+ if (dict)
+ dict_unref (dict);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Compare snapshots present in peer_data, with the snapshots in
+ * the current node
+ */
+int32_t
+glusterd_compare_friend_snapshots (dict_t *peer_data,
+ glusterd_peerinfo_t *peerinfo)
+{
+ int32_t ret = -1;
+ int32_t snap_count = 0;
+ int i = 1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (peer_data);
+ GF_ASSERT (peerinfo);
+
+ ret = dict_get_int32 (peer_data, "snap_count", &snap_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to fetch snap_count");
+ goto out;
+ }
+
+ for (i = 1; i <= snap_count; i++) {
+ /* Compare one snapshot from peer_data at a time */
+ ret = glusterd_compare_and_update_snap (peer_data, i, peerinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to compare snapshots with peer %s",
+ peerinfo->hostname);
+ goto out;
+ }
+ }
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_compare_friend_data (dict_t *peer_data, int32_t *status,
+ char *hostname)
+{
+ int32_t ret = -1;
+ int32_t count = 0;
+ int i = 1;
+ gf_boolean_t update = _gf_false;
+ gf_boolean_t stale_nfs = _gf_false;
+ gf_boolean_t stale_shd = _gf_false;
+ gf_boolean_t stale_qd = _gf_false;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (peer_data);
GF_ASSERT (status);
- ret = dict_get_int32 (vols, "count", &count);
+ ret = dict_get_int32 (peer_data, "count", &count);
if (ret)
goto out;
while (i <= count) {
- ret = glusterd_compare_friend_volume (vols, i, status);
+ ret = glusterd_compare_friend_volume (peer_data, i, status,
+ hostname);
if (ret)
goto out;
@@ -2631,7 +5332,12 @@ glusterd_compare_friend_data (dict_t *vols, int32_t *status)
stale_nfs = _gf_true;
if (glusterd_is_nodesvc_running ("glustershd"))
stale_shd = _gf_true;
- ret = glusterd_import_friend_volumes (vols);
+ if (glusterd_is_nodesvc_running ("quotad"))
+ stale_qd = _gf_true;
+ ret = glusterd_import_global_opts (peer_data);
+ if (ret)
+ goto out;
+ ret = glusterd_import_friend_volumes (peer_data);
if (ret)
goto out;
if (_gf_false == glusterd_are_all_volumes_stopped ()) {
@@ -2641,50 +5347,17 @@ glusterd_compare_friend_data (dict_t *vols, int32_t *status)
glusterd_nfs_server_stop ();
if (stale_shd)
glusterd_shd_stop ();
+ if (stale_qd)
+ glusterd_quotad_stop ();
}
}
out:
- gf_log ("", GF_LOG_DEBUG, "Returning with ret: %d, status: %d",
- ret, *status);
-
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Returning with ret: %d, status: %d", ret, *status);
return ret;
}
-/* Valid only in if service is 'local' to glusterd.
- * pid can be -1, if reading pidfile failed */
-gf_boolean_t
-glusterd_is_service_running (char *pidfile, int *pid)
-{
- FILE *file = NULL;
- gf_boolean_t running = _gf_false;
- int ret = 0;
- int fno = 0;
-
- file = fopen (pidfile, "r+");
- if (!file)
- goto out;
-
- fno = fileno (file);
- ret = lockf (fno, F_TEST, 0);
- if (ret == -1)
- running = _gf_true;
- if (!pid)
- goto out;
-
- ret = fscanf (file, "%d", pid);
- if (ret <= 0) {
- gf_log ("", GF_LOG_ERROR, "Unable to read pidfile: %s, %s",
- pidfile, strerror (errno));
- *pid = -1;
- }
-
-out:
- if (file)
- fclose (file);
- return running;
-}
-
void
glusterd_get_nodesvc_dir (char *server, char *workdir,
char *path, size_t len)
@@ -2723,11 +5396,14 @@ glusterd_get_nodesvc_volfile (char *server, char *workdir,
GF_ASSERT (len == PATH_MAX);
glusterd_get_nodesvc_dir (server, workdir, dir, sizeof (dir));
- snprintf (volfile, len, "%s/%s-server.vol", dir, server);
+ if (strcmp ("quotad", server) != 0)
+ snprintf (volfile, len, "%s/%s-server.vol", dir, server);
+ else
+ snprintf (volfile, len, "%s/%s.vol", dir, server);
}
void
-glusterd_nodesvc_set_running (char *server, gf_boolean_t status)
+glusterd_nodesvc_set_online_status (char *server, gf_boolean_t status)
{
glusterd_conf_t *priv = NULL;
@@ -2736,31 +5412,37 @@ glusterd_nodesvc_set_running (char *server, gf_boolean_t status)
GF_ASSERT (priv);
GF_ASSERT (priv->shd);
GF_ASSERT (priv->nfs);
+ GF_ASSERT (priv->quotad);
if (!strcmp("glustershd", server))
- priv->shd->running = status;
+ priv->shd->online = status;
else if (!strcmp ("nfs", server))
- priv->nfs->running = status;
+ priv->nfs->online = status;
+ else if (!strcmp ("quotad", server))
+ priv->quotad->online = status;
}
gf_boolean_t
-glusterd_nodesvc_is_running (char *server)
+glusterd_is_nodesvc_online (char *server)
{
glusterd_conf_t *conf = NULL;
- gf_boolean_t running = _gf_false;
+ gf_boolean_t online = _gf_false;
GF_ASSERT (server);
conf = THIS->private;
GF_ASSERT (conf);
GF_ASSERT (conf->shd);
GF_ASSERT (conf->nfs);
+ GF_ASSERT (conf->quotad);
if (!strcmp (server, "glustershd"))
- running = conf->shd->running;
+ online = conf->shd->online;
else if (!strcmp (server, "nfs"))
- running = conf->nfs->running;
+ online = conf->nfs->online;
+ else if (!strcmp (server, "quotad"))
+ online = conf->quotad->online;
- return running;
+ return online;
}
int32_t
@@ -2784,6 +5466,7 @@ glusterd_pending_node_get_rpc (glusterd_pending_node_t *pending_node)
nodesrv_t *shd = NULL;
glusterd_volinfo_t *volinfo = NULL;
nodesrv_t *nfs = NULL;
+ nodesrv_t *quotad = NULL;
GF_VALIDATE_OR_GOTO (THIS->name, pending_node, out);
GF_VALIDATE_OR_GOTO (THIS->name, pending_node->node, out);
@@ -2798,13 +5481,17 @@ glusterd_pending_node_get_rpc (glusterd_pending_node_t *pending_node)
} else if (pending_node->type == GD_NODE_REBALANCE) {
volinfo = pending_node->node;
- if (volinfo->defrag)
- rpc = volinfo->defrag->rpc;
+ if (volinfo->rebal.defrag)
+ rpc = volinfo->rebal.defrag->rpc;
} else if (pending_node->type == GD_NODE_NFS) {
nfs = pending_node->node;
rpc = nfs->rpc;
+ } else if (pending_node->type == GD_NODE_QUOTAD) {
+ quotad = pending_node->node;
+ rpc = quotad->rpc;
+
} else {
GF_ASSERT (0);
}
@@ -2824,11 +5511,14 @@ glusterd_nodesvc_get_rpc (char *server)
GF_ASSERT (priv);
GF_ASSERT (priv->shd);
GF_ASSERT (priv->nfs);
+ GF_ASSERT (priv->quotad);
if (!strcmp (server, "glustershd"))
rpc = priv->shd->rpc;
else if (!strcmp (server, "nfs"))
rpc = priv->nfs->rpc;
+ else if (!strcmp (server, "quotad"))
+ rpc = priv->quotad->rpc;
return rpc;
}
@@ -2846,11 +5536,14 @@ glusterd_nodesvc_set_rpc (char *server, struct rpc_clnt *rpc)
GF_ASSERT (priv);
GF_ASSERT (priv->shd);
GF_ASSERT (priv->nfs);
+ GF_ASSERT (priv->quotad);
if (!strcmp ("glustershd", server))
priv->shd->rpc = rpc;
else if (!strcmp ("nfs", server))
priv->nfs->rpc = rpc;
+ else if (!strcmp ("quotad", server))
+ priv->quotad->rpc = rpc;
return ret;
}
@@ -2860,6 +5553,7 @@ glusterd_nodesvc_connect (char *server, char *socketpath) {
int ret = 0;
dict_t *options = NULL;
struct rpc_clnt *rpc = NULL;
+ glusterd_conf_t *priv = THIS->private;
rpc = glusterd_nodesvc_get_rpc (server);
@@ -2869,13 +5563,15 @@ glusterd_nodesvc_connect (char *server, char *socketpath) {
* The default timeout of 30mins used for unreliable network
* connections is too long for unix domain socket connections.
*/
- ret = rpc_clnt_transport_unix_options_build (&options,
- socketpath, 600);
+ ret = rpc_transport_unix_options_build (&options, socketpath,
+ 600);
if (ret)
goto out;
+ synclock_unlock (&priv->big_lock);
ret = glusterd_rpc_create (&rpc, options,
glusterd_nodesvc_rpc_notify,
server);
+ synclock_lock (&priv->big_lock);
if (ret)
goto out;
(void) glusterd_nodesvc_set_rpc (server, rpc);
@@ -2888,20 +5584,19 @@ int32_t
glusterd_nodesvc_disconnect (char *server)
{
struct rpc_clnt *rpc = NULL;
+ glusterd_conf_t *priv = THIS->private;
rpc = glusterd_nodesvc_get_rpc (server);
+ (void)glusterd_nodesvc_set_rpc (server, NULL);
- if (rpc) {
- rpc_clnt_connection_cleanup (&rpc->conn);
- rpc_clnt_unref (rpc);
- (void)glusterd_nodesvc_set_rpc (server, NULL);
- }
+ if (rpc)
+ glusterd_rpc_clnt_unref (priv, rpc);
return 0;
}
int32_t
-glusterd_nodesvc_start (char *server)
+glusterd_nodesvc_start (char *server, gf_boolean_t wait)
{
int32_t ret = -1;
xlator_t *this = NULL;
@@ -2914,9 +5609,7 @@ glusterd_nodesvc_start (char *server)
char sockfpath[PATH_MAX] = {0,};
char volfileid[256] = {0};
char glusterd_uuid_option[1024] = {0};
-#ifdef DEBUG
char valgrind_logfile[PATH_MAX] = {0};
-#endif
this = THIS;
GF_ASSERT(this);
@@ -2953,7 +5646,6 @@ glusterd_nodesvc_start (char *server)
runinit (&runner);
-#ifdef DEBUG
if (priv->valgrind) {
snprintf (valgrind_logfile, PATH_MAX,
"%s/valgrind-%s.log",
@@ -2961,17 +5653,19 @@ glusterd_nodesvc_start (char *server)
server);
runner_add_args (&runner, "valgrind", "--leak-check=full",
- "--trace-children=yes", NULL);
+ "--trace-children=yes", "--track-origins=yes",
+ NULL);
runner_argprintf (&runner, "--log-file=%s", valgrind_logfile);
- }
-#endif
+ }
runner_add_args (&runner, SBIN_DIR"/glusterfs",
"-s", "localhost",
"--volfile-id", volfileid,
"-p", pidfile,
"-l", logfile,
- "-S", sockfpath, NULL);
+ "-S", sockfpath,
+ "-L", "DEBUG",
+ NULL);
if (!strcmp (server, "glustershd")) {
snprintf (glusterd_uuid_option, sizeof (glusterd_uuid_option),
@@ -2979,10 +5673,27 @@ glusterd_nodesvc_start (char *server)
runner_add_args (&runner, "--xlator-option",
glusterd_uuid_option, NULL);
}
+ if (!strcmp (server, "quotad")) {
+ runner_add_args (&runner, "--xlator-option",
+ "*replicate*.data-self-heal=off",
+ "--xlator-option",
+ "*replicate*.metadata-self-heal=off",
+ "--xlator-option",
+ "*replicate*.entry-self-heal=off", NULL);
+ }
runner_log (&runner, "", GF_LOG_DEBUG,
"Starting the nfs/glustershd services");
- ret = runner_run_nowait (&runner);
+ if (!wait) {
+ ret = runner_run_nowait (&runner);
+ } else {
+ synclock_unlock (&priv->big_lock);
+ {
+ ret = runner_run (&runner);
+ }
+ synclock_lock (&priv->big_lock);
+ }
+
if (ret == 0) {
glusterd_nodesvc_connect (server, sockfpath);
}
@@ -2993,13 +5704,19 @@ out:
int
glusterd_nfs_server_start ()
{
- return glusterd_nodesvc_start ("nfs");
+ return glusterd_nodesvc_start ("nfs", _gf_false);
}
int
glusterd_shd_start ()
{
- return glusterd_nodesvc_start ("glustershd");
+ return glusterd_nodesvc_start ("glustershd", _gf_false);
+}
+
+int
+glusterd_quotad_start ()
+{
+ return glusterd_nodesvc_start ("quotad", _gf_true);
}
gf_boolean_t
@@ -3010,7 +5727,7 @@ glusterd_is_nodesvc_running (char *server)
glusterd_get_nodesvc_pidfile (server, priv->workdir,
pidfile, sizeof (pidfile));
- return glusterd_is_service_running (pidfile, NULL);
+ return gf_is_service_running (pidfile, NULL);
}
int32_t
@@ -3055,7 +5772,7 @@ glusterd_nodesvc_stop (char *server, int sig)
ret = glusterd_service_stop (server, pidfile, sig, _gf_true);
if (ret == 0) {
- glusterd_nodesvc_set_running (server, _gf_false);
+ glusterd_nodesvc_set_online_status (server, _gf_false);
(void)glusterd_nodesvc_unlink_socket_file (server);
}
out:
@@ -3090,6 +5807,10 @@ glusterd_nfs_pmap_deregister ()
else
gf_log ("", GF_LOG_ERROR, "De-registration of NLM v1 failed");
+ if (pmap_unset (ACL_PROGRAM, ACLV3_VERSION))
+ gf_log ("", GF_LOG_INFO, "De-registered ACL v3 successfully");
+ else
+ gf_log ("", GF_LOG_ERROR, "De-registration of ACL v3 failed");
}
int
@@ -3116,6 +5837,12 @@ glusterd_shd_stop ()
}
int
+glusterd_quotad_stop ()
+{
+ return glusterd_nodesvc_stop ("quotad", SIGTERM);
+}
+
+int
glusterd_add_node_to_dict (char *server, dict_t *dict, int count,
dict_t *vol_opts)
{
@@ -3129,7 +5856,9 @@ glusterd_add_node_to_dict (char *server, dict_t *dict, int count,
glusterd_get_nodesvc_pidfile (server, priv->workdir, pidfile,
sizeof (pidfile));
- running = glusterd_is_service_running (pidfile, &pid);
+ //Consider service to be running only when glusterd sees it Online
+ if (glusterd_is_nodesvc_online (server))
+ running = gf_is_service_running (pidfile, &pid);
/* For nfs-servers/self-heal-daemon setting
* brick<n>.hostname = "NFS Server" / "Self-heal Daemon"
@@ -3145,6 +5874,8 @@ glusterd_add_node_to_dict (char *server, dict_t *dict, int count,
ret = dict_set_str (dict, key, "NFS Server");
else if (!strcmp (server, "glustershd"))
ret = dict_set_str (dict, key, "Self-heal Daemon");
+ else if (!strcmp (server, "quotad"))
+ ret = dict_set_str (dict, key, "Quota Daemon");
if (ret)
goto out;
@@ -3263,11 +5994,22 @@ glusterd_reconfigure_shd ()
}
int
+glusterd_reconfigure_quotad ()
+{
+ return glusterd_reconfigure_nodesvc (glusterd_create_quotad_volfile);
+}
+
+int
glusterd_reconfigure_nfs ()
{
int ret = -1;
gf_boolean_t identical = _gf_false;
+ /*
+ * Check both OLD and NEW volfiles, if they are SAME by size
+ * and cksum i.e. "character-by-character". If YES, then
+ * NOTHING has been changed, just return.
+ */
ret = glusterd_check_nfs_volfile_identical (&identical);
if (ret)
goto out;
@@ -3277,6 +6019,31 @@ glusterd_reconfigure_nfs ()
goto out;
}
+ /*
+ * They are not identical. Find out if the topology is changed
+ * OR just the volume options. If just the options which got
+ * changed, then inform the xlator to reconfigure the options.
+ */
+ identical = _gf_false; /* RESET the FLAG */
+ ret = glusterd_check_nfs_topology_identical (&identical);
+ if (ret)
+ goto out;
+
+ /* Topology is not changed, but just the options. But write the
+ * options to NFS volfile, so that NFS will be reconfigured.
+ */
+ if (identical) {
+ ret = glusterd_create_nfs_volfile();
+ if (ret == 0) {/* Only if above PASSES */
+ ret = glusterd_fetchspec_notify (THIS);
+ }
+ goto out;
+ }
+
+ /*
+ * NFS volfile's topology has been changed. NFS server needs
+ * to be RESTARTED to ACT on the changed volfile.
+ */
ret = glusterd_check_generate_start_nfs ();
out:
@@ -3309,21 +6076,52 @@ glusterd_check_generate_start_shd ()
}
int
-glusterd_nodesvcs_batch_op (glusterd_volinfo_t *volinfo,
- int (*nfs_op) (), int (*shd_op) ())
+glusterd_check_generate_start_quotad ()
{
+ int ret = 0;
+
+ ret = glusterd_check_generate_start_service (glusterd_create_quotad_volfile,
+ glusterd_quotad_stop,
+ glusterd_quotad_start);
+ if (ret == -EINVAL)
+ ret = 0;
+ return ret;
+}
+
+int
+glusterd_nodesvcs_batch_op (glusterd_volinfo_t *volinfo, int (*nfs_op) (),
+ int (*shd_op) (), int (*qd_op) ())
+ {
int ret = 0;
+ xlator_t *this = THIS;
+ glusterd_conf_t *conf = NULL;
+
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
ret = nfs_op ();
if (ret)
goto out;
- if (volinfo && !glusterd_is_volume_replicate (volinfo))
+ if (volinfo && !glusterd_is_volume_replicate (volinfo)) {
+ ; //do nothing
+ } else {
+ ret = shd_op ();
+ if (ret)
+ goto out;
+ }
+
+ if (conf->op_version == GD_OP_VERSION_MIN)
+ goto out;
+
+ if (volinfo && !glusterd_is_volume_quota_enabled (volinfo))
goto out;
- ret = shd_op ();
+ ret = qd_op ();
if (ret)
goto out;
+
out:
return ret;
}
@@ -3333,7 +6131,8 @@ glusterd_nodesvcs_start (glusterd_volinfo_t *volinfo)
{
return glusterd_nodesvcs_batch_op (volinfo,
glusterd_nfs_server_start,
- glusterd_shd_start);
+ glusterd_shd_start,
+ glusterd_quotad_start);
}
int
@@ -3341,7 +6140,8 @@ glusterd_nodesvcs_stop (glusterd_volinfo_t *volinfo)
{
return glusterd_nodesvcs_batch_op (volinfo,
glusterd_nfs_server_stop,
- glusterd_shd_stop);
+ glusterd_shd_stop,
+ glusterd_quotad_stop);
}
gf_boolean_t
@@ -3387,21 +6187,53 @@ glusterd_all_replicate_volumes_stopped ()
return _gf_true;
}
+gf_boolean_t
+glusterd_all_volumes_with_quota_stopped ()
+{
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+ glusterd_volinfo_t *voliter = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ list_for_each_entry (voliter, &priv->volumes, vol_list) {
+ if (!glusterd_is_volume_quota_enabled (voliter))
+ continue;
+ if (voliter->status == GLUSTERD_STATUS_STARTED)
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
+
int
glusterd_nodesvcs_handle_graph_change (glusterd_volinfo_t *volinfo)
{
int (*shd_op) () = NULL;
int (*nfs_op) () = NULL;
+ int (*qd_op) () = NULL;
shd_op = glusterd_check_generate_start_shd;
nfs_op = glusterd_check_generate_start_nfs;
+ qd_op = glusterd_check_generate_start_quotad;
if (glusterd_are_all_volumes_stopped ()) {
shd_op = glusterd_shd_stop;
nfs_op = glusterd_nfs_server_stop;
- } else if (glusterd_all_replicate_volumes_stopped()) {
- shd_op = glusterd_shd_stop;
+ qd_op = glusterd_quotad_stop;
+ } else {
+ if (glusterd_all_replicate_volumes_stopped()) {
+ shd_op = glusterd_shd_stop;
+ }
+ if (glusterd_all_volumes_with_quota_stopped ()) {
+ qd_op = glusterd_quotad_stop;
+ }
}
- return glusterd_nodesvcs_batch_op (volinfo, nfs_op, shd_op);
+
+ return glusterd_nodesvcs_batch_op (volinfo, nfs_op, shd_op, qd_op);
}
int
@@ -3409,7 +6241,8 @@ glusterd_nodesvcs_handle_reconfigure (glusterd_volinfo_t *volinfo)
{
return glusterd_nodesvcs_batch_op (volinfo,
glusterd_reconfigure_nfs,
- glusterd_reconfigure_shd);
+ glusterd_reconfigure_shd,
+ glusterd_reconfigure_quotad);
}
int
@@ -3483,8 +6316,7 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
if (uuid_is_null (brickinfo->uuid)) {
ret = glusterd_resolve_brick (brickinfo);
if (ret) {
- gf_log ("glusterd", GF_LOG_ERROR,
- "cannot resolve brick: %s:%s",
+ gf_log (this->name, GF_LOG_ERROR, FMTSTR_RESOLVE_BRICK,
brickinfo->hostname, brickinfo->path);
goto out;
}
@@ -3496,33 +6328,52 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
}
ret = glusterd_volume_start_glusterfs (volinfo, brickinfo, wait);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to start "
- "glusterfs, ret: %d", ret);
+ gf_log (this->name, GF_LOG_ERROR, "Unable to start brick %s:%s",
+ brickinfo->hostname, brickinfo->path);
goto out;
}
out:
- gf_log ("", GF_LOG_DEBUG, "returning %d ", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "returning %d ", ret);
return ret;
}
int
glusterd_restart_bricks (glusterd_conf_t *conf)
{
+ int ret = 0;
glusterd_volinfo_t *volinfo = NULL;
glusterd_brickinfo_t *brickinfo = NULL;
+ glusterd_snap_t *snap = NULL;
gf_boolean_t start_nodesvcs = _gf_false;
- int ret = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
list_for_each_entry (volinfo, &conf->volumes, vol_list) {
- /* If volume status is not started, do not proceed */
- if (volinfo->status == GLUSTERD_STATUS_STARTED) {
+ if (volinfo->status != GLUSTERD_STATUS_STARTED)
+ continue;
+ start_nodesvcs = _gf_true;
+ gf_log (this->name, GF_LOG_DEBUG, "starting the volume %s",
+ volinfo->volname);
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ glusterd_brick_start (volinfo, brickinfo, _gf_false);
+ }
+ }
+
+ list_for_each_entry (snap, &conf->snapshots, snap_list) {
+ list_for_each_entry (volinfo, &snap->volumes, vol_list) {
+ if (volinfo->status != GLUSTERD_STATUS_STARTED)
+ continue;
+ start_nodesvcs = _gf_true;
+ gf_log (this->name, GF_LOG_DEBUG, "starting the snap "
+ "volume %s", volinfo->volname);
list_for_each_entry (brickinfo, &volinfo->bricks,
brick_list) {
glusterd_brick_start (volinfo, brickinfo,
- _gf_true);
+ _gf_false);
}
- start_nodesvcs = _gf_true;
}
}
@@ -3535,13 +6386,26 @@ glusterd_restart_bricks (glusterd_conf_t *conf)
int
_local_gsyncd_start (dict_t *this, char *key, data_t *value, void *data)
{
+ char *path_list = NULL;
char *slave = NULL;
+ char *slave_ip = NULL;
+ char *slave_vol = NULL;
+ char *statefile = NULL;
+ char buf[1024] = "faulty";
int uuid_len = 0;
+ int ret = 0;
char uuid_str[64] = {0};
- glusterd_volinfo_t *volinfo = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ char confpath[PATH_MAX] = "";
+ char *op_errstr = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ GF_ASSERT (THIS);
+ priv = THIS->private;
+ GF_ASSERT (priv);
+ GF_ASSERT (data);
volinfo = data;
- GF_ASSERT (volinfo);
slave = strchr(value->data, ':');
if (slave)
slave ++;
@@ -3550,9 +6414,65 @@ _local_gsyncd_start (dict_t *this, char *key, data_t *value, void *data)
uuid_len = (slave - value->data - 1);
strncpy (uuid_str, (char*)value->data, uuid_len);
- glusterd_start_gsync (volinfo, slave, uuid_str, NULL);
- return 0;
+ /* Getting Local Brickpaths */
+ ret = glusterd_get_local_brickpaths (volinfo, &path_list);
+
+ /*Generating the conf file path needed by gsyncd */
+ ret = glusterd_get_slave_info (slave, &slave_ip,
+ &slave_vol, &op_errstr);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to fetch slave details.");
+ ret = -1;
+ goto out;
+ }
+
+ ret = snprintf (confpath, sizeof(confpath) - 1,
+ "%s/"GEOREP"/%s_%s_%s/gsyncd.conf",
+ priv->workdir, volinfo->volname,
+ slave_ip, slave_vol);
+ confpath[ret] = '\0';
+
+ /* Fetching the last status of the node */
+ ret = glusterd_get_statefile_name (volinfo, slave,
+ confpath, &statefile);
+ if (ret) {
+ if (!strstr(slave, "::"))
+ gf_log ("", GF_LOG_INFO,
+ "%s is not a valid slave url.", slave);
+ else
+ gf_log ("", GF_LOG_INFO, "Unable to get"
+ " statefile's name");
+ goto out;
+ }
+
+ ret = glusterd_gsync_read_frm_status (statefile, buf, sizeof (buf));
+ if (ret < 0) {
+ gf_log ("", GF_LOG_ERROR, "Unable to read the status");
+ goto out;
+ }
+
+ /* Looks for the last status, to find if the sessiom was running
+ * when the node went down. If the session was not started or
+ * not started, do not restart the geo-rep session */
+ if ((!strcmp (buf, "Not Started")) ||
+ (!strcmp (buf, "Stopped"))) {
+ gf_log ("", GF_LOG_INFO,
+ "Geo-Rep Session was not started between "
+ "%s and %s::%s. Not Restarting", volinfo->volname,
+ slave_ip, slave_vol);
+ goto out;
+ }
+
+ glusterd_start_gsync (volinfo, slave, path_list, confpath,
+ uuid_str, NULL);
+
+out:
+ GF_FREE (path_list);
+ GF_FREE (op_errstr);
+
+ return ret;
}
int
@@ -3576,6 +6496,15 @@ glusterd_restart_gsyncds (glusterd_conf_t *conf)
return ret;
}
+inline int
+glusterd_get_dist_leaf_count (glusterd_volinfo_t *volinfo)
+{
+ int rcount = volinfo->replica_count;
+ int scount = volinfo->stripe_count;
+
+ return (rcount ? rcount : 1) * (scount ? scount : 1);
+}
+
int
glusterd_get_brickinfo (xlator_t *this, const char *brickname, int port,
gf_boolean_t localhost, glusterd_brickinfo_t **brickinfo)
@@ -3592,7 +6521,7 @@ glusterd_get_brickinfo (xlator_t *this, const char *brickname, int port,
list_for_each_entry (volinfo, &priv->volumes, vol_list) {
list_for_each_entry (tmpbrkinfo, &volinfo->bricks,
brick_list) {
- if (localhost && glusterd_is_local_addr (tmpbrkinfo->hostname))
+ if (localhost && !gf_is_local_addr (tmpbrkinfo->hostname))
continue;
if (!strcmp(tmpbrkinfo->path, brickname) &&
(tmpbrkinfo->port == port)) {
@@ -3663,27 +6592,27 @@ out:
return -1;
}
-#ifdef GF_LINUX_HOST_OS
-static int
+int
glusterd_get_brick_root (char *path, char **mount_point)
{
char *ptr = NULL;
+ char *mnt_pt = NULL;
struct stat brickstat = {0};
struct stat buf = {0};
if (!path)
goto err;
- *mount_point = gf_strdup (path);
- if (!*mount_point)
+ mnt_pt = gf_strdup (path);
+ if (!mnt_pt)
goto err;
- if (stat (*mount_point, &brickstat))
+ if (stat (mnt_pt, &brickstat))
goto err;
- while ((ptr = strrchr (*mount_point, '/')) &&
- ptr != *mount_point) {
+ while ((ptr = strrchr (mnt_pt, '/')) &&
+ ptr != mnt_pt) {
*ptr = '\0';
- if (stat (*mount_point, &buf)) {
+ if (stat (mnt_pt, &buf)) {
gf_log (THIS->name, GF_LOG_ERROR, "error in "
"stat: %s", strerror (errno));
goto err;
@@ -3695,20 +6624,21 @@ glusterd_get_brick_root (char *path, char **mount_point)
}
}
- if (ptr == *mount_point) {
+ if (ptr == mnt_pt) {
if (stat ("/", &buf)) {
gf_log (THIS->name, GF_LOG_ERROR, "error in "
"stat: %s", strerror (errno));
goto err;
}
if (brickstat.st_dev == buf.st_dev)
- strcpy (*mount_point, "/");
+ strcpy (mnt_pt, "/");
}
+ *mount_point = mnt_pt;
return 0;
err:
- GF_FREE (*mount_point);
+ GF_FREE (mnt_pt);
return -1;
}
@@ -3786,6 +6716,17 @@ glusterd_add_inode_size_to_dict (dict_t *dict, int count)
"size for %s : %s package missing", fs_name,
((strcmp (fs_name, "xfs")) ?
"e2fsprogs" : "xfsprogs"));
+ /*
+ * Runner_start might return an error after the child has
+ * been forked, e.g. if the program isn't there. In that
+ * case, we still need to call runner_end to reap the
+ * child and free resources. Fortunately, that seems to
+ * be harmless for other kinds of failures.
+ */
+ if (runner_end(&runner)) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "double failure calling runner_end");
+ }
goto out;
}
@@ -3830,6 +6771,31 @@ glusterd_add_inode_size_to_dict (dict_t *dict, int count)
return ret;
}
+struct mntent *
+glusterd_get_mnt_entry_info (char *mnt_pt, FILE *mtab)
+{
+ struct mntent *entry = NULL;
+
+ mtab = setmntent (_PATH_MOUNTED, "r");
+ if (!mtab)
+ goto out;
+
+ entry = getmntent (mtab);
+
+ while (1) {
+ if (!entry)
+ goto out;
+
+ if (!strcmp (entry->mnt_dir, mnt_pt) &&
+ strcmp (entry->mnt_type, "rootfs"))
+ break;
+ entry = getmntent (mtab);
+ }
+
+out:
+ return entry;
+}
+
static int
glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo,
dict_t *dict, int count)
@@ -3838,9 +6804,6 @@ glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo,
char key[1024] = {0};
char base_key[1024] = {0};
char *mnt_pt = NULL;
- char *fs_name = NULL;
- char *mnt_options = NULL;
- char *device = NULL;
FILE *mtab = NULL;
struct mntent *entry = NULL;
@@ -3850,31 +6813,17 @@ glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo,
if (ret)
goto out;
- mtab = setmntent (_PATH_MOUNTED, "r");
- if (!mtab) {
+ entry = glusterd_get_mnt_entry_info (mnt_pt, mtab);
+ if (!entry) {
ret = -1;
goto out;
}
- entry = getmntent (mtab);
-
- while (1) {
- if (!entry) {
- ret = -1;
- goto out;
- }
- if (!strcmp (entry->mnt_dir, mnt_pt) &&
- strcmp (entry->mnt_type, "rootfs"))
- break;
- entry = getmntent (mtab);
- }
-
/* get device file */
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "%s.device", base_key);
- device = gf_strdup (entry->mnt_fsname);
- ret = dict_set_dynstr (dict, key, device);
+ ret = dict_set_dynstr_with_alloc (dict, key, entry->mnt_fsname);
if (ret)
goto out;
@@ -3882,8 +6831,7 @@ glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo,
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "%s.fs_name", base_key);
- fs_name = gf_strdup (entry->mnt_type);
- ret = dict_set_dynstr (dict, key, fs_name);
+ ret = dict_set_dynstr_with_alloc (dict, key, entry->mnt_type);
if (ret)
goto out;
@@ -3891,8 +6839,7 @@ glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo,
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "%s.mnt_options", base_key);
- mnt_options = gf_strdup (entry->mnt_opts);
- ret = dict_set_dynstr (dict, key, mnt_options);
+ ret = dict_set_dynstr_with_alloc (dict, key, entry->mnt_opts);
out:
GF_FREE (mnt_pt);
@@ -3901,7 +6848,45 @@ glusterd_add_brick_mount_details (glusterd_brickinfo_t *brickinfo,
return ret;
}
-#endif
+
+char*
+glusterd_get_brick_mount_details (glusterd_brickinfo_t *brickinfo)
+{
+ int ret = -1;
+ char *mnt_pt = NULL;
+ char *device = NULL;
+ FILE *mtab = NULL;
+ struct mntent *entry = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (brickinfo);
+
+ ret = glusterd_get_brick_root (brickinfo->path, &mnt_pt);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get mount point "
+ "for %s brick", brickinfo->path);
+ goto out;
+ }
+
+ entry = glusterd_get_mnt_entry_info (mnt_pt, mtab);
+ if (NULL == entry) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get mnt entry "
+ "for %s mount path", mnt_pt);
+ goto out;
+ }
+
+ /* get the fs_name/device */
+ device = gf_strdup (entry->mnt_fsname);
+
+out:
+ if (NULL != mtab) {
+ endmntent (mtab);
+ }
+
+ return device;
+}
int
glusterd_add_brick_detail_to_dict (glusterd_volinfo_t *volinfo,
@@ -3975,13 +6960,12 @@ glusterd_add_brick_detail_to_dict (glusterd_volinfo_t *volinfo,
if (ret)
goto out;
}
-#ifdef GF_LINUX_HOST_OS
+
ret = glusterd_add_brick_mount_details (brickinfo, dict, count);
if (ret)
goto out;
ret = glusterd_add_inode_size_to_dict (dict, count);
-#endif
out:
if (ret)
gf_log (this->name, GF_LOG_DEBUG, "Error adding brick"
@@ -4001,7 +6985,6 @@ glusterd_add_brick_to_dict (glusterd_volinfo_t *volinfo,
char key[1024] = {0};
char base_key[1024] = {0};
char pidfile[PATH_MAX] = {0};
- char path[PATH_MAX] = {0};
xlator_t *this = NULL;
glusterd_conf_t *priv = NULL;
@@ -4028,17 +7011,24 @@ glusterd_add_brick_to_dict (glusterd_volinfo_t *volinfo,
if (ret)
goto out;
+ /* add peer uuid */
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "%s.peerid", base_key);
+ ret = dict_set_dynstr_with_alloc (dict, key,
+ uuid_utoa (brickinfo->uuid));
+ if (ret) {
+ goto out;
+ }
+
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "%s.port", base_key);
ret = dict_set_int32 (dict, key, brickinfo->port);
if (ret)
goto out;
- GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv);
- GLUSTERD_GET_BRICK_PIDFILE (pidfile, path, brickinfo->hostname,
- brickinfo->path);
+ GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, priv);
- brick_online = glusterd_is_service_running (pidfile, &pid);
+ brick_online = gf_is_service_running (pidfile, &pid);
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "%s.pid", base_key);
@@ -4124,11 +7114,14 @@ glusterd_friend_find_by_uuid (uuid_t uuid,
int ret = -1;
glusterd_conf_t *priv = NULL;
glusterd_peerinfo_t *entry = NULL;
+ xlator_t *this = NULL;
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (peerinfo);
*peerinfo = NULL;
- priv = THIS->private;
+ priv = this->private;
GF_ASSERT (priv);
@@ -4138,7 +7131,7 @@ glusterd_friend_find_by_uuid (uuid_t uuid,
list_for_each_entry (entry, &priv->peers, uuid_list) {
if (!uuid_compare (entry->uuid, uuid)) {
- gf_log ("glusterd", GF_LOG_DEBUG,
+ gf_log (this->name, GF_LOG_DEBUG,
"Friend found... state: %s",
glusterd_friend_sm_state_name_get (entry->state.state));
*peerinfo = entry;
@@ -4146,7 +7139,7 @@ glusterd_friend_find_by_uuid (uuid_t uuid,
}
}
- gf_log ("glusterd", GF_LOG_DEBUG, "Friend with uuid: %s, not found",
+ gf_log (this->name, GF_LOG_DEBUG, "Friend with uuid: %s, not found",
uuid_utoa (uuid));
return ret;
}
@@ -4257,17 +7250,18 @@ glusterd_hostname_to_uuid (char *hostname, uuid_t uuid)
ret = glusterd_friend_find_by_hostname (hostname, &peerinfo);
if (ret) {
- ret = glusterd_is_local_addr (hostname);
- if (ret)
- goto out;
- else
+ if (gf_is_local_addr (hostname)) {
uuid_copy (uuid, MY_UUID);
+ ret = 0;
+ } else {
+ goto out;
+ }
} else {
uuid_copy (uuid, peerinfo->uuid);
}
out:
- gf_log ("", GF_LOG_DEBUG, "returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "returning %d", ret);
return ret;
}
@@ -4291,8 +7285,7 @@ glusterd_brick_stop (glusterd_volinfo_t *volinfo,
if (uuid_is_null (brickinfo->uuid)) {
ret = glusterd_resolve_brick (brickinfo);
if (ret) {
- gf_log ("glusterd", GF_LOG_ERROR,
- "cannot resolve brick: %s:%s",
+ gf_log (this->name, GF_LOG_ERROR, FMTSTR_RESOLVE_BRICK,
brickinfo->hostname, brickinfo->path);
goto out;
}
@@ -4300,29 +7293,31 @@ glusterd_brick_stop (glusterd_volinfo_t *volinfo,
if (uuid_compare (brickinfo->uuid, MY_UUID)) {
ret = 0;
+ if (del_brick)
+ glusterd_delete_brick (volinfo, brickinfo);
goto out;
}
- gf_log ("", GF_LOG_INFO, "About to stop glusterfs"
+ gf_log (this->name, GF_LOG_DEBUG, "About to stop glusterfs"
" for brick %s:%s", brickinfo->hostname,
brickinfo->path);
ret = glusterd_volume_stop_glusterfs (volinfo, brickinfo, del_brick);
if (ret) {
- gf_log ("", GF_LOG_CRITICAL, "Unable to remove"
+ gf_log (this->name, GF_LOG_CRITICAL, "Unable to stop"
" brick: %s:%s", brickinfo->hostname,
brickinfo->path);
goto out;
}
out:
- gf_log ("", GF_LOG_DEBUG, "returning %d ", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "returning %d ", ret);
return ret;
}
int
glusterd_is_defrag_on (glusterd_volinfo_t *volinfo)
{
- return (volinfo->defrag != NULL);
+ return (volinfo->rebal.defrag != NULL);
}
gf_boolean_t
@@ -4370,41 +7365,54 @@ glusterd_new_brick_validate (char *brick, glusterd_brickinfo_t *brickinfo,
ret = glusterd_resolve_brick (newbrickinfo);
if (ret) {
- snprintf (op_errstr, len, "Host %s not a friend",
- newbrickinfo->hostname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", op_errstr);
+ snprintf(op_errstr, len, "Host %s is not in \'Peer "
+ "in Cluster\' state", newbrickinfo->hostname);
goto out;
}
- if (!uuid_compare (MY_UUID, newbrickinfo->uuid))
- goto brick_validation;
- ret = glusterd_friend_find_by_uuid (newbrickinfo->uuid, &peerinfo);
- if (ret)
- goto out;
- if ((!peerinfo->connected) ||
- (peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED)) {
- snprintf(op_errstr, len, "Host %s not connected",
- newbrickinfo->hostname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", op_errstr);
- ret = -1;
- goto out;
- }
-brick_validation:
- if (!glusterd_is_brickpath_available (newbrickinfo->uuid,
- newbrickinfo->path)) {
- snprintf(op_errstr, len, "Brick: %s not available. Brick may "
- "be containing or be contained by an existing brick",
- brick);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", op_errstr);
- ret = -1;
- goto out;
+ if (!uuid_compare (MY_UUID, newbrickinfo->uuid)) {
+ /* brick is local */
+ if (!glusterd_is_brickpath_available (newbrickinfo->uuid,
+ newbrickinfo->path)) {
+ snprintf(op_errstr, len, "Brick: %s not available."
+ " Brick may be containing or be contained "
+ "by an existing brick", brick);
+ ret = -1;
+ goto out;
+ }
+
} else {
- ret = 0;
+ ret = glusterd_friend_find_by_uuid (newbrickinfo->uuid,
+ &peerinfo);
+ if (ret) {
+ snprintf (op_errstr, len, "Failed to find host %s",
+ newbrickinfo->hostname);
+ goto out;
+ }
+
+ if ((!peerinfo->connected)) {
+ snprintf(op_errstr, len, "Host %s not connected",
+ newbrickinfo->hostname);
+ ret = -1;
+ goto out;
+ }
+
+ if (peerinfo->state.state != GD_FRIEND_STATE_BEFRIENDED) {
+ snprintf(op_errstr, len, "Host %s is not in \'Peer "
+ "in Cluster\' state",
+ newbrickinfo->hostname);
+ ret = -1;
+ goto out;
+ }
}
+
+ ret = 0;
out:
- if (is_allocated && newbrickinfo)
+ if (is_allocated)
glusterd_brickinfo_delete (newbrickinfo);
- gf_log (THIS->name, GF_LOG_DEBUG, "returning %d ", ret);
+ if (op_errstr[0] != '\0')
+ gf_log (this->name, GF_LOG_ERROR, "%s", op_errstr);
+ gf_log (this->name, GF_LOG_DEBUG, "returning %d ", ret);
return ret;
}
@@ -4412,8 +7420,8 @@ int
glusterd_is_rb_started(glusterd_volinfo_t *volinfo)
{
gf_log ("", GF_LOG_DEBUG,
- "is_rb_started:status=%d", volinfo->rb_status);
- return (volinfo->rb_status == GF_RB_STATUS_STARTED);
+ "is_rb_started:status=%d", volinfo->rep_brick.rb_status);
+ return (volinfo->rep_brick.rb_status == GF_RB_STATUS_STARTED);
}
@@ -4421,9 +7429,9 @@ int
glusterd_is_rb_paused ( glusterd_volinfo_t *volinfo)
{
gf_log ("", GF_LOG_DEBUG,
- "is_rb_paused:status=%d", volinfo->rb_status);
+ "is_rb_paused:status=%d", volinfo->rep_brick.rb_status);
- return (volinfo->rb_status == GF_RB_STATUS_PAUSED);
+ return (volinfo->rep_brick.rb_status == GF_RB_STATUS_PAUSED);
}
inline int
@@ -4431,10 +7439,10 @@ glusterd_set_rb_status (glusterd_volinfo_t *volinfo, gf_rb_status_t status)
{
gf_log ("", GF_LOG_DEBUG,
"setting status from %d to %d",
- volinfo->rb_status,
+ volinfo->rep_brick.rb_status,
status);
- volinfo->rb_status = status;
+ volinfo->rep_brick.rb_status = status;
return 0;
}
@@ -4442,19 +7450,27 @@ inline int
glusterd_rb_check_bricks (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t *src, glusterd_brickinfo_t *dst)
{
- if (!volinfo->src_brick || !volinfo->dst_brick)
+ glusterd_replace_brick_t *rb = NULL;
+
+ GF_ASSERT (volinfo);
+
+ rb = &volinfo->rep_brick;
+
+ if (!rb->src_brick || !rb->dst_brick)
return -1;
- if (strcmp (volinfo->src_brick->hostname, src->hostname) ||
- strcmp (volinfo->src_brick->path, src->path)) {
+ if (strcmp (rb->src_brick->hostname, src->hostname) ||
+ strcmp (rb->src_brick->path, src->path)) {
gf_log("", GF_LOG_ERROR, "Replace brick src bricks differ");
return -1;
}
- if (strcmp (volinfo->dst_brick->hostname, dst->hostname) ||
- strcmp (volinfo->dst_brick->path, dst->path)) {
+
+ if (strcmp (rb->dst_brick->hostname, dst->hostname) ||
+ strcmp (rb->dst_brick->path, dst->path)) {
gf_log ("", GF_LOG_ERROR, "Replace brick dst bricks differ");
return -1;
}
+
return 0;
}
@@ -4553,8 +7569,13 @@ out:
}
if (*in_use) {
- snprintf (msg, sizeof (msg), "%s or a prefix of it is "
- "already part of a volume", path);
+ if (!strcmp (path, curdir)) {
+ snprintf (msg, sizeof (msg), "%s is already part of a "
+ "volume", path);
+ } else {
+ snprintf (msg, sizeof (msg), "parent directory %s is "
+ "already part of a volume", curdir);
+ }
}
if (strlen (msg)) {
@@ -4566,16 +7587,13 @@ out:
}
int
-glusterd_brick_create_path (char *host, char *path, uuid_t uuid,
- char **op_errstr)
+glusterd_check_and_set_brick_xattr (char *host, char *path, uuid_t uuid,
+ char **op_errstr, gf_boolean_t is_force)
{
int ret = -1;
char msg[2048] = {0,};
gf_boolean_t in_use = _gf_false;
-
- ret = mkdir_p (path, 0777, _gf_true);
- if (ret)
- goto out;
+ int flags = 0;
/* Check for xattr support in backend fs */
ret = sys_lsetxattr (path, "trusted.glusterfs.test",
@@ -4589,20 +7607,23 @@ glusterd_brick_create_path (char *host, char *path, uuid_t uuid,
} else {
sys_lremovexattr (path, "trusted.glusterfs.test");
-
}
ret = glusterd_is_path_in_use (path, &in_use, op_errstr);
if (ret)
goto out;
- if (in_use) {
+ if (in_use && !is_force) {
ret = -1;
goto out;
}
+
+ if (!is_force)
+ flags = XATTR_CREATE;
+
ret = sys_lsetxattr (path, GF_XATTR_VOL_ID_KEY, uuid, 16,
- XATTR_CREATE);
+ flags);
if (ret) {
snprintf (msg, sizeof (msg), "Failed to set extended "
"attributes %s, reason: %s",
@@ -4616,7 +7637,6 @@ out:
*op_errstr = gf_strdup (msg);
return ret;
-
}
int
@@ -4658,8 +7678,7 @@ glusterd_sm_tr_log_transition_add_to_dict (dict_t *dict,
snprintf (key, sizeof (key), "log%d-time", count);
gf_time_fmt (timestr, sizeof timestr, log->transitions[i].time,
gf_timefmt_FT);
- str = gf_strdup (timestr);
- ret = dict_set_dynstr (dict, key, str);
+ ret = dict_set_dynstr_with_alloc (dict, key, timestr);
if (ret)
goto out;
@@ -4757,6 +7776,10 @@ glusterd_sm_tr_log_transition_add (glusterd_sm_tr_log_t *log,
glusterd_sm_transition_t *transitions = NULL;
int ret = -1;
int next = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (log);
if (!log)
@@ -4779,18 +7802,18 @@ glusterd_sm_tr_log_transition_add (glusterd_sm_tr_log_t *log,
if (log->count < log->size)
log->count++;
ret = 0;
- gf_log ("glusterd", GF_LOG_DEBUG, "Transitioning from '%s' to '%s' "
+ gf_log (this->name, GF_LOG_DEBUG, "Transitioning from '%s' to '%s' "
"due to event '%s'", log->state_name_get (old_state),
log->state_name_get (new_state), log->event_name_get (event));
out:
- gf_log ("", GF_LOG_DEBUG, "returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "returning %d", ret);
return ret;
}
int
glusterd_peerinfo_new (glusterd_peerinfo_t **peerinfo,
- glusterd_friend_sm_state_t state,
- uuid_t *uuid, const char *hostname)
+ glusterd_friend_sm_state_t state, uuid_t *uuid,
+ const char *hostname, int port)
{
glusterd_peerinfo_t *new_peer = NULL;
int ret = -1;
@@ -4820,6 +7843,9 @@ glusterd_peerinfo_new (glusterd_peerinfo_t **peerinfo,
if (ret)
goto out;
+ if (new_peer->state.state == GD_FRIEND_STATE_BEFRIENDED)
+ new_peer->quorum_contrib = QUORUM_WAITING;
+ new_peer->port = port;
*peerinfo = new_peer;
out:
if (ret && new_peer)
@@ -4870,7 +7896,7 @@ glusterd_remove_pending_entry (struct list_head *list, void *elem)
}
}
out:
- gf_log ("", GF_LOG_DEBUG, "returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "returning %d", ret);
return ret;
}
@@ -4910,9 +7936,9 @@ glusterd_delete_volume (glusterd_volinfo_t *volinfo)
if (ret)
goto out;
- ret = glusterd_volinfo_delete (volinfo);
+ glusterd_volinfo_remove (volinfo);
out:
- gf_log ("", GF_LOG_DEBUG, "returning %d", ret);
+ gf_log (THIS->name, GF_LOG_DEBUG, "returning %d", ret);
return ret;
}
@@ -4921,11 +7947,15 @@ glusterd_delete_brick (glusterd_volinfo_t* volinfo,
glusterd_brickinfo_t *brickinfo)
{
int ret = 0;
+ char voldir[PATH_MAX] = {0,};
+ glusterd_conf_t *priv = THIS->private;
GF_ASSERT (volinfo);
GF_ASSERT (brickinfo);
+ GLUSTERD_GET_VOLUME_DIR(voldir, volinfo, priv);
+
glusterd_delete_volfile (volinfo, brickinfo);
- glusterd_store_delete_brick (volinfo, brickinfo);
+ glusterd_store_delete_brick (brickinfo, voldir);
glusterd_brickinfo_delete (brickinfo);
volinfo->brick_count--;
return ret;
@@ -4947,12 +7977,92 @@ glusterd_delete_all_bricks (glusterd_volinfo_t* volinfo)
}
int
+glusterd_get_local_brickpaths (glusterd_volinfo_t *volinfo, char **pathlist)
+{
+ char **path_tokens = NULL;
+ char *tmp_path_list = NULL;
+ char path[PATH_MAX] = "";
+ int32_t count = 0;
+ int32_t pathlen = 0;
+ int32_t total_len = 0;
+ int32_t ret = 0;
+ int i = 0;
+ glusterd_brickinfo_t *brickinfo = NULL;
+
+ if ((!volinfo) || (!pathlist))
+ goto out;
+
+ path_tokens = GF_CALLOC (sizeof(char*), volinfo->brick_count,
+ gf_gld_mt_charptr);
+ if (!path_tokens) {
+ gf_log ("", GF_LOG_DEBUG, "Could not allocate memory.");
+ ret = -1;
+ goto out;
+ }
+
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ if (uuid_compare (brickinfo->uuid, MY_UUID))
+ continue;
+
+ pathlen = snprintf (path, sizeof(path),
+ "--path=%s ", brickinfo->path);
+ if (pathlen < sizeof(path))
+ path[pathlen] = '\0';
+ else
+ path[sizeof(path)-1] = '\0';
+ path_tokens[count] = gf_strdup (path);
+ if (!path_tokens[count]) {
+ gf_log ("", GF_LOG_DEBUG,
+ "Could not allocate memory.");
+ ret = -1;
+ goto out;
+ }
+ count++;
+ total_len += pathlen;
+ }
+
+ tmp_path_list = GF_CALLOC (sizeof(char), total_len + 1,
+ gf_gld_mt_char);
+ if (!tmp_path_list) {
+ gf_log ("", GF_LOG_DEBUG, "Could not allocate memory.");
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < count; i++)
+ strcat (tmp_path_list, path_tokens[i]);
+
+ if (count)
+ *pathlist = tmp_path_list;
+
+ ret = count;
+out:
+ for (i = 0; i < count; i++) {
+ GF_FREE (path_tokens[i]);
+ path_tokens[i] = NULL;
+ }
+
+ GF_FREE (path_tokens);
+ path_tokens = NULL;
+
+ if (ret == 0) {
+ gf_log ("", GF_LOG_DEBUG, "No Local Bricks Present.");
+ GF_FREE (tmp_path_list);
+ tmp_path_list = NULL;
+ }
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
glusterd_start_gsync (glusterd_volinfo_t *master_vol, char *slave,
- char *glusterd_uuid_str, char **op_errstr)
+ char *path_list, char *conf_path,
+ char *glusterd_uuid_str,
+ char **op_errstr)
{
int32_t ret = 0;
int32_t status = 0;
- char buf[PATH_MAX] = {0,};
char uuid_str [64] = {0};
runner_t runner = {0,};
xlator_t *this = NULL;
@@ -4965,47 +8075,45 @@ glusterd_start_gsync (glusterd_volinfo_t *master_vol, char *slave,
GF_ASSERT (priv);
uuid_utoa_r (MY_UUID, uuid_str);
- if (strcmp (uuid_str, glusterd_uuid_str))
- goto out;
-
- ret = gsync_status (master_vol->volname, slave, &status);
- if (status == 0)
- goto out;
- snprintf (buf, PATH_MAX, "%s/"GEOREP"/%s", priv->workdir, master_vol->volname);
- ret = mkdir_p (buf, 0777, _gf_true);
- if (ret) {
- errcode = -1;
+ if (!path_list) {
+ ret = 0;
+ gf_log ("", GF_LOG_DEBUG, "No Bricks in this node."
+ " Not starting gsyncd.");
goto out;
}
- snprintf (buf, PATH_MAX, DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/%s",
- master_vol->volname);
- ret = mkdir_p (buf, 0777, _gf_true);
- if (ret) {
- errcode = -1;
+ ret = gsync_status (master_vol->volname, slave, conf_path, &status);
+ if (status == 0)
goto out;
- }
uuid_utoa_r (master_vol->volume_id, uuid_str);
runinit (&runner);
- runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, priv->workdir);
+ runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd",
+ path_list, "-c", NULL);
+ runner_argprintf (&runner, "%s", conf_path);
runner_argprintf (&runner, ":%s", master_vol->volname);
runner_add_args (&runner, slave, "--config-set", "session-owner",
uuid_str, NULL);
+ synclock_unlock (&priv->big_lock);
ret = runner_run (&runner);
+ synclock_lock (&priv->big_lock);
if (ret == -1) {
errcode = -1;
goto out;
}
runinit (&runner);
- runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd", "--monitor", "-c", NULL);
- runner_argprintf (&runner, "%s/"GSYNC_CONF, priv->workdir);
+ runner_add_args (&runner, GSYNCD_PREFIX"/gsyncd",
+ path_list, "--monitor", "-c", NULL);
+ runner_argprintf (&runner, "%s", conf_path);
runner_argprintf (&runner, ":%s", master_vol->volname);
+ runner_argprintf (&runner, "--glusterd-uuid=%s",
+ uuid_utoa (priv->uuid));
runner_add_arg (&runner, slave);
+ synclock_unlock (&priv->big_lock);
ret = runner_run (&runner);
+ synclock_lock (&priv->big_lock);
if (ret == -1) {
gf_asprintf (op_errstr, GEOREP" start failed for %s %s",
master_vol->volname, slave);
@@ -5017,7 +8125,7 @@ glusterd_start_gsync (glusterd_volinfo_t *master_vol, char *slave,
out:
if ((ret != 0) && errcode == -1) {
if (op_errstr)
- *op_errstr = gf_strdup ("internal error, cannot start"
+ *op_errstr = gf_strdup ("internal error, cannot start "
"the " GEOREP " session");
}
@@ -5026,17 +8134,38 @@ out:
}
int32_t
-glusterd_recreate_bricks (glusterd_conf_t *conf)
+glusterd_recreate_volfiles (glusterd_conf_t *conf)
{
glusterd_volinfo_t *volinfo = NULL;
int ret = 0;
+ int op_ret = 0;
GF_ASSERT (conf);
list_for_each_entry (volinfo, &conf->volumes, vol_list) {
ret = generate_brick_volfiles (volinfo);
+ if (ret) {
+ gf_log ("glusterd", GF_LOG_ERROR, "Failed to "
+ "regenerate brick volfiles for %s",
+ volinfo->volname);
+ op_ret = ret;
+ }
+ ret = generate_client_volfiles (volinfo, GF_CLIENT_TRUSTED);
+ if (ret) {
+ gf_log ("glusterd", GF_LOG_ERROR, "Failed to "
+ "regenerate trusted client volfiles for %s",
+ volinfo->volname);
+ op_ret = ret;
+ }
+ ret = generate_client_volfiles (volinfo, GF_CLIENT_OTHER);
+ if (ret) {
+ gf_log ("glusterd", GF_LOG_ERROR, "Failed to "
+ "regenerate client volfiles for %s",
+ volinfo->volname);
+ op_ret = ret;
+ }
}
- return ret;
+ return op_ret;
}
int32_t
@@ -5046,7 +8175,7 @@ glusterd_handle_upgrade_downgrade (dict_t *options, glusterd_conf_t *conf)
char *type = NULL;
gf_boolean_t upgrade = _gf_false;
gf_boolean_t downgrade = _gf_false;
- gf_boolean_t regenerate_brick_volfiles = _gf_false;
+ gf_boolean_t regenerate_volfiles = _gf_false;
gf_boolean_t terminate = _gf_false;
ret = dict_get_str (options, "upgrade", &type);
@@ -5059,7 +8188,7 @@ glusterd_handle_upgrade_downgrade (dict_t *options, glusterd_conf_t *conf)
goto out;
}
if (_gf_true == upgrade)
- regenerate_brick_volfiles = _gf_true;
+ regenerate_volfiles = _gf_true;
}
ret = dict_get_str (options, "downgrade", &type);
@@ -5084,8 +8213,8 @@ glusterd_handle_upgrade_downgrade (dict_t *options, glusterd_conf_t *conf)
ret = 0;
else
terminate = _gf_true;
- if (regenerate_brick_volfiles) {
- ret = glusterd_recreate_bricks (conf);
+ if (regenerate_volfiles) {
+ ret = glusterd_recreate_volfiles (conf);
}
out:
if (terminate && (ret == 0))
@@ -5160,7 +8289,6 @@ glusterd_brick_statedump (glusterd_volinfo_t *volinfo,
xlator_t *this = NULL;
glusterd_conf_t *conf = NULL;
char pidfile_path[PATH_MAX] = {0,};
- char path[PATH_MAX] = {0,};
char dumpoptions_path[PATH_MAX] = {0,};
FILE *pidfile = NULL;
pid_t pid = -1;
@@ -5185,9 +8313,7 @@ glusterd_brick_statedump (glusterd_volinfo_t *volinfo,
goto out;
}
- GLUSTERD_GET_VOLUME_DIR (path, volinfo, conf);
- GLUSTERD_GET_BRICK_PIDFILE (pidfile_path, path, brickinfo->hostname,
- brickinfo->path);
+ GLUSTERD_GET_BRICK_PIDFILE (pidfile_path, volinfo, brickinfo, conf);
pidfile = fopen (pidfile_path, "r");
if (!pidfile) {
@@ -5205,7 +8331,7 @@ glusterd_brick_statedump (glusterd_volinfo_t *volinfo,
}
snprintf (dumpoptions_path, sizeof (dumpoptions_path),
- "/tmp/glusterdump.%d.options", pid);
+ DEFAULT_VAR_RUN_DIRECTORY"/glusterdump.%d.options", pid);
ret = glusterd_set_dump_options (dumpoptions_path, options, option_cnt);
if (ret < 0) {
gf_log ("", GF_LOG_ERROR, "error while parsing the statedump "
@@ -5278,7 +8404,7 @@ glusterd_nfs_statedump (char *options, int option_cnt, char **op_errstr)
}
snprintf (dumpoptions_path, sizeof (dumpoptions_path),
- "/tmp/glusterdump.%d.options", pid);
+ DEFAULT_VAR_RUN_DIRECTORY"/glusterdump.%d.options", pid);
ret = glusterd_set_dump_options (dumpoptions_path, options, option_cnt);
if (ret < 0) {
gf_log ("", GF_LOG_ERROR, "error while parsing the statedump "
@@ -5303,29 +8429,117 @@ out:
return ret;
}
-/* Checks if the given peer contains all the bricks belonging to the
- * given volume. Returns true if it does else returns false
+int
+glusterd_quotad_statedump (char *options, int option_cnt, char **op_errstr)
+{
+ int ret = -1;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ char pidfile_path[PATH_MAX] = {0,};
+ char path[PATH_MAX] = {0,};
+ FILE *pidfile = NULL;
+ pid_t pid = -1;
+ char dumpoptions_path[PATH_MAX] = {0,};
+ char *option = NULL;
+ char *tmpptr = NULL;
+ char *dup_options = NULL;
+ char msg[256] = {0,};
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ dup_options = gf_strdup (options);
+ option = strtok_r (dup_options, " ", &tmpptr);
+ if (strcmp (option, "quotad")) {
+ snprintf (msg, sizeof (msg), "for quotad statedump, options "
+ "should be after the key 'quotad'");
+ *op_errstr = gf_strdup (msg);
+ ret = -1;
+ goto out;
+ }
+
+ GLUSTERD_GET_QUOTAD_DIR (path, conf);
+ GLUSTERD_GET_QUOTAD_PIDFILE (pidfile_path, path);
+
+ pidfile = fopen (pidfile_path, "r");
+ if (!pidfile) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to open pidfile: %s",
+ pidfile_path);
+ ret = -1;
+ goto out;
+ }
+
+ ret = fscanf (pidfile, "%d", &pid);
+ if (ret <= 0) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get pid of quotad "
+ "process");
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (dumpoptions_path, sizeof (dumpoptions_path),
+ DEFAULT_VAR_RUN_DIRECTORY"/glusterdump.%d.options", pid);
+ ret = glusterd_set_dump_options (dumpoptions_path, options, option_cnt);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "error while parsing "
+ "statedump options");
+ ret = -1;
+ goto out;
+ }
+
+ gf_log (this->name, GF_LOG_INFO, "Performing statedump on quotad with "
+ "pid %d", pid);
+
+ kill (pid, SIGUSR1);
+
+ sleep (1);
+
+ ret = 0;
+out:
+ if (pidfile)
+ fclose (pidfile);
+ unlink (dumpoptions_path);
+ GF_FREE (dup_options);
+ return ret;
+}
+
+/* Checks if the given peer contains bricks belonging to the given volume.
+ * Returns,
+ * 2 - if peer contains all the bricks
+ * 1 - if peer contains at least 1 brick
+ * 0 - if peer contains no bricks
*/
-gf_boolean_t
+int
glusterd_friend_contains_vol_bricks (glusterd_volinfo_t *volinfo,
uuid_t friend_uuid)
{
- gf_boolean_t ret = _gf_true;
+ int ret = 0;
glusterd_brickinfo_t *brickinfo = NULL;
+ int count = 0;
GF_ASSERT (volinfo);
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
- if (uuid_compare (friend_uuid, brickinfo->uuid)) {
- ret = _gf_false;
- break;
+ if (!uuid_compare (brickinfo->uuid, friend_uuid)) {
+ count++;
}
}
+
+ if (count) {
+ if (count == volinfo->brick_count)
+ ret = 2;
+ else
+ ret = 1;
+ }
gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-/* Remove all volumes which completely belong to given friend
+/* Cleanup the stale volumes left behind in the cluster. The volumes which are
+ * contained completely within the detached peer are stale with respect to the
+ * cluster.
*/
int
glusterd_friend_remove_cleanup_vols (uuid_t uuid)
@@ -5340,7 +8554,7 @@ glusterd_friend_remove_cleanup_vols (uuid_t uuid)
list_for_each_entry_safe (volinfo, tmp_volinfo,
&priv->volumes, vol_list) {
- if (glusterd_friend_contains_vol_bricks (volinfo, uuid)) {
+ if (glusterd_friend_contains_vol_bricks (volinfo, uuid) == 2) {
gf_log (THIS->name, GF_LOG_INFO,
"Deleting stale volume %s", volinfo->volname);
ret = glusterd_delete_volume (volinfo);
@@ -5434,22 +8648,44 @@ int
glusterd_volume_defrag_restart (glusterd_volinfo_t *volinfo, char *op_errstr,
size_t len, int cmd, defrag_cbk_fn_t cbk)
{
- glusterd_conf_t *priv = NULL;
- char pidfile[PATH_MAX];
- int ret = -1;
- pid_t pid;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ char pidfile[PATH_MAX] = {0,};
+ int ret = -1;
+ pid_t pid = 0;
- priv = THIS->private;
+ this = THIS;
+ GF_ASSERT (this);
+
+ priv = this->private;
if (!priv)
return ret;
- GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv);
-
- if (!glusterd_is_service_running (pidfile, &pid)) {
+ /* Don't start the rebalance process if the stautus is already
+ * completed, stopped or failed. If the status is started, check if
+ * there is an existing process already and connect to it. If not, then
+ * start the rebalance process
+ */
+ switch (volinfo->rebal.defrag_status) {
+ case GF_DEFRAG_STATUS_COMPLETE:
+ case GF_DEFRAG_STATUS_STOPPED:
+ case GF_DEFRAG_STATUS_FAILED:
+ break;
+ case GF_DEFRAG_STATUS_STARTED:
+ GLUSTERD_GET_DEFRAG_PID_FILE(pidfile, volinfo, priv);
+ if (gf_is_service_running (pidfile, &pid)) {
+ glusterd_rebalance_rpc_create (volinfo, _gf_true);
+ break;
+ }
+ case GF_DEFRAG_STATUS_NOT_STARTED:
glusterd_handle_defrag_start (volinfo, op_errstr, len, cmd,
- cbk);
- } else {
- glusterd_rebalance_rpc_create (volinfo, priv, cmd);
+ cbk, volinfo->rebal.op);
+ break;
+ default:
+ gf_log (this->name, GF_LOG_ERROR, "Unknown defrag status (%d)."
+ "Not starting rebalance process for %s.",
+ volinfo->rebal.defrag_status, volinfo->volname);
+ break;
}
return ret;
@@ -5463,25 +8699,29 @@ glusterd_restart_rebalance (glusterd_conf_t *conf)
char op_errstr[256];
list_for_each_entry (volinfo, &conf->volumes, vol_list) {
- if (!volinfo->defrag_cmd)
+ if (!volinfo->rebal.defrag_cmd)
+ continue;
+ if (!gd_should_i_start_rebalance (volinfo))
continue;
glusterd_volume_defrag_restart (volinfo, op_errstr, 256,
- volinfo->defrag_cmd, NULL);
+ volinfo->rebal.defrag_cmd, NULL);
}
return ret;
}
-
void
glusterd_volinfo_reset_defrag_stats (glusterd_volinfo_t *volinfo)
{
+ glusterd_rebalance_t *rebal = NULL;
GF_ASSERT (volinfo);
- volinfo->rebalance_files = 0;
- volinfo->rebalance_data = 0;
- volinfo->lookedup_files = 0;
- volinfo->rebalance_failures = 0;
- volinfo->rebalance_time = 0;
+ rebal = &volinfo->rebal;
+ rebal->rebalance_files = 0;
+ rebal->rebalance_data = 0;
+ rebal->lookedup_files = 0;
+ rebal->rebalance_failures = 0;
+ rebal->rebalance_time = 0;
+ rebal->skipped_files = 0;
}
@@ -5537,21 +8777,29 @@ glusterd_validate_volume_id (dict_t *op_dict, glusterd_volinfo_t *volinfo)
int ret = -1;
char *volid_str = NULL;
uuid_t vol_uid = {0, };
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
ret = dict_get_str (op_dict, "vol-id", &volid_str);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Failed to get volume id");
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get volume id for "
+ "volume %s", volinfo->volname);
goto out;
}
ret = uuid_parse (volid_str, vol_uid);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "Failed to parse uuid");
+ gf_log (this->name, GF_LOG_ERROR, "Failed to parse volume id "
+ "for volume %s", volinfo->volname);
goto out;
}
if (uuid_compare (vol_uid, volinfo->volume_id)) {
- gf_log (THIS->name, GF_LOG_ERROR, "Volume ids are different. "
- "Possibly a split brain among peers.");
+ gf_log (this->name, GF_LOG_ERROR, "Volume ids of volume %s - %s"
+ " and %s - are different. Possibly a split brain among "
+ "peers.", volinfo->volname, volid_str,
+ uuid_utoa (volinfo->volume_id));
ret = -1;
goto out;
}
@@ -5570,6 +8818,7 @@ glusterd_defrag_volume_status_update (glusterd_volinfo_t *volinfo,
uint64_t lookup = 0;
gf_defrag_status_t status = GF_DEFRAG_STATUS_NOT_STARTED;
uint64_t failures = 0;
+ uint64_t skipped = 0;
xlator_t *this = NULL;
double run_time = 0;
@@ -5600,24 +8849,95 @@ glusterd_defrag_volume_status_update (glusterd_volinfo_t *volinfo,
gf_log (this->name, GF_LOG_TRACE,
"failed to get failure count");
+ ret = dict_get_uint64 (rsp_dict, "skipped", &skipped);
+ if (ret)
+ gf_log (this->name, GF_LOG_TRACE,
+ "failed to get skipped count");
+
ret = dict_get_double (rsp_dict, "run-time", &run_time);
if (ret)
gf_log (this->name, GF_LOG_TRACE,
"failed to get run-time");
if (files)
- volinfo->rebalance_files = files;
+ volinfo->rebal.rebalance_files = files;
if (size)
- volinfo->rebalance_data = size;
+ volinfo->rebal.rebalance_data = size;
if (lookup)
- volinfo->lookedup_files = lookup;
+ volinfo->rebal.lookedup_files = lookup;
if (status)
- volinfo->defrag_status = status;
+ volinfo->rebal.defrag_status = status;
if (failures)
- volinfo->rebalance_failures = failures;
+ volinfo->rebal.rebalance_failures = failures;
+ if (skipped)
+ volinfo->rebal.skipped_files = skipped;
if (run_time)
- volinfo->rebalance_time = run_time;
+ volinfo->rebal.rebalance_time = run_time;
+
+ return ret;
+}
+
+int
+glusterd_check_topology_identical (const char *filename1,
+ const char *filename2,
+ gf_boolean_t *identical)
+{
+ int ret = -1; /* FAILURE */
+ xlator_t *this = THIS;
+ FILE *fp1 = NULL;
+ FILE *fp2 = NULL;
+ glusterfs_graph_t *grph1 = NULL;
+ glusterfs_graph_t *grph2 = NULL;
+
+ /* Invalid xlator, Nothing to do */
+ if (!this)
+ return (-1);
+
+ /* Sanitize the inputs */
+ GF_VALIDATE_OR_GOTO (this->name, filename1, out);
+ GF_VALIDATE_OR_GOTO (this->name, filename2, out);
+ GF_VALIDATE_OR_GOTO (this->name, identical, out);
+
+ /* fopen() the volfile1 to create the graph */
+ fp1 = fopen (filename1, "r");
+ if (fp1 == NULL) {
+ gf_log (this->name, GF_LOG_ERROR, "fopen() on file: %s failed "
+ "(%s)", filename1, strerror (errno));
+ goto out;
+ }
+
+ /* fopen() the volfile2 to create the graph */
+ fp2 = fopen (filename2, "r");
+ if (fp2 == NULL) {
+ gf_log (this->name, GF_LOG_ERROR, "fopen() on file: %s failed "
+ "(%s)", filename2, strerror (errno));
+ goto out;
+ }
+
+ /* create the graph for filename1 */
+ grph1 = glusterfs_graph_construct(fp1);
+ if (grph1 == NULL)
+ goto out;
+
+ /* create the graph for filename2 */
+ grph2 = glusterfs_graph_construct(fp2);
+ if (grph2 == NULL)
+ goto out;
+
+ /* compare the graph topology */
+ *identical = is_graph_topology_equal(grph1, grph2);
+ ret = 0; /* SUCCESS */
+out:
+ if (fp1)
+ fclose(fp1);
+ if (fp2)
+ fclose(fp2);
+ if (grph1)
+ glusterfs_graph_destroy(grph1);
+ if (grph2)
+ glusterfs_graph_destroy(grph2);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
return ret;
}
@@ -5734,6 +9054,10 @@ glusterd_to_cli (rpcsvc_request_t *req, gf_cli_rsp *arg, struct iovec *payload,
int op_ret = 0;
char *op_errstr = NULL;
int op_errno = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
op_ret = arg->op_ret;
op_errstr = arg->op_errstr;
@@ -5741,19 +9065,22 @@ glusterd_to_cli (rpcsvc_request_t *req, gf_cli_rsp *arg, struct iovec *payload,
ret = dict_get_str (dict, "cmd-str", &cmd);
if (ret)
- gf_log ("glusterd", GF_LOG_ERROR, "Failed to get command string");
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get command "
+ "string");
if (cmd) {
if (op_ret)
gf_cmd_log ("", "%s : FAILED %s %s", cmd,
- (op_errstr)? ":":" ",
- (op_errstr)? op_errstr: " ");
+ (op_errstr)? ":" : " ",
+ (op_errstr)? op_errstr : " ");
else
gf_cmd_log ("", "%s : SUCCESS", cmd);
}
glusterd_submit_reply (req, arg, payload, payloadcount, iobref,
(xdrproc_t) xdrproc);
+ if (dict)
+ dict_unref (dict);
return ret;
}
@@ -5770,7 +9097,7 @@ glusterd_append_gsync_status (dict_t *dst, dict_t *src)
goto out;
}
- ret = dict_set_dynstr (dst, "gsync-status", gf_strdup (stop_msg));
+ ret = dict_set_dynstr_with_alloc (dst, "gsync-status", stop_msg);
if (ret) {
gf_log ("glusterd", GF_LOG_WARNING, "Unable to set the stop"
"message in the ctx dictionary");
@@ -5784,19 +9111,16 @@ glusterd_append_gsync_status (dict_t *dst, dict_t *src)
}
-static int32_t
+int32_t
glusterd_append_status_dicts (dict_t *dst, dict_t *src)
{
- int dst_count = 0;
- int src_count = 0;
- int i = 0;
- int ret = 0;
- char mst[PATH_MAX] = {0,};
- char slv[PATH_MAX] = {0, };
- char sts[PATH_MAX] = {0, };
- char *mst_val = NULL;
- char *slv_val = NULL;
- char *sts_val = NULL;
+ char sts_val_name[PATH_MAX] = {0, };
+ int dst_count = 0;
+ int src_count = 0;
+ int i = 0;
+ int ret = 0;
+ gf_gsync_status_t *sts_val = NULL;
+ gf_gsync_status_t *dst_sts_val = NULL;
GF_ASSERT (dst);
@@ -5814,39 +9138,29 @@ glusterd_append_status_dicts (dict_t *dst, dict_t *src)
goto out;
}
- for (i = 1; i <= src_count; i++) {
- snprintf (mst, sizeof(mst), "master%d", i);
- snprintf (slv, sizeof(slv), "slave%d", i);
- snprintf (sts, sizeof(sts), "status%d", i);
-
- ret = dict_get_str (src, mst, &mst_val);
- if (ret)
- goto out;
+ for (i = 0; i < src_count; i++) {
+ memset (sts_val_name, '\0', sizeof(sts_val_name));
+ snprintf (sts_val_name, sizeof(sts_val_name), "status_value%d", i);
- ret = dict_get_str (src, slv, &slv_val);
+ ret = dict_get_bin (src, sts_val_name, (void **) &sts_val);
if (ret)
goto out;
- ret = dict_get_str (src, sts, &sts_val);
- if (ret)
+ dst_sts_val = GF_CALLOC (1, sizeof(gf_gsync_status_t),
+ gf_common_mt_gsync_status_t);
+ if (!dst_sts_val) {
+ gf_log ("", GF_LOG_ERROR, "Out Of Memory");
goto out;
+ }
- snprintf (mst, sizeof(mst), "master%d", i+dst_count);
- snprintf (slv, sizeof(slv), "slave%d", i+dst_count);
- snprintf (sts, sizeof(sts), "status%d", i+dst_count);
-
- ret = dict_set_dynstr (dst, mst, gf_strdup (mst_val));
- if (ret)
- goto out;
+ memcpy (dst_sts_val, sts_val, sizeof(gf_gsync_status_t));
- ret = dict_set_dynstr (dst, slv, gf_strdup (slv_val));
- if (ret)
- goto out;
+ memset (sts_val_name, '\0', sizeof(sts_val_name));
+ snprintf (sts_val_name, sizeof(sts_val_name), "status_value%d", i + dst_count);
- ret = dict_set_dynstr (dst, sts, gf_strdup (sts_val));
+ ret = dict_set_bin (dst, sts_val_name, dst_sts_val, sizeof(gf_gsync_status_t));
if (ret)
goto out;
-
}
ret = dict_set_int32 (dst, "gsync-count", dst_count+src_count);
@@ -5862,6 +9176,7 @@ glusterd_gsync_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict, char *op_errstr)
{
dict_t *ctx = NULL;
int ret = 0;
+ char *conf_path = NULL;
if (aggr) {
ctx = aggr;
@@ -5883,9 +9198,21 @@ glusterd_gsync_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict, char *op_errstr)
ret = glusterd_append_gsync_status (ctx, rsp_dict);
if (ret)
goto out;
+
+ ret = dict_get_str (rsp_dict, "conf_path", &conf_path);
+ if (!ret && conf_path) {
+ ret = dict_set_dynstr_with_alloc (ctx, "conf_path",
+ conf_path);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to store conf path.");
+ goto out;
+ }
+ }
}
- if (strcmp ("", op_errstr)) {
- ret = dict_set_dynstr (ctx, "errstr", gf_strdup(op_errstr));
+ if ((op_errstr) && (strcmp ("", op_errstr))) {
+ ret = dict_set_dynstr_with_alloc (ctx, "errstr",
+ op_errstr);
if (ret)
goto out;
}
@@ -6046,8 +9373,12 @@ glusterd_volume_status_add_peer_rsp (dict_t *this, char *key, data_t *value,
int32_t ret = 0;
/* Skip the following keys, they are already present in the ctx_dict */
+ /* Also, skip all the task related pairs. They will be added to the
+ * ctx_dict later
+ */
if (!strcmp (key, "count") || !strcmp (key, "cmd") ||
- !strcmp (key, "brick-index-max") || !strcmp (key, "other-count"))
+ !strcmp (key, "brick-index-max") || !strcmp (key, "other-count") ||
+ !strncmp (key, "task", 4))
return 0;
rsp_ctx = data;
@@ -6072,21 +9403,250 @@ glusterd_volume_status_add_peer_rsp (dict_t *this, char *key, data_t *value,
return 0;
}
+static int
+glusterd_volume_status_copy_tasks_to_ctx_dict (dict_t *this, char *key,
+ data_t *value, void *data)
+{
+ int ret = 0;
+ dict_t *ctx_dict = NULL;
+ data_t *new_value = NULL;
+
+ if (strncmp (key, "task", 4))
+ return 0;
+
+ ctx_dict = data;
+ GF_ASSERT (ctx_dict);
+
+ new_value = data_copy (value);
+ GF_ASSERT (new_value);
+
+ ret = dict_set (ctx_dict, key, new_value);
+
+ return ret;
+}
+
+int
+glusterd_volume_status_aggregate_tasks_status (dict_t *ctx_dict,
+ dict_t *rsp_dict)
+{
+ int ret = -1;
+ xlator_t *this = NULL;
+ int local_count = 0;
+ int remote_count = 0;
+ int i = 0;
+ int j = 0;
+ char key[128] = {0,};
+ char *task_type = NULL;
+ int local_status = 0;
+ int remote_status = 0;
+ char *local_task_id = NULL;
+ char *remote_task_id = NULL;
+
+ GF_ASSERT (ctx_dict);
+ GF_ASSERT (rsp_dict);
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ ret = dict_get_int32 (rsp_dict, "tasks", &remote_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get remote task count");
+ goto out;
+ }
+ /* Local count will not be present when this is called for the first
+ * time with the origins rsp_dict
+ */
+ ret = dict_get_int32 (ctx_dict, "tasks", &local_count);
+ if (ret) {
+ ret = dict_foreach (rsp_dict,
+ glusterd_volume_status_copy_tasks_to_ctx_dict,
+ ctx_dict);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Failed to copy tasks"
+ "to ctx_dict.");
+ goto out;
+ }
+
+ if (local_count != remote_count) {
+ gf_log (this->name, GF_LOG_ERROR, "Local tasks count (%d) and "
+ "remote tasks count (%d) do not match. Not aggregating "
+ "tasks status.", local_count, remote_count);
+ ret = -1;
+ goto out;
+ }
+
+ /* Update the tasks statuses. For every remote tasks, search for the
+ * local task, and update the local task status based on the remote
+ * status.
+ */
+ for (i = 0; i < remote_count; i++) {
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.type", i);
+ ret = dict_get_str (rsp_dict, key, &task_type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get task typpe from rsp dict");
+ goto out;
+ }
+
+ /* Skip replace-brick status as it is going to be the same on
+ * all peers. rb_status is set by the replace brick commit
+ * function on all peers based on the replace brick command.
+ * We return the value of rb_status as the status for a
+ * replace-brick task in a 'volume status' command.
+ */
+ if (!strcmp (task_type, "Replace brick"))
+ continue;
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.status", i);
+ ret = dict_get_int32 (rsp_dict, key, &remote_status);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get task status from rsp dict");
+ goto out;
+ }
+ snprintf (key, sizeof (key), "task%d.id", i);
+ ret = dict_get_str (rsp_dict, key, &remote_task_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get task id from rsp dict");
+ goto out;
+ }
+ for (j = 0; j < local_count; j++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.id", j);
+ ret = dict_get_str (ctx_dict, key, &local_task_id);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get local task-id");
+ goto out;
+ }
+
+ if (strncmp (remote_task_id, local_task_id,
+ strlen (remote_task_id))) {
+ /* Quit if a matching local task is not found */
+ if (j == (local_count - 1)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Could not find matching local "
+ "task for task %s",
+ remote_task_id);
+ goto out;
+ }
+ continue;
+ }
+
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "task%d.status", j);
+ ret = dict_get_int32 (ctx_dict, key, &local_status);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to get local task status");
+ goto out;
+ }
+
+ /* Rebalance has 5 states,
+ * NOT_STARTED, STARTED, STOPPED, COMPLETE, FAILED
+ * The precedence used to determine the aggregate status
+ * is as below,
+ * STARTED > FAILED > STOPPED > COMPLETE > NOT_STARTED
+ */
+ /* TODO: Move this to a common place utilities that both
+ * CLI and glusterd need.
+ * Till then if the below algorithm is changed, change
+ * it in cli_xml_output_vol_rebalance_status in
+ * cli-xml-output.c
+ */
+ ret = 0;
+ int rank[] = {
+ [GF_DEFRAG_STATUS_STARTED] = 1,
+ [GF_DEFRAG_STATUS_FAILED] = 2,
+ [GF_DEFRAG_STATUS_STOPPED] = 3,
+ [GF_DEFRAG_STATUS_COMPLETE] = 4,
+ [GF_DEFRAG_STATUS_NOT_STARTED] = 5
+ };
+ if (rank[remote_status] <= rank[local_status])
+ ret = dict_set_int32 (ctx_dict, key,
+ remote_status);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "update task status");
+ goto out;
+ }
+ break;
+ }
+ }
+
+out:
+ return ret;
+}
+
+gf_boolean_t
+glusterd_status_has_tasks (int cmd) {
+ if (((cmd & GF_CLI_STATUS_MASK) == GF_CLI_STATUS_NONE) &&
+ (cmd & GF_CLI_STATUS_VOL))
+ return _gf_true;
+ return _gf_false;
+}
+
int
glusterd_volume_status_copy_to_op_ctx_dict (dict_t *aggr, dict_t *rsp_dict)
{
int ret = 0;
glusterd_status_rsp_conv_t rsp_ctx = {0};
+ int32_t cmd = GF_CLI_STATUS_NONE;
int32_t node_count = 0;
- int32_t rsp_node_count = 0;
- int32_t brick_index_max = -1;
int32_t other_count = 0;
+ int32_t brick_index_max = -1;
+ int32_t rsp_node_count = 0;
int32_t rsp_other_count = 0;
+ int vol_count = -1;
+ int i = 0;
dict_t *ctx_dict = NULL;
- glusterd_op_t op = GD_OP_NONE;
+ char key[PATH_MAX] = {0,};
+ char *volname = NULL;
GF_ASSERT (rsp_dict);
+ if (aggr) {
+ ctx_dict = aggr;
+
+ } else {
+ ctx_dict = glusterd_op_get_ctx (GD_OP_STATUS_VOLUME);
+
+ }
+
+ ret = dict_get_int32 (ctx_dict, "cmd", &cmd);
+ if (ret)
+ goto out;
+
+ if (cmd & GF_CLI_STATUS_ALL && is_origin_glusterd (ctx_dict)) {
+ ret = dict_get_int32 (rsp_dict, "vol_count", &vol_count);
+ if (ret == 0) {
+ ret = dict_set_int32 (ctx_dict, "vol_count",
+ vol_count);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < vol_count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "vol%d", i);
+ ret = dict_get_str (rsp_dict, key, &volname);
+ if (ret)
+ goto out;
+
+ ret = dict_set_str (ctx_dict, key, volname);
+ if (ret)
+ goto out;
+ }
+ }
+ }
+
+ if ((cmd & GF_CLI_STATUS_TASKS) != 0)
+ goto aggregate_tasks;
+
ret = dict_get_int32 (rsp_dict, "count", &rsp_node_count);
if (ret) {
ret = 0; //no bricks in the rsp
@@ -6100,20 +9660,20 @@ glusterd_volume_status_copy_to_op_ctx_dict (dict_t *aggr, dict_t *rsp_dict)
goto out;
}
- op = glusterd_op_get_op ();
- GF_ASSERT (GD_OP_STATUS_VOLUME == op);
- if (aggr) {
- ctx_dict = aggr;
+ ret = dict_get_int32 (ctx_dict, "count", &node_count);
+ ret = dict_get_int32 (ctx_dict, "other-count", &other_count);
+ if (!dict_get (ctx_dict, "brick-index-max")) {
+ ret = dict_get_int32 (rsp_dict, "brick-index-max", &brick_index_max);
+ if (ret)
+ goto out;
+ ret = dict_set_int32 (ctx_dict, "brick-index-max", brick_index_max);
+ if (ret)
+ goto out;
} else {
- ctx_dict = glusterd_op_get_ctx (op);
-
+ ret = dict_get_int32 (ctx_dict, "brick-index-max", &brick_index_max);
}
- ret = dict_get_int32 (ctx_dict, "count", &node_count);
- ret = dict_get_int32 (ctx_dict, "brick-index-max", &brick_index_max);
- ret = dict_get_int32 (ctx_dict, "other-count", &other_count);
-
rsp_ctx.count = node_count;
rsp_ctx.brick_index_max = brick_index_max;
rsp_ctx.other_count = other_count;
@@ -6130,9 +9690,22 @@ glusterd_volume_status_copy_to_op_ctx_dict (dict_t *aggr, dict_t *rsp_dict)
ret = dict_set_int32 (ctx_dict, "other-count",
(other_count + rsp_other_count));
- if (ret)
+ if (ret) {
gf_log (THIS->name, GF_LOG_ERROR,
"Failed to update other-count");
+ goto out;
+ }
+
+aggregate_tasks:
+ /* Tasks are only present for a normal status command for a volume or
+ * for an explicit tasks status command for a volume
+ */
+ if (!(cmd & GF_CLI_STATUS_ALL) &&
+ (((cmd & GF_CLI_STATUS_TASKS) != 0) ||
+ glusterd_status_has_tasks (cmd)))
+ ret = glusterd_volume_status_aggregate_tasks_status (ctx_dict,
+ rsp_dict);
+
out:
return ret;
}
@@ -6140,21 +9713,26 @@ out:
int
glusterd_volume_rebalance_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict)
{
- int ret = 0;
- dict_t *ctx_dict = NULL;
- glusterd_op_t op = GD_OP_NONE;
- uint64_t value = 0;
- int32_t value32 = 0;
- char *volname = NULL;
- glusterd_volinfo_t *volinfo = NULL;
- char key[256] = {0,};
- int32_t index = 0;
- int32_t i = 0;
- char *node_uuid = NULL;
- char *node_uuid_str = NULL;
- double elapsed_time = 0;
+ char key[256] = {0,};
+ char *node_uuid = NULL;
+ char *node_uuid_str = NULL;
+ char *volname = NULL;
+ dict_t *ctx_dict = NULL;
+ double elapsed_time = 0;
+ glusterd_conf_t *conf = NULL;
+ glusterd_op_t op = GD_OP_NONE;
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ int ret = 0;
+ int32_t index = 0;
+ int32_t count = 0;
+ int32_t current_index = 2;
+ int32_t value32 = 0;
+ uint64_t value = 0;
+ char *peer_uuid_str = NULL;
GF_ASSERT (rsp_dict);
+ conf = THIS->private;
op = glusterd_op_get_op ();
GF_ASSERT ((GD_OP_REBALANCE == op) ||
@@ -6171,12 +9749,6 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict)
if (!ctx_dict)
goto out;
- ret = dict_get_int32 (ctx_dict, "count", &i);
- i++;
- ret = dict_set_int32 (ctx_dict, "count", i);
- if (ret)
- gf_log ("", GF_LOG_ERROR, "Failed to set index");
-
ret = dict_get_str (ctx_dict, "volname", &volname);
if (ret) {
gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
@@ -6192,11 +9764,44 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict)
if (ret)
gf_log ("", GF_LOG_ERROR, "failed to get index");
+ memset (key, 0, 256);
+ snprintf (key, 256, "node-uuid-%d", index);
+ ret = dict_get_str (rsp_dict, key, &node_uuid);
+ if (!ret) {
+ node_uuid_str = gf_strdup (node_uuid);
+
+ /* Finding the index of the node-uuid in the peer-list */
+ list_for_each_entry (peerinfo, &conf->peers, uuid_list) {
+ peer_uuid_str = gd_peer_uuid_str (peerinfo);
+ if (strcmp (peer_uuid_str, node_uuid_str) == 0)
+ break;
+
+ current_index++;
+ }
+
+ /* Setting the largest index value as the total count. */
+ ret = dict_get_int32 (ctx_dict, "count", &count);
+ if (count < current_index) {
+ ret = dict_set_int32 (ctx_dict, "count", current_index);
+ if (ret)
+ gf_log ("", GF_LOG_ERROR, "Failed to set count");
+ }
+
+ /* Setting the same index for the node, as is in the peerlist.*/
+ memset (key, 0, 256);
+ snprintf (key, 256, "node-uuid-%d", current_index);
+ ret = dict_set_dynstr (ctx_dict, key, node_uuid_str);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_DEBUG,
+ "failed to set node-uuid");
+ }
+ }
+
snprintf (key, 256, "files-%d", index);
ret = dict_get_uint64 (rsp_dict, key, &value);
if (!ret) {
memset (key, 0, 256);
- snprintf (key, 256, "files-%d", i);
+ snprintf (key, 256, "files-%d", current_index);
ret = dict_set_uint64 (ctx_dict, key, value);
if (ret) {
gf_log (THIS->name, GF_LOG_DEBUG,
@@ -6209,7 +9814,7 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict)
ret = dict_get_uint64 (rsp_dict, key, &value);
if (!ret) {
memset (key, 0, 256);
- snprintf (key, 256, "size-%d", i);
+ snprintf (key, 256, "size-%d", current_index);
ret = dict_set_uint64 (ctx_dict, key, value);
if (ret) {
gf_log (THIS->name, GF_LOG_DEBUG,
@@ -6222,7 +9827,7 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict)
ret = dict_get_uint64 (rsp_dict, key, &value);
if (!ret) {
memset (key, 0, 256);
- snprintf (key, 256, "lookups-%d", i);
+ snprintf (key, 256, "lookups-%d", current_index);
ret = dict_set_uint64 (ctx_dict, key, value);
if (ret) {
gf_log (THIS->name, GF_LOG_DEBUG,
@@ -6235,7 +9840,7 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict)
ret = dict_get_int32 (rsp_dict, key, &value32);
if (!ret) {
memset (key, 0, 256);
- snprintf (key, 256, "status-%d", i);
+ snprintf (key, 256, "status-%d", current_index);
ret = dict_set_int32 (ctx_dict, key, value32);
if (ret) {
gf_log (THIS->name, GF_LOG_DEBUG,
@@ -6244,38 +9849,36 @@ glusterd_volume_rebalance_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict)
}
memset (key, 0, 256);
- snprintf (key, 256, "node-uuid-%d", index);
- ret = dict_get_str (rsp_dict, key, &node_uuid);
+ snprintf (key, 256, "failures-%d", index);
+ ret = dict_get_uint64 (rsp_dict, key, &value);
if (!ret) {
memset (key, 0, 256);
- snprintf (key, 256, "node-uuid-%d", i);
- node_uuid_str = gf_strdup (node_uuid);
- ret = dict_set_dynstr (ctx_dict, key, node_uuid_str);
+ snprintf (key, 256, "failures-%d", current_index);
+ ret = dict_set_uint64 (ctx_dict, key, value);
if (ret) {
gf_log (THIS->name, GF_LOG_DEBUG,
- "failed to set node-uuid");
+ "failed to set failure count");
}
}
memset (key, 0, 256);
- snprintf (key, 256, "failures-%d", index);
+ snprintf (key, 256, "skipped-%d", index);
ret = dict_get_uint64 (rsp_dict, key, &value);
if (!ret) {
memset (key, 0, 256);
- snprintf (key, 256, "failures-%d", i);
+ snprintf (key, 256, "skipped-%d", current_index);
ret = dict_set_uint64 (ctx_dict, key, value);
if (ret) {
gf_log (THIS->name, GF_LOG_DEBUG,
- "failed to set failure count");
+ "failed to set skipped count");
}
}
-
memset (key, 0, 256);
snprintf (key, 256, "run-time-%d", index);
ret = dict_get_double (rsp_dict, key, &elapsed_time);
if (!ret) {
memset (key, 0, 256);
- snprintf (key, 256, "run-time-%d", i);
+ snprintf (key, 256, "run-time-%d", current_index);
ret = dict_set_double (ctx_dict, key, elapsed_time);
if (ret) {
gf_log (THIS->name, GF_LOG_DEBUG,
@@ -6290,6 +9893,351 @@ out:
}
int
+glusterd_snap_config_use_rsp_dict (dict_t *dst, dict_t *src)
+{
+ char buf[PATH_MAX] = "";
+ char *volname = NULL;
+ int ret = -1;
+ int config_command = 0;
+ uint64_t i = 0;
+ uint64_t value = 0;
+ uint64_t voldisplaycount = 0;
+
+ if (!dst || !src) {
+ gf_log ("", GF_LOG_ERROR, "Source or Destination "
+ "dict is empty.");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dst, "config-command", &config_command);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "failed to get config-command type");
+ goto out;
+ }
+
+ switch (config_command) {
+ case GF_SNAP_CONFIG_DISPLAY:
+ ret = dict_get_uint64 (src, "snap-max-hard-limit", &value);
+ if (!ret) {
+ ret = dict_set_uint64 (dst, "snap-max-hard-limit", value);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to set snap_max_hard_limit");
+ goto out;
+ }
+ } else {
+ /* Received dummy response from other nodes */
+ ret = 0;
+ goto out;
+ }
+
+ ret = dict_get_uint64 (src, "snap-max-soft-limit", &value);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to get snap_max_soft_limit");
+ goto out;
+ }
+
+ ret = dict_set_uint64 (dst, "snap-max-soft-limit", value);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to set snap_max_soft_limit");
+ goto out;
+ }
+
+ ret = dict_get_uint64 (src, "voldisplaycount",
+ &voldisplaycount);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to get voldisplaycount");
+ goto out;
+ }
+
+ ret = dict_set_uint64 (dst, "voldisplaycount",
+ voldisplaycount);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to set voldisplaycount");
+ goto out;
+ }
+
+ for (i = 0; i < voldisplaycount; i++) {
+ snprintf (buf, sizeof(buf), "volume%"PRIu64"-volname", i);
+ ret = dict_get_str (src, buf, &volname);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to get %s", buf);
+ goto out;
+ }
+ ret = dict_set_str (dst, buf, volname);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to set %s", buf);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf),
+ "volume%"PRIu64"-snap-max-hard-limit", i);
+ ret = dict_get_uint64 (src, buf, &value);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to get %s", buf);
+ goto out;
+ }
+ ret = dict_set_uint64 (dst, buf, value);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to set %s", buf);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf),
+ "volume%"PRIu64"-active-hard-limit", i);
+ ret = dict_get_uint64 (src, buf, &value);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to get %s", buf);
+ goto out;
+ }
+ ret = dict_set_uint64 (dst, buf, value);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to set %s", buf);
+ goto out;
+ }
+
+ snprintf (buf, sizeof(buf),
+ "volume%"PRIu64"-snap-max-soft-limit", i);
+ ret = dict_get_uint64 (src, buf, &value);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to get %s", buf);
+ goto out;
+ }
+ ret = dict_set_uint64 (dst, buf, value);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Unable to set %s", buf);
+ goto out;
+ }
+ }
+
+ break;
+ default:
+ break;
+ }
+
+ ret = 0;
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+/* Aggregate missed_snap_counts from different nodes and save it *
+ * in the req_dict of the originator node */
+int
+glusterd_snap_create_use_rsp_dict (dict_t *dst, dict_t *src)
+{
+ char *buf = NULL;
+ char *tmp_str = NULL;
+ char name_buf[PATH_MAX] = "";
+ int32_t i = -1;
+ int32_t ret = -1;
+ int32_t src_missed_snap_count = -1;
+ int32_t dst_missed_snap_count = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ if (!dst || !src) {
+ gf_log (this->name, GF_LOG_ERROR, "Source or Destination "
+ "dict is empty.");
+ goto out;
+ }
+
+ ret = dict_get_int32 (src, "missed_snap_count",
+ &src_missed_snap_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "No missed snaps");
+ ret = 0;
+ goto out;
+ }
+
+ ret = dict_get_int32 (dst, "missed_snap_count",
+ &dst_missed_snap_count);
+ if (ret) {
+ /* Initialize dst_missed_count for the first time */
+ dst_missed_snap_count = 0;
+ }
+
+ for (i = 0; i < src_missed_snap_count; i++) {
+ snprintf (name_buf, sizeof(name_buf), "missed_snaps_%d",
+ i);
+ ret = dict_get_str (src, name_buf, &buf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to fetch %s", name_buf);
+ goto out;
+ }
+
+ snprintf (name_buf, sizeof(name_buf), "missed_snaps_%d",
+ dst_missed_snap_count);
+
+ tmp_str = gf_strdup (buf);
+ if (!tmp_str) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_dynstr (dst, name_buf, tmp_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set %s", name_buf);
+ goto out;
+ }
+
+ tmp_str = NULL;
+ dst_missed_snap_count++;
+ }
+
+ ret = dict_set_int32 (dst, "missed_snap_count", dst_missed_snap_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to set dst_missed_snap_count");
+ goto out;
+ }
+
+out:
+ if (ret && tmp_str)
+ GF_FREE(tmp_str);
+
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_snap_use_rsp_dict (dict_t *dst, dict_t *src)
+{
+ int ret = -1;
+ int32_t snap_command = 0;
+
+ if (!dst || !src) {
+ gf_log ("", GF_LOG_ERROR, "Source or Destination "
+ "dict is empty.");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dst, "type", &snap_command);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "unable to get the type of "
+ "the snapshot command");
+ goto out;
+ }
+
+ switch (snap_command) {
+ case GF_SNAP_OPTION_TYPE_CREATE:
+ case GF_SNAP_OPTION_TYPE_DELETE:
+ ret = glusterd_snap_create_use_rsp_dict (dst, src);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to use rsp dict");
+ goto out;
+ }
+ break;
+ case GF_SNAP_OPTION_TYPE_CONFIG:
+ ret = glusterd_snap_config_use_rsp_dict (dst, src);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to use rsp dict");
+ goto out;
+ }
+ break;
+ default:
+ // copy the response dictinary's contents to the dict to be
+ // sent back to the cli
+ dict_copy (src, dst);
+ break;
+ }
+
+ ret = 0;
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_sys_exec_output_rsp_dict (dict_t *dst, dict_t *src)
+{
+ char output_name[PATH_MAX] = "";
+ char *output = NULL;
+ int ret = 0;
+ int i = 0;
+ int len = 0;
+ int src_output_count = 0;
+ int dst_output_count = 0;
+
+ if (!dst || !src) {
+ gf_log ("", GF_LOG_ERROR, "Source or Destination "
+ "dict is empty.");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dst, "output_count", &dst_output_count);
+
+ ret = dict_get_int32 (src, "output_count", &src_output_count);
+ if (ret) {
+ gf_log ("", GF_LOG_DEBUG, "No output from source");
+ ret = 0;
+ goto out;
+ }
+
+ for (i = 1; i <= src_output_count; i++) {
+ len = snprintf (output_name, sizeof(output_name) - 1,
+ "output_%d", i);
+ output_name[len] = '\0';
+ ret = dict_get_str (src, output_name, &output);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to fetch %s",
+ output_name);
+ goto out;
+ }
+
+ len = snprintf (output_name, sizeof(output_name) - 1,
+ "output_%d", i+dst_output_count);
+ output_name[len] = '\0';
+ ret = dict_set_dynstr (dst, output_name, gf_strdup (output));
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to set %s",
+ output_name);
+ goto out;
+ }
+ }
+
+ ret = dict_set_int32 (dst, "output_count",
+ dst_output_count+src_output_count);
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int
+glusterd_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict)
+{
+ int ret = 0;
+ glusterd_op_t op = GD_OP_NONE;
+
+ op = glusterd_op_get_op ();
+ GF_ASSERT (aggr);
+ GF_ASSERT (rsp_dict);
+
+ if (!aggr)
+ goto out;
+ dict_copy (rsp_dict, aggr);
+out:
+ return ret;
+}
+
+int
glusterd_volume_heal_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict)
{
int ret = 0;
@@ -6314,3 +10262,1283 @@ glusterd_volume_heal_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict)
out:
return ret;
}
+
+int
+_profile_volume_add_brick_rsp (dict_t *this, char *key, data_t *value,
+ void *data)
+{
+ char new_key[256] = {0};
+ glusterd_pr_brick_rsp_conv_t *rsp_ctx = NULL;
+ data_t *new_value = NULL;
+
+ rsp_ctx = data;
+ new_value = data_copy (value);
+ GF_ASSERT (new_value);
+ snprintf (new_key, sizeof (new_key), "%d-%s", rsp_ctx->count, key);
+ dict_set (rsp_ctx->dict, new_key, new_value);
+ return 0;
+}
+
+int
+glusterd_volume_quota_copy_to_op_ctx_dict (dict_t *dict, dict_t *rsp_dict)
+{
+ int ret = -1;
+ int i = 0;
+ int count = 0;
+ int rsp_dict_count = 0;
+ char *uuid_str = NULL;
+ char *uuid_str_dup = NULL;
+ char key[256] = {0,};
+ xlator_t *this = NULL;
+ int type = GF_QUOTA_OPTION_TYPE_NONE;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ ret = dict_get_int32 (dict, "type", &type);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get quota opcode");
+ goto out;
+ }
+
+ if ((type != GF_QUOTA_OPTION_TYPE_LIMIT_USAGE) &&
+ (type != GF_QUOTA_OPTION_TYPE_REMOVE)) {
+ dict_copy (rsp_dict, dict);
+ ret = 0;
+ goto out;
+ }
+
+ ret = dict_get_int32 (rsp_dict, "count", &rsp_dict_count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get the count of "
+ "gfids from the rsp dict");
+ goto out;
+ }
+
+ ret = dict_get_int32 (dict, "count", &count);
+ if (ret)
+ /* The key "count" is absent in op_ctx when this function is
+ * called after self-staging on the originator. This must not
+ * be treated as error.
+ */
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to get count of gfids"
+ " from req dict. This could be because count is not yet"
+ " copied from rsp_dict into op_ctx");
+
+ for (i = 0; i < rsp_dict_count; i++) {
+ snprintf (key, sizeof(key)-1, "gfid%d", i);
+
+ ret = dict_get_str (rsp_dict, key, &uuid_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get gfid "
+ "from rsp dict");
+ goto out;
+ }
+
+ snprintf (key, sizeof (key)-1, "gfid%d", i + count);
+
+ uuid_str_dup = gf_strdup (uuid_str);
+ if (!uuid_str_dup) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_dynstr (dict, key, uuid_str_dup);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set gfid "
+ "from rsp dict into req dict");
+ GF_FREE (uuid_str_dup);
+ goto out;
+ }
+ }
+
+ ret = dict_set_int32 (dict, "count", rsp_dict_count + count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set aggregated "
+ "count in req dict");
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+int
+glusterd_profile_volume_brick_rsp (void *pending_entry,
+ dict_t *rsp_dict, dict_t *op_ctx,
+ char **op_errstr, gd_node_type type)
+{
+ int ret = 0;
+ glusterd_pr_brick_rsp_conv_t rsp_ctx = {0};
+ int32_t count = 0;
+ char brick[PATH_MAX+1024] = {0};
+ char key[256] = {0};
+ char *full_brick = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (op_ctx);
+ GF_ASSERT (op_errstr);
+ GF_ASSERT (pending_entry);
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = dict_get_int32 (op_ctx, "count", &count);
+ if (ret) {
+ count = 1;
+ } else {
+ count++;
+ }
+ snprintf (key, sizeof (key), "%d-brick", count);
+ if (type == GD_NODE_BRICK) {
+ brickinfo = pending_entry;
+ snprintf (brick, sizeof (brick), "%s:%s", brickinfo->hostname,
+ brickinfo->path);
+ } else if (type == GD_NODE_NFS) {
+ snprintf (brick, sizeof (brick), "%s", uuid_utoa (MY_UUID));
+ }
+ full_brick = gf_strdup (brick);
+ GF_ASSERT (full_brick);
+ ret = dict_set_dynstr (op_ctx, key, full_brick);
+
+ rsp_ctx.count = count;
+ rsp_ctx.dict = op_ctx;
+ dict_foreach (rsp_dict, _profile_volume_add_brick_rsp, &rsp_ctx);
+ dict_del (op_ctx, "count");
+ ret = dict_set_int32 (op_ctx, "count", count);
+ return ret;
+}
+
+//input-key: <replica-id>:<child-id>-*
+//output-key: <brick-id>-*
+int
+_heal_volume_add_shd_rsp (dict_t *this, char *key, data_t *value, void *data)
+{
+ char new_key[256] = {0,};
+ char int_str[16] = {0};
+ data_t *new_value = NULL;
+ char *rxl_end = NULL;
+ char *rxl_child_end = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ int rxl_id = 0;
+ int rxl_child_id = 0;
+ int brick_id = 0;
+ int int_len = 0;
+ int ret = 0;
+ glusterd_heal_rsp_conv_t *rsp_ctx = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+
+ rsp_ctx = data;
+ rxl_end = strchr (key, '-');
+ if (!rxl_end)
+ goto out;
+
+ int_len = strlen (key) - strlen (rxl_end);
+ strncpy (int_str, key, int_len);
+ int_str[int_len] = '\0';
+ ret = gf_string2int (int_str, &rxl_id);
+ if (ret)
+ goto out;
+
+ rxl_child_end = strchr (rxl_end + 1, '-');
+ if (!rxl_child_end)
+ goto out;
+
+ int_len = strlen (rxl_end) - strlen (rxl_child_end) - 1;
+ strncpy (int_str, rxl_end + 1, int_len);
+ int_str[int_len] = '\0';
+ ret = gf_string2int (int_str, &rxl_child_id);
+ if (ret)
+ goto out;
+
+ volinfo = rsp_ctx->volinfo;
+ brick_id = rxl_id * volinfo->replica_count + rxl_child_id;
+
+ if (!strcmp (rxl_child_end, "-status")) {
+ brickinfo = glusterd_get_brickinfo_by_position (volinfo,
+ brick_id);
+ if (!brickinfo)
+ goto out;
+ if (!glusterd_is_local_brick (rsp_ctx->this, volinfo,
+ brickinfo))
+ goto out;
+ }
+ new_value = data_copy (value);
+ snprintf (new_key, sizeof (new_key), "%d%s", brick_id, rxl_child_end);
+ dict_set (rsp_ctx->dict, new_key, new_value);
+
+out:
+ return 0;
+}
+
+int
+_heal_volume_add_shd_rsp_of_statistics (dict_t *this, char *key, data_t
+ *value, void *data)
+{
+ char new_key[256] = {0,};
+ char int_str[16] = {0,};
+ char key_begin_string[128] = {0,};
+ data_t *new_value = NULL;
+ char *rxl_end = NULL;
+ char *rxl_child_end = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ char *key_begin_str = NULL;
+ int rxl_id = 0;
+ int rxl_child_id = 0;
+ int brick_id = 0;
+ int int_len = 0;
+ int ret = 0;
+ glusterd_heal_rsp_conv_t *rsp_ctx = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+
+ rsp_ctx = data;
+ key_begin_str = strchr (key, '-');
+ if (!key_begin_str)
+ goto out;
+
+ int_len = strlen (key) - strlen (key_begin_str);
+ strncpy (key_begin_string, key, int_len);
+ key_begin_string[int_len] = '\0';
+
+ rxl_end = strchr (key_begin_str + 1, '-');
+ if (!rxl_end)
+ goto out;
+
+ int_len = strlen (key_begin_str) - strlen (rxl_end) - 1;
+ strncpy (int_str, key_begin_str + 1, int_len);
+ int_str[int_len] = '\0';
+ ret = gf_string2int (int_str, &rxl_id);
+ if (ret)
+ goto out;
+
+
+ rxl_child_end = strchr (rxl_end + 1, '-');
+ if (!rxl_child_end)
+ goto out;
+
+ int_len = strlen (rxl_end) - strlen (rxl_child_end) - 1;
+ strncpy (int_str, rxl_end + 1, int_len);
+ int_str[int_len] = '\0';
+ ret = gf_string2int (int_str, &rxl_child_id);
+ if (ret)
+ goto out;
+
+ volinfo = rsp_ctx->volinfo;
+ brick_id = rxl_id * volinfo->replica_count + rxl_child_id;
+
+ brickinfo = glusterd_get_brickinfo_by_position (volinfo, brick_id);
+ if (!brickinfo)
+ goto out;
+ if (!glusterd_is_local_brick (rsp_ctx->this, volinfo, brickinfo))
+ goto out;
+
+ new_value = data_copy (value);
+ snprintf (new_key, sizeof (new_key), "%s-%d%s", key_begin_string,
+ brick_id, rxl_child_end);
+ dict_set (rsp_ctx->dict, new_key, new_value);
+
+out:
+ return 0;
+
+}
+
+int
+glusterd_heal_volume_brick_rsp (dict_t *req_dict, dict_t *rsp_dict,
+ dict_t *op_ctx, char **op_errstr)
+{
+ int ret = 0;
+ glusterd_heal_rsp_conv_t rsp_ctx = {0};
+ char *volname = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ int heal_op = -1;
+
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (op_ctx);
+ GF_ASSERT (op_errstr);
+
+ ret = dict_get_str (req_dict, "volname", &volname);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
+ goto out;
+ }
+
+ ret = dict_get_int32 (req_dict, "heal-op", &heal_op);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to get heal_op");
+ goto out;
+ }
+
+
+ ret = glusterd_volinfo_find (volname, &volinfo);
+
+ if (ret)
+ goto out;
+
+ rsp_ctx.dict = op_ctx;
+ rsp_ctx.volinfo = volinfo;
+ rsp_ctx.this = THIS;
+ if (heal_op == GF_AFR_OP_STATISTICS)
+ dict_foreach (rsp_dict, _heal_volume_add_shd_rsp_of_statistics,
+ &rsp_ctx);
+ else
+ dict_foreach (rsp_dict, _heal_volume_add_shd_rsp, &rsp_ctx);
+
+
+out:
+ return ret;
+}
+
+int
+_status_volume_add_brick_rsp (dict_t *this, char *key, data_t *value,
+ void *data)
+{
+ char new_key[256] = {0,};
+ data_t *new_value = 0;
+ glusterd_pr_brick_rsp_conv_t *rsp_ctx = NULL;
+
+ rsp_ctx = data;
+ new_value = data_copy (value);
+ snprintf (new_key, sizeof (new_key), "brick%d.%s", rsp_ctx->count, key);
+ dict_set (rsp_ctx->dict, new_key, new_value);
+
+ return 0;
+}
+
+int
+glusterd_status_volume_brick_rsp (dict_t *rsp_dict, dict_t *op_ctx,
+ char **op_errstr)
+{
+ int ret = 0;
+ glusterd_pr_brick_rsp_conv_t rsp_ctx = {0};
+ int32_t count = 0;
+ int index = 0;
+
+ GF_ASSERT (rsp_dict);
+ GF_ASSERT (op_ctx);
+ GF_ASSERT (op_errstr);
+
+ ret = dict_get_int32 (op_ctx, "count", &count);
+ if (ret) {
+ count = 0;
+ } else {
+ count++;
+ }
+ ret = dict_get_int32 (rsp_dict, "index", &index);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Couldn't get node index");
+ goto out;
+ }
+ dict_del (rsp_dict, "index");
+
+ rsp_ctx.count = index;
+ rsp_ctx.dict = op_ctx;
+ dict_foreach (rsp_dict, _status_volume_add_brick_rsp, &rsp_ctx);
+ ret = dict_set_int32 (op_ctx, "count", count);
+
+out:
+ return ret;
+}
+
+int
+glusterd_defrag_volume_node_rsp (dict_t *req_dict, dict_t *rsp_dict,
+ dict_t *op_ctx)
+{
+ int ret = 0;
+ char *volname = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ char key[256] = {0,};
+ int32_t i = 0;
+ char buf[1024] = {0,};
+ char *node_str = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ priv = THIS->private;
+ GF_ASSERT (req_dict);
+
+ ret = dict_get_str (req_dict, "volname", &volname);
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
+ goto out;
+ }
+
+ ret = glusterd_volinfo_find (volname, &volinfo);
+
+ if (ret)
+ goto out;
+
+ if (rsp_dict) {
+ ret = glusterd_defrag_volume_status_update (volinfo,
+ rsp_dict);
+ }
+
+ if (!op_ctx) {
+ dict_copy (rsp_dict, op_ctx);
+ goto out;
+ }
+
+ ret = dict_get_int32 (op_ctx, "count", &i);
+ i++;
+
+ ret = dict_set_int32 (op_ctx, "count", i);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to set count");
+
+ snprintf (buf, 1024, "%s", uuid_utoa (MY_UUID));
+ node_str = gf_strdup (buf);
+
+ snprintf (key, 256, "node-uuid-%d",i);
+ ret = dict_set_dynstr (op_ctx, key, node_str);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "failed to set node-uuid");
+
+ memset (key, 0 , 256);
+ snprintf (key, 256, "files-%d", i);
+ ret = dict_set_uint64 (op_ctx, key, volinfo->rebal.rebalance_files);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "failed to set file count");
+
+ memset (key, 0 , 256);
+ snprintf (key, 256, "size-%d", i);
+ ret = dict_set_uint64 (op_ctx, key, volinfo->rebal.rebalance_data);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "failed to set size of xfer");
+
+ memset (key, 0 , 256);
+ snprintf (key, 256, "lookups-%d", i);
+ ret = dict_set_uint64 (op_ctx, key, volinfo->rebal.lookedup_files);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "failed to set lookedup file count");
+
+ memset (key, 0 , 256);
+ snprintf (key, 256, "status-%d", i);
+ ret = dict_set_int32 (op_ctx, key, volinfo->rebal.defrag_status);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "failed to set status");
+
+ memset (key, 0 , 256);
+ snprintf (key, 256, "failures-%d", i);
+ ret = dict_set_uint64 (op_ctx, key, volinfo->rebal.rebalance_failures);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "failed to set failure count");
+
+ memset (key, 0 , 256);
+ snprintf (key, 256, "skipped-%d", i);
+ ret = dict_set_uint64 (op_ctx, key, volinfo->rebal.skipped_files);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "failed to set skipped count");
+
+ memset (key, 0, 256);
+ snprintf (key, 256, "run-time-%d", i);
+ ret = dict_set_double (op_ctx, key, volinfo->rebal.rebalance_time);
+ if (ret)
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "failed to set run-time");
+
+out:
+ return ret;
+}
+int32_t
+glusterd_handle_node_rsp (dict_t *req_dict, void *pending_entry,
+ glusterd_op_t op, dict_t *rsp_dict, dict_t *op_ctx,
+ char **op_errstr, gd_node_type type)
+{
+ int ret = 0;
+
+ GF_ASSERT (op_errstr);
+
+ switch (op) {
+ case GD_OP_PROFILE_VOLUME:
+ ret = glusterd_profile_volume_brick_rsp (pending_entry,
+ rsp_dict, op_ctx,
+ op_errstr, type);
+ break;
+ case GD_OP_STATUS_VOLUME:
+ ret = glusterd_status_volume_brick_rsp (rsp_dict, op_ctx,
+ op_errstr);
+ break;
+
+ case GD_OP_DEFRAG_BRICK_VOLUME:
+ glusterd_defrag_volume_node_rsp (req_dict,
+ rsp_dict, op_ctx);
+ break;
+
+ case GD_OP_HEAL_VOLUME:
+ ret = glusterd_heal_volume_brick_rsp (req_dict, rsp_dict,
+ op_ctx, op_errstr);
+ break;
+ default:
+ break;
+ }
+
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_set_originator_uuid (dict_t *dict)
+{
+ int ret = -1;
+ uuid_t *originator_uuid = NULL;
+
+ GF_ASSERT (dict);
+
+ originator_uuid = GF_CALLOC (1, sizeof(uuid_t),
+ gf_common_mt_uuid_t);
+ if (!originator_uuid) {
+ ret = -1;
+ goto out;
+ }
+
+ uuid_copy (*originator_uuid, MY_UUID);
+ ret = dict_set_bin (dict, "originator_uuid",
+ originator_uuid, sizeof (uuid_t));
+ if (ret) {
+ gf_log ("", GF_LOG_ERROR,
+ "Failed to set originator_uuid.");
+ goto out;
+ }
+
+out:
+ if (ret && originator_uuid)
+ GF_FREE (originator_uuid);
+
+ return ret;
+}
+
+/* Should be used only when an operation is in progress, as that is the only
+ * time a lock_owner is set
+ */
+gf_boolean_t
+is_origin_glusterd (dict_t *dict)
+{
+ gf_boolean_t ret = _gf_false;
+ uuid_t lock_owner = {0,};
+ uuid_t *originator_uuid = NULL;
+
+ GF_ASSERT (dict);
+
+ ret = dict_get_bin (dict, "originator_uuid",
+ (void **) &originator_uuid);
+ if (ret) {
+ /* If not originator_uuid has been set, then the command
+ * has been originated from a glusterd running on older version
+ * Hence fetching the lock owner */
+ ret = glusterd_get_lock_owner (&lock_owner);
+ if (ret) {
+ ret = _gf_false;
+ goto out;
+ }
+ ret = !uuid_compare (MY_UUID, lock_owner);
+ } else
+ ret = !uuid_compare (MY_UUID, *originator_uuid);
+
+out:
+ return ret;
+}
+
+int
+glusterd_generate_and_set_task_id (dict_t *dict, char *key)
+{
+ int ret = -1;
+ uuid_t task_id = {0,};
+ char *uuid_str = NULL;
+ xlator_t *this = NULL;
+
+ GF_ASSERT (dict);
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ uuid_generate (task_id);
+ uuid_str = gf_strdup (uuid_utoa (task_id));
+ if (!uuid_str) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_dynstr (dict, key, uuid_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set %s in dict",
+ key);
+ goto out;
+ }
+ gf_log (this->name, GF_LOG_INFO, "Generated task-id %s for key %s",
+ uuid_str, key);
+
+out:
+ if (ret)
+ GF_FREE (uuid_str);
+ return ret;
+}
+
+int
+glusterd_copy_uuid_to_dict (uuid_t uuid, dict_t *dict, char *key)
+{
+ int ret = -1;
+ char tmp_str[40] = {0,};
+ char *task_id_str = NULL;
+
+ GF_ASSERT (dict);
+ GF_ASSERT (key);
+
+ uuid_unparse (uuid, tmp_str);
+ task_id_str = gf_strdup (tmp_str);
+ if (!task_id_str)
+ return -1;
+
+ ret = dict_set_dynstr (dict, key, task_id_str);
+ if (ret) {
+ GF_FREE (task_id_str);
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Error setting uuid in dict with key %s", key);
+ }
+
+ return 0;
+}
+
+int
+_update_volume_op_versions (dict_t *this, char *key, data_t *value, void *data)
+{
+ int op_version = 0;
+ glusterd_volinfo_t *ctx = NULL;
+ gf_boolean_t enabled = _gf_true;
+ int ret = -1;
+
+ GF_ASSERT (data);
+ ctx = data;
+
+ op_version = glusterd_get_op_version_for_key (key);
+
+ if (gd_is_xlator_option (key) || gd_is_boolean_option (key)) {
+ ret = gf_string2boolean (value->data, &enabled);
+ if (ret)
+ return 0;
+
+ if (!enabled)
+ return 0;
+ }
+
+ if (op_version > ctx->op_version)
+ ctx->op_version = op_version;
+
+ if (gd_is_client_option (key) &&
+ (op_version > ctx->client_op_version))
+ ctx->client_op_version = op_version;
+
+ return 0;
+}
+
+void
+gd_update_volume_op_versions (glusterd_volinfo_t *volinfo)
+{
+ glusterd_conf_t *conf = NULL;
+ gf_boolean_t ob_enabled = _gf_false;
+
+ GF_ASSERT (volinfo);
+
+ conf = THIS->private;
+ GF_ASSERT (conf);
+
+ /* Reset op-versions to minimum */
+ volinfo->op_version = 1;
+ volinfo->client_op_version = 1;
+
+ dict_foreach (volinfo->dict, _update_volume_op_versions, volinfo);
+
+ /* Special case for open-behind
+ * If cluster op-version >= 2 and open-behind hasn't been explicitly
+ * disabled, volume op-versions must be updated to account for it
+ */
+
+ /* TODO: Remove once we have a general way to update automatically
+ * enabled features
+ */
+ if (conf->op_version >= 2) {
+ ob_enabled = dict_get_str_boolean (volinfo->dict,
+ "performance.open-behind",
+ _gf_true);
+ if (ob_enabled) {
+
+ if (volinfo->op_version < 2)
+ volinfo->op_version = 2;
+ if (volinfo->client_op_version < 2)
+ volinfo->client_op_version = 2;
+ }
+ }
+
+ return;
+}
+
+int
+op_version_check (xlator_t *this, int min_op_version, char *msg, int msglen)
+{
+ int ret = 0;
+ glusterd_conf_t *priv = NULL;
+
+ GF_ASSERT (this);
+ GF_ASSERT (msg);
+
+ priv = this->private;
+ if (priv->op_version < min_op_version) {
+ snprintf (msg, msglen, "One or more nodes do not support "
+ "the required op-version. Cluster op-version must "
+ "atleast be %d.", min_op_version);
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+ ret = -1;
+ }
+ return ret;
+}
+
+
+/* A task is committed/completed once the task-id for it is cleared */
+gf_boolean_t
+gd_is_remove_brick_committed (glusterd_volinfo_t *volinfo)
+{
+ GF_ASSERT (volinfo);
+
+ if ((GD_OP_REMOVE_BRICK == volinfo->rebal.op) &&
+ !uuid_is_null (volinfo->rebal.rebalance_id))
+ return _gf_false;
+
+ return _gf_true;
+}
+
+gf_boolean_t
+glusterd_are_vol_all_peers_up (glusterd_volinfo_t *volinfo,
+ struct list_head *peers,
+ char **down_peerstr)
+{
+ glusterd_peerinfo_t *peerinfo = NULL;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ gf_boolean_t ret = _gf_false;
+
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ if (!uuid_compare (brickinfo->uuid, MY_UUID))
+ continue;
+
+ list_for_each_entry (peerinfo, peers, uuid_list) {
+ if (uuid_compare (peerinfo->uuid, brickinfo->uuid))
+ continue;
+
+ /*Found peer who owns the brick, return false
+ * if peer is not connected or not friend */
+ if (!(peerinfo->connected) ||
+ (peerinfo->state.state !=
+ GD_FRIEND_STATE_BEFRIENDED)) {
+ *down_peerstr = gf_strdup (peerinfo->hostname);
+ gf_log ("", GF_LOG_DEBUG, "Peer %s is down. ",
+ peerinfo->hostname);
+ goto out;
+ }
+ }
+ }
+
+ ret = _gf_true;
+out:
+ gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ return ret;
+}
+
+gf_boolean_t
+glusterd_is_status_tasks_op (glusterd_op_t op, dict_t *dict)
+{
+ int ret = -1;
+ uint32_t cmd = GF_CLI_STATUS_NONE;
+ gf_boolean_t is_status_tasks = _gf_false;
+
+ if (op != GD_OP_STATUS_VOLUME)
+ goto out;
+
+ ret = dict_get_uint32 (dict, "cmd", &cmd);
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_ERROR, "Failed to get opcode");
+ goto out;
+ }
+
+ if (cmd & GF_CLI_STATUS_TASKS)
+ is_status_tasks = _gf_true;
+
+out:
+ return is_status_tasks;
+}
+
+int
+glusterd_compare_snap_time(struct list_head *list1, struct list_head *list2)
+{
+ glusterd_snap_t *snap1 = NULL;
+ glusterd_snap_t *snap2 = NULL;
+ double diff_time = 0;
+
+ GF_ASSERT (list1);
+ GF_ASSERT (list2);
+
+ snap1 = list_entry(list1, glusterd_snap_t, snap_list);
+ snap2 = list_entry(list2, glusterd_snap_t, snap_list);
+ diff_time = difftime(snap1->time_stamp, snap2->time_stamp);
+
+ return ((int)diff_time);
+}
+
+int
+glusterd_compare_snap_vol_time(struct list_head *list1, struct list_head *list2)
+{
+ glusterd_volinfo_t *snapvol1 = NULL;
+ glusterd_volinfo_t *snapvol2 = NULL;
+ double diff_time = 0;
+
+ GF_ASSERT (list1);
+ GF_ASSERT (list2);
+
+ snapvol1 = list_entry(list1, glusterd_volinfo_t, snapvol_list);
+ snapvol2 = list_entry(list2, glusterd_volinfo_t, snapvol_list);
+ diff_time = difftime(snapvol1->snapshot->time_stamp,
+ snapvol2->snapshot->time_stamp);
+
+ return ((int)diff_time);
+}
+
+int32_t
+glusterd_missed_snapinfo_new (glusterd_missed_snap_info **missed_snapinfo)
+{
+ glusterd_missed_snap_info *new_missed_snapinfo = NULL;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (missed_snapinfo);
+
+ new_missed_snapinfo = GF_CALLOC (1, sizeof(*new_missed_snapinfo),
+ gf_gld_mt_missed_snapinfo_t);
+
+ if (!new_missed_snapinfo)
+ goto out;
+
+ INIT_LIST_HEAD (&new_missed_snapinfo->missed_snaps);
+ INIT_LIST_HEAD (&new_missed_snapinfo->snap_ops);
+
+ *missed_snapinfo = new_missed_snapinfo;
+
+ ret = 0;
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+int32_t
+glusterd_missed_snap_op_new (glusterd_snap_op_t **snap_op)
+{
+ glusterd_snap_op_t *new_snap_op = NULL;
+ int32_t ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (snap_op);
+
+ new_snap_op = GF_CALLOC (1, sizeof(*new_snap_op),
+ gf_gld_mt_missed_snapinfo_t);
+
+ if (!new_snap_op)
+ goto out;
+
+ new_snap_op->brick_num = -1;
+ new_snap_op->op = -1;
+ new_snap_op->status = -1;
+ INIT_LIST_HEAD (&new_snap_op->snap_ops_list);
+
+ *snap_op = new_snap_op;
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning %d", ret);
+ return ret;
+}
+
+/* Tells if rebalance needs to be started for the given volume on the peer
+ *
+ * Rebalance should be started on a peer only if an involved brick is present on
+ * the peer.
+ *
+ * For a normal rebalance, if any one brick of the given volume is present on
+ * the peer, the rebalance process should be started.
+ *
+ * For a rebalance as part of a remove-brick operation, the rebalance process
+ * should be started only if one of the bricks being removed is present on the
+ * peer
+ */
+gf_boolean_t
+gd_should_i_start_rebalance (glusterd_volinfo_t *volinfo) {
+ gf_boolean_t retval = _gf_false;
+ int ret = -1;
+ glusterd_brickinfo_t *brick = NULL;
+ int count = 0;
+ int i = 0;
+ char key[1023] = {0,};
+ char *brickname = NULL;
+
+
+ switch (volinfo->rebal.op) {
+ case GD_OP_REBALANCE:
+ list_for_each_entry (brick, &volinfo->bricks, brick_list) {
+ if (uuid_compare (MY_UUID, brick->uuid) == 0) {
+ retval = _gf_true;
+ break;
+ }
+ }
+ break;
+ case GD_OP_REMOVE_BRICK:
+ ret = dict_get_int32 (volinfo->rebal.dict, "count", &count);
+ if (ret) {
+ goto out;
+ }
+ for (i = 1; i <= count; i++) {
+ memset (key, 0, sizeof (key));
+ snprintf (key, sizeof (key), "brick%d", i);
+ ret = dict_get_str (volinfo->rebal.dict, key,
+ &brickname);
+ if (ret)
+ goto out;
+ ret = glusterd_volume_brickinfo_get_by_brick (brickname,
+ volinfo,
+ &brick);
+ if (ret)
+ goto out;
+ if (uuid_compare (MY_UUID, brick->uuid) == 0) {
+ retval = _gf_true;
+ break;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+
+out:
+ return retval;
+}
+
+int
+glusterd_is_volume_quota_enabled (glusterd_volinfo_t *volinfo)
+{
+ return (glusterd_volinfo_get_boolean (volinfo, VKEY_FEATURES_QUOTA));
+}
+
+int
+glusterd_validate_and_set_gfid (dict_t *op_ctx, dict_t *req_dict,
+ char **op_errstr)
+{
+ int ret = -1;
+ int count = 0;
+ int i = 0;
+ int op_code = GF_QUOTA_OPTION_TYPE_NONE;
+ uuid_t uuid1 = {0};
+ uuid_t uuid2 = {0,};
+ char *path = NULL;
+ char key[256] = {0,};
+ char *uuid1_str = NULL;
+ char *uuid1_str_dup = NULL;
+ char *uuid2_str = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ ret = dict_get_int32 (op_ctx, "type", &op_code);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get quota opcode");
+ goto out;
+ }
+
+ if ((op_code != GF_QUOTA_OPTION_TYPE_LIMIT_USAGE) &&
+ (op_code != GF_QUOTA_OPTION_TYPE_REMOVE)) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = dict_get_str (op_ctx, "path", &path);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get path");
+ goto out;
+ }
+
+ ret = dict_get_int32 (op_ctx, "count", &count);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get count");
+ goto out;
+ }
+
+ /* If count is 0, fail the command with ENOENT.
+ *
+ * If count is 1, treat gfid0 as the gfid on which the operation
+ * is to be performed and resume the command.
+ *
+ * if count > 1, get the 0th gfid from the op_ctx and,
+ * compare it with the remaining 'count -1' gfids.
+ * If they are found to be the same, set gfid0 in the op_ctx and
+ * resume the operation, else error out.
+ */
+
+ if (count == 0) {
+ gf_asprintf (op_errstr, "Failed to get trusted.gfid attribute "
+ "on path %s. Reason : %s", path,
+ strerror (ENOENT));
+ ret = -1;
+ goto out;
+ }
+
+ snprintf (key, sizeof (key) - 1, "gfid%d", 0);
+
+ ret = dict_get_str (op_ctx, key, &uuid1_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get key '%s'",
+ key);
+ goto out;
+ }
+
+ uuid_parse (uuid1_str, uuid1);
+
+ for (i = 1; i < count; i++) {
+ snprintf (key, sizeof (key)-1, "gfid%d", i);
+
+ ret = dict_get_str (op_ctx, key, &uuid2_str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get key "
+ "'%s'", key);
+ goto out;
+ }
+
+ uuid_parse (uuid2_str, uuid2);
+
+ if (uuid_compare (uuid1, uuid2)) {
+ gf_asprintf (op_errstr, "gfid mismatch between %s and "
+ "%s for path %s", uuid1_str, uuid2_str,
+ path);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ if (i == count) {
+ uuid1_str_dup = gf_strdup (uuid1_str);
+ if (!uuid1_str_dup) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_dynstr (req_dict, "gfid", uuid1_str_dup);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set gfid");
+ GF_FREE (uuid1_str_dup);
+ goto out;
+ }
+ } else {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to iterate through %d"
+ " entries in the req dict", count);
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+void
+glusterd_clean_up_quota_store (glusterd_volinfo_t *volinfo)
+{
+ char voldir[PATH_MAX] = {0,};
+ char quota_confpath[PATH_MAX] = {0,};
+ char cksum_path[PATH_MAX] = {0,};
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ GLUSTERD_GET_VOLUME_DIR (voldir, volinfo, conf);
+
+ snprintf (quota_confpath, sizeof (quota_confpath), "%s/%s", voldir,
+ GLUSTERD_VOLUME_QUOTA_CONFIG);
+ snprintf (cksum_path, sizeof (cksum_path), "%s/%s", voldir,
+ GLUSTERD_VOL_QUOTA_CKSUM_FILE);
+
+ unlink (quota_confpath);
+ unlink (cksum_path);
+
+ gf_store_handle_destroy (volinfo->quota_conf_shandle);
+ volinfo->quota_conf_shandle = NULL;
+ volinfo->quota_conf_version = 0;
+
+}
+
+#define QUOTA_CONF_HEADER \
+ "GlusterFS Quota conf | version: v%d.%d\n"
+
+int
+glusterd_store_quota_conf_skip_header (xlator_t *this, int fd)
+{
+ char buf[PATH_MAX] = {0,};
+
+ snprintf (buf, sizeof(buf)-1, QUOTA_CONF_HEADER, 1, 1);
+ return gf_skip_header_section (fd, strlen (buf));
+}
+
+int
+glusterd_store_quota_conf_stamp_header (xlator_t *this, int fd)
+{
+ char buf[PATH_MAX] = {0,};
+ int buf_len = 0;
+ ssize_t ret = -1;
+ ssize_t written = 0;
+
+ snprintf (buf, sizeof(buf)-1, QUOTA_CONF_HEADER, 1, 1);
+ buf_len = strlen (buf);
+ for (written = 0; written != buf_len; written += ret) {
+ ret = write (fd, buf + written, buf_len - written);
+ if (ret == -1) {
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+glusterd_remove_auxiliary_mount (char *volname)
+{
+ int ret = -1;
+ runner_t runner = {0,};
+ char mountdir[PATH_MAX] = {0,};
+ char pidfile[PATH_MAX] = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ GLUSTERFS_GET_AUX_MOUNT_PIDFILE (pidfile, volname);
+
+ if (!gf_is_service_running (pidfile, NULL)) {
+ gf_log (this->name, GF_LOG_DEBUG, "Aux mount of volume %s "
+ "absent, hence returning", volname);
+ return 0;
+ }
+
+ GLUSTERD_GET_QUOTA_AUX_MOUNT_PATH (mountdir, volname, "/");
+ runinit (&runner);
+ runner_add_args (&runner, "umount",
+
+#if GF_LINUX_HOST_OS
+ "-l",
+#endif
+ mountdir, NULL);
+ ret = runner_run_reuse (&runner);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "umount on %s failed, "
+ "reason : %s", mountdir, strerror (errno));
+ runner_end (&runner);
+
+ rmdir (mountdir);
+ return ret;
+}
+
+/* Stops the rebalance process of the given volume
+ */
+int
+gd_stop_rebalance_process (glusterd_volinfo_t *volinfo)
+{
+ int ret = -1;
+ xlator_t *this = NULL;
+ glusterd_conf_t *conf = NULL;
+ char pidfile[PATH_MAX] = {0,};
+
+ GF_ASSERT (volinfo);
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ conf = this->private;
+ GF_ASSERT (conf);
+
+ GLUSTERD_GET_DEFRAG_PID_FILE (pidfile, volinfo, conf);
+ ret = glusterd_service_stop ("rebalance", pidfile, SIGTERM, _gf_true);
+
+ return ret;
+}
+
+rpc_clnt_t *
+glusterd_rpc_clnt_unref (glusterd_conf_t *conf, rpc_clnt_t *rpc)
+{
+ rpc_clnt_t *ret = NULL;
+
+ GF_ASSERT (conf);
+ GF_ASSERT (rpc);
+ synclock_unlock (&conf->big_lock);
+ ret = rpc_clnt_unref (rpc);
+ synclock_lock (&conf->big_lock);
+
+ return ret;
+}
+
+int32_t
+glusterd_compare_volume_name(struct list_head *list1, struct list_head *list2)
+{
+ glusterd_volinfo_t *volinfo1 = NULL;
+ glusterd_volinfo_t *volinfo2 = NULL;
+
+ volinfo1 = list_entry(list1, glusterd_volinfo_t, vol_list);
+ volinfo2 = list_entry(list2, glusterd_volinfo_t, vol_list);
+ return strcmp(volinfo1->volname, volinfo2->volname);
+}
+
+int32_t
+glusterd_mount_lvm_snapshot (char *device_path, char *brick_mount_path)
+{
+ char msg[NAME_MAX] = "";
+ int32_t ret = -1;
+ runner_t runner = {0, };
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ GF_ASSERT (brick_mount_path);
+ GF_ASSERT (device_path);
+
+
+ runinit (&runner);
+ snprintf (msg, sizeof (msg), "mount -o nouuid %s %s",
+ device_path, brick_mount_path);
+ runner_add_args (&runner, "mount", "-o", "nouuid", device_path,
+ brick_mount_path, NULL);
+ runner_log (&runner, this->name, GF_LOG_DEBUG, msg);
+ ret = runner_run (&runner);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "mounting the snapshot "
+ "logical device %s failed (error: %s)", device_path,
+ strerror (errno));
+ goto out;
+ } else
+ gf_log (this->name, GF_LOG_DEBUG, "mounting the snapshot "
+ "logical device %s successful", device_path);
+
+out:
+ gf_log (this->name, GF_LOG_TRACE, "Returning with %d", ret);
+ return ret;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h
index 1cda32b57..84fa89b0e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.h
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.h
@@ -8,7 +8,7 @@
cases as published by the Free Software Foundation.
*/
#ifndef _GLUSTERD_UTILS_H
-#define _GLUSTERD_UTILS_H_
+#define _GLUSTERD_UTILS_H
#ifndef _CONFIG_H
#define _CONFIG_H
@@ -28,18 +28,24 @@
#include "rpc-clnt.h"
#include "protocol-common.h"
+#define GLUSTERD_SOCK_DIR "/var/run"
+#define GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO(brickinfo, volinfo, brickid) do {\
+ sprintf (brickinfo->brick_id, "%s-client-%d",\
+ volinfo->volname, brickid);\
+} while (0)
+
struct glusterd_lock_ {
uuid_t owner;
time_t timestamp;
};
-typedef struct glusterd_voldict_ctx_ {
+typedef struct glusterd_dict_ctx_ {
dict_t *dict;
- int count;
int opt_count;
char *key_name;
char *val_name;
-} glusterd_voldict_ctx_t;
+ char *prefix;
+} glusterd_dict_ctx_t;
int
glusterd_compare_lines (const void *a, const void *b);
@@ -79,6 +85,11 @@ glusterd_submit_request (struct rpc_clnt *rpc, void *req,
int32_t
glusterd_volinfo_new (glusterd_volinfo_t **volinfo);
+int32_t
+glusterd_volinfo_dup (glusterd_volinfo_t *volinfo,
+ glusterd_volinfo_t **dup_volinfo,
+ gf_boolean_t set_userauth);
+
char *
glusterd_auth_get_username (glusterd_volinfo_t *volinfo);
@@ -113,12 +124,30 @@ int32_t
glusterd_peer_hostname_new (char *hostname, glusterd_peer_hostname_t **name);
int32_t
+glusterd_snap_volinfo_find (char *volname, glusterd_snap_t *snap,
+ glusterd_volinfo_t **volinfo);
+int32_t
+glusterd_snap_volinfo_find_from_parent_volname (char *origin_volname,
+ glusterd_snap_t *snap,
+ glusterd_volinfo_t **volinfo);
+
+int32_t
glusterd_volinfo_find (char *volname, glusterd_volinfo_t **volinfo);
+int
+glusterd_volinfo_find_by_volume_id (uuid_t volume_id, glusterd_volinfo_t **volinfo);
+
+int
+glusterd_snap_volinfo_find_by_volume_id (uuid_t volume_id,
+ glusterd_volinfo_t **volinfo);
+
int32_t
glusterd_service_stop(const char *service, char *pidfile, int sig,
gf_boolean_t force_kill);
+int
+glusterd_get_next_available_brickid (glusterd_volinfo_t *volinfo);
+
int32_t
glusterd_resolve_brick (glusterd_brickinfo_t *brickinfo);
@@ -132,6 +161,12 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t *brickinfo,
gf_boolean_t del_brick);
+glusterd_volinfo_t *
+glusterd_volinfo_ref (glusterd_volinfo_t *volinfo);
+
+glusterd_volinfo_t *
+glusterd_volinfo_unref (glusterd_volinfo_t *volinfo);
+
int32_t
glusterd_volinfo_delete (glusterd_volinfo_t *volinfo);
@@ -147,25 +182,21 @@ glusterd_volume_brickinfo_get_by_brick (char *brick,
glusterd_brickinfo_t **brickinfo);
int32_t
-glusterd_is_local_addr (char *hostname);
-
-int32_t
-glusterd_build_volume_dict (dict_t **vols);
+glusterd_add_volumes_to_export_dict (dict_t **peer_data);
int32_t
-glusterd_compare_friend_data (dict_t *vols, int32_t *status);
+glusterd_compare_friend_data (dict_t *peer_data, int32_t *status,
+ char *hostname);
int
-glusterd_volume_compute_cksum (glusterd_volinfo_t *volinfo);
+glusterd_compute_cksum (glusterd_volinfo_t *volinfo,
+ gf_boolean_t is_quota_conf);
void
glusterd_get_nodesvc_volfile (char *server, char *workdir,
char *volfile, size_t len);
gf_boolean_t
-glusterd_is_service_running (char *pidfile, int *pid);
-
-gf_boolean_t
glusterd_is_nodesvc_running ();
gf_boolean_t
@@ -186,6 +217,12 @@ glusterd_shd_start ();
int32_t
glusterd_shd_stop ();
+int32_t
+glusterd_quotad_start ();
+
+int32_t
+glusterd_quotad_stop ();
+
void
glusterd_set_socket_filepath (char *sock_filepath, char *sockpath, size_t len);
@@ -206,16 +243,16 @@ int32_t
glusterd_nodesvc_connect (char *server, char *socketpath);
void
-glusterd_nodesvc_set_running (char *server, gf_boolean_t status);
+glusterd_nodesvc_set_online_status (char *server, gf_boolean_t status);
gf_boolean_t
-glusterd_nodesvc_is_running (char *server);
+glusterd_is_nodesvc_online (char *server);
int
glusterd_remote_hostname_get (rpcsvc_request_t *req,
char *remote_host, int len);
int32_t
-glusterd_import_friend_volumes (dict_t *vols);
+glusterd_import_friend_volumes (dict_t *peer_data);
void
glusterd_set_volume_status (glusterd_volinfo_t *volinfo,
glusterd_volume_status status);
@@ -226,6 +263,9 @@ int
glusterd_check_generate_start_shd (void);
int
+glusterd_check_generate_start_quotad (void);
+
+int
glusterd_nodesvcs_handle_graph_change (glusterd_volinfo_t *volinfo);
int
@@ -241,7 +281,8 @@ int32_t
glusterd_volume_count_get (void);
int32_t
glusterd_add_volume_to_dict (glusterd_volinfo_t *volinfo,
- dict_t *dict, int32_t count);
+ dict_t *dict, int32_t count,
+ char *prefix);
int
glusterd_get_brickinfo (xlator_t *this, const char *brickname,
int port, gf_boolean_t localhost,
@@ -280,6 +321,7 @@ glusterd_is_defrag_on (glusterd_volinfo_t *volinfo);
int32_t
glusterd_volinfo_bricks_delete (glusterd_volinfo_t *volinfo);
+
int
glusterd_friend_find_by_uuid (uuid_t uuid,
glusterd_peerinfo_t **peerinfo);
@@ -315,16 +357,21 @@ glusterd_rb_check_bricks (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t *dst_brick);
int
-glusterd_brick_create_path (char *host, char *path, uuid_t uuid,
- char **op_errstr);
+glusterd_check_and_set_brick_xattr (char *host, char *path, uuid_t uuid,
+ char **op_errstr, gf_boolean_t is_force);
+
+int
+glusterd_validate_and_create_brickpath (glusterd_brickinfo_t *brickinfo,
+ uuid_t volume_id, char **op_errstr,
+ gf_boolean_t is_force);
int
glusterd_sm_tr_log_transition_add (glusterd_sm_tr_log_t *log,
int old_state, int new_state,
int event);
int
glusterd_peerinfo_new (glusterd_peerinfo_t **peerinfo,
- glusterd_friend_sm_state_t state,
- uuid_t *uuid, const char *hostname);
+ glusterd_friend_sm_state_t state, uuid_t *uuid,
+ const char *hostname, int port);
int
glusterd_sm_tr_log_init (glusterd_sm_tr_log_t *log,
char * (*state_name_get) (int),
@@ -344,7 +391,7 @@ gf_boolean_t
glusterd_peerinfo_is_uuid_unknown (glusterd_peerinfo_t *peerinfo);
int32_t
glusterd_brick_connect (glusterd_volinfo_t *volinfo,
- glusterd_brickinfo_t *brickinfo);
+ glusterd_brickinfo_t *brickinfo, char *socketpath);
int32_t
glusterd_brick_disconnect (glusterd_brickinfo_t *brickinfo);
int32_t
@@ -352,13 +399,25 @@ glusterd_delete_volume (glusterd_volinfo_t *volinfo);
int32_t
glusterd_delete_brick (glusterd_volinfo_t* volinfo,
glusterd_brickinfo_t *brickinfo);
+
int32_t
glusterd_delete_all_bricks (glusterd_volinfo_t* volinfo);
+
+int
+glusterd_spawn_daemons (void *opaque);
+
int
glusterd_restart_gsyncds (glusterd_conf_t *conf);
+
int
glusterd_start_gsync (glusterd_volinfo_t *master_vol, char *slave,
- char *glusterd_uuid_str, char **op_errstr);
+ char *path_list, char *conf_path,
+ char *glusterd_uuid_str,
+ char **op_errstr);
+int
+glusterd_get_local_brickpaths (glusterd_volinfo_t *volinfo,
+ char **pathlist);
+
int32_t
glusterd_recreate_bricks (glusterd_conf_t *conf);
int32_t
@@ -386,12 +445,17 @@ glusterd_brick_statedump (glusterd_volinfo_t *volinfo,
char *options, int option_cnt, char **op_errstr);
int
glusterd_nfs_statedump (char *options, int option_cnt, char **op_errstr);
+
+int
+glusterd_quotad_statedump (char *options, int option_cnt, char **op_errstr);
+
gf_boolean_t
glusterd_is_volume_replicate (glusterd_volinfo_t *volinfo);
+
gf_boolean_t
glusterd_is_brick_decommissioned (glusterd_volinfo_t *volinfo, char *hostname,
char *path);
-gf_boolean_t
+int
glusterd_friend_contains_vol_bricks (glusterd_volinfo_t *volinfo,
uuid_t friend_uuid);
int
@@ -422,6 +486,9 @@ glusterd_add_node_to_dict (char *server, dict_t *dict, int count,
char *
glusterd_uuid_to_hostname (uuid_t uuid);
+int
+glusterd_get_dist_leaf_count (glusterd_volinfo_t *volinfo);
+
glusterd_brickinfo_t*
glusterd_get_brickinfo_by_position (glusterd_volinfo_t *volinfo, uint32_t pos);
@@ -438,6 +505,12 @@ glusterd_defrag_volume_status_update (glusterd_volinfo_t *volinfo,
int
glusterd_check_files_identical (char *filename1, char *filename2,
gf_boolean_t *identical);
+
+int
+glusterd_check_topology_identical (const char *filename1,
+ const char *filename2,
+ gf_boolean_t *identical);
+
void
glusterd_volinfo_reset_defrag_stats (glusterd_volinfo_t *volinfo);
int
@@ -457,4 +530,229 @@ int
glusterd_volume_rebalance_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict);
int
glusterd_volume_heal_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict);
+int
+glusterd_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict);
+int
+glusterd_sys_exec_output_rsp_dict (dict_t *aggr, dict_t *rsp_dict);
+int
+glusterd_snap_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict);
+int32_t
+glusterd_handle_node_rsp (dict_t *req_ctx, void *pending_entry,
+ glusterd_op_t op, dict_t *rsp_dict, dict_t *op_ctx,
+ char **op_errstr, gd_node_type type);
+int
+glusterd_volume_rebalance_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict);
+int
+glusterd_volume_heal_use_rsp_dict (dict_t *aggr, dict_t *rsp_dict);
+
+int32_t
+glusterd_check_if_quota_trans_enabled (glusterd_volinfo_t *volinfo);
+int
+glusterd_volume_quota_copy_to_op_ctx_dict (dict_t *aggr, dict_t *rsp);
+int
+_profile_volume_add_brick_rsp (dict_t *this, char *key, data_t *value,
+ void *data);
+int
+glusterd_profile_volume_brick_rsp (void *pending_entry,
+ dict_t *rsp_dict, dict_t *op_ctx,
+ char **op_errstr, gd_node_type type);
+
+gf_boolean_t
+glusterd_are_vol_all_peers_up (glusterd_volinfo_t *volinfo,
+ struct list_head *peers,
+ char **down_peerstr);
+
+int32_t
+glusterd_set_originator_uuid (dict_t *dict);
+
+/* Should be used only when an operation is in progress, as that is the only
+ * time a lock_owner is set
+ */
+gf_boolean_t
+is_origin_glusterd (dict_t *dict);
+
+gf_boolean_t
+glusterd_is_quorum_changed (dict_t *options, char *option, char *value);
+
+int
+glusterd_do_quorum_action ();
+
+int
+glusterd_get_quorum_cluster_counts (xlator_t *this, int *active_count,
+ int *quorum_count);
+
+int
+glusterd_get_next_global_opt_version_str (dict_t *opts, char **version_str);
+gf_boolean_t
+glusterd_is_quorum_option (char *option);
+gf_boolean_t
+glusterd_is_volume_in_server_quorum (glusterd_volinfo_t *volinfo);
+gf_boolean_t
+glusterd_is_any_volume_in_server_quorum (xlator_t *this);
+gf_boolean_t
+does_gd_meet_server_quorum (xlator_t *this);
+
+int
+glusterd_generate_and_set_task_id (dict_t *dict, char *key);
+
+int
+glusterd_validate_and_set_gfid (dict_t *op_ctx, dict_t *req_dict,
+ char **op_errstr);
+
+int
+glusterd_copy_uuid_to_dict (uuid_t uuid, dict_t *dict, char *key);
+
+gf_boolean_t
+glusterd_is_same_address (char *name1, char *name2);
+
+void
+gd_update_volume_op_versions (glusterd_volinfo_t *volinfo);
+
+int
+op_version_check (xlator_t *this, int min_op_version, char *msg, int msglen);
+
+char*
+gd_peer_uuid_str (glusterd_peerinfo_t *peerinfo);
+
+gf_boolean_t
+gd_is_remove_brick_committed (glusterd_volinfo_t *volinfo);
+
+gf_boolean_t
+glusterd_are_vol_all_peers_up (glusterd_volinfo_t *volinfo,
+ struct list_head *peers,
+ char **down_peerstr);
+
+int
+glusterd_get_slave_details_confpath (glusterd_volinfo_t *volinfo, dict_t *dict,
+ char **slave_ip, char **slave_vol,
+ char **conf_path, char **op_errstr);
+
+int
+glusterd_get_slave_info (char *slave, char **slave_ip,
+ char **slave_vol, char **op_errstr);
+
+int
+glusterd_get_statefile_name (glusterd_volinfo_t *volinfo, char *slave,
+ char *conf_path, char **statefile);
+
+int
+glusterd_gsync_read_frm_status (char *path, char *buf, size_t blen);
+
+int
+glusterd_check_restart_gsync_session (glusterd_volinfo_t *volinfo, char *slave,
+ dict_t *resp_dict, char *path_list,
+ char *conf_path, gf_boolean_t is_force);
+
+int
+glusterd_check_gsync_running_local (char *master, char *slave,
+ char *conf_path,
+ gf_boolean_t *is_run);
+
+gf_boolean_t
+glusterd_is_status_tasks_op (glusterd_op_t op, dict_t *dict);
+
+gf_boolean_t
+gd_should_i_start_rebalance (glusterd_volinfo_t *volinfo);
+
+int
+glusterd_is_volume_quota_enabled (glusterd_volinfo_t *volinfo);
+
+gf_boolean_t
+glusterd_all_volumes_with_quota_stopped ();
+
+int
+glusterd_reconfigure_quotad ();
+
+void
+glusterd_clean_up_quota_store (glusterd_volinfo_t *volinfo);
+
+int
+glusterd_store_quota_conf_skip_header (xlator_t *this, int fd);
+
+int
+glusterd_store_quota_conf_stamp_header (xlator_t *this, int fd);
+
+int
+glusterd_remove_auxiliary_mount (char *volname);
+
+gf_boolean_t
+glusterd_status_has_tasks (int cmd);
+
+int
+gd_stop_rebalance_process (glusterd_volinfo_t *volinfo);
+
+rpc_clnt_t *
+glusterd_rpc_clnt_unref (glusterd_conf_t *conf, rpc_clnt_t *rpc);
+
+int32_t
+glusterd_compare_volume_name(struct list_head *, struct list_head *);
+
+char*
+glusterd_get_brick_mount_details (glusterd_brickinfo_t *brickinfo);
+
+struct mntent *
+glusterd_get_mnt_entry_info (char *mnt_pt, FILE *mtab);
+
+int
+glusterd_get_brick_root (char *path, char **mount_point);
+
+
+int
+glusterd_compare_snap_time(struct list_head *, struct list_head *);
+
+int
+glusterd_compare_snap_vol_time(struct list_head *, struct list_head *);
+
+int32_t
+glusterd_snap_volinfo_restore (dict_t *rsp_dict,
+ glusterd_volinfo_t *new_volinfo,
+ glusterd_volinfo_t *snap_volinfo);
+int32_t
+glusterd_lvm_snapshot_remove (dict_t *rsp_dict, glusterd_volinfo_t *snap_vol);
+
+int32_t
+glusterd_missed_snapinfo_new (glusterd_missed_snap_info **missed_snapinfo);
+
+int32_t
+glusterd_missed_snap_op_new (glusterd_snap_op_t **snap_op);
+
+int32_t
+glusterd_add_missed_snaps_to_dict (dict_t *rsp_dict,
+ glusterd_volinfo_t *snap_vol,
+ glusterd_brickinfo_t *brickinfo,
+ int32_t brick_number, int32_t op);
+
+int32_t
+glusterd_add_missed_snaps_to_export_dict (dict_t *peer_data);
+
+int32_t
+glusterd_import_friend_missed_snap_list (dict_t *peer_data);
+
+int32_t
+gd_restore_snap_volume (dict_t *rsp_dict,
+ glusterd_volinfo_t *orig_vol,
+ glusterd_volinfo_t *snap_vol);
+
+int32_t
+glusterd_mount_lvm_snapshot (char *device_path, char *brick_mount_path);
+
+int32_t
+glusterd_add_snapshots_to_export_dict (dict_t *peer_data);
+
+int32_t
+glusterd_compare_friend_snapshots (dict_t *peer_data,
+ glusterd_peerinfo_t *peerinfo);
+
+int32_t
+glusterd_snapobject_delete (glusterd_snap_t *snap);
+
+int32_t
+glusterd_snap_volume_remove (dict_t *rsp_dict,
+ glusterd_volinfo_t *snap_vol,
+ gf_boolean_t remove_lvm,
+ gf_boolean_t force);
+
+int32_t
+glusterd_store_create_snap_dir (glusterd_snap_t *snap);
+
#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c
index 2784242fe..f42d596ba 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c
@@ -15,6 +15,7 @@
#include <fnmatch.h>
#include <sys/wait.h>
+#include <dlfcn.h>
#if (HAVE_LIB_XML)
#include <libxml/encoding.h>
@@ -27,6 +28,8 @@
#include "logging.h"
#include "dict.h"
#include "graph-utils.h"
+#include "glusterd-store.h"
+#include "glusterd-hooks.h"
#include "trie.h"
#include "glusterd-mem-types.h"
#include "cli1-xdr.h"
@@ -34,216 +37,9 @@
#include "glusterd-op-sm.h"
#include "glusterd-utils.h"
#include "run.h"
+#include "options.h"
-#define AUTH_ALLOW_MAP_KEY "auth.allow"
-#define AUTH_REJECT_MAP_KEY "auth.reject"
-#define NFS_DISABLE_MAP_KEY "nfs.disable"
-#define AUTH_ALLOW_OPT_KEY "auth.addr.*.allow"
-#define AUTH_REJECT_OPT_KEY "auth.addr.*.reject"
-#define NFS_DISABLE_OPT_KEY "nfs.*.disable"
-
-
-/* dispatch table for VOLUME SET
- * -----------------------------
- *
- * Format of entries:
- *
- * First field is the <key>, for the purpose of looking it up
- * in volume dictionary. Each <key> is of the format "<domain>.<specifier>".
- *
- * Second field is <voltype>.
- *
- * Third field is <option>, if its unset, it's assumed to be
- * the same as <specifier>.
- *
- * Fourth field is <value>. In this context they are used to specify
- * a default. That is, even the volume dict doesn't have a value,
- * we procced as if the default value were set for it.
- *
- * There are two type of entries: basic and special.
- *
- * - Basic entries are the ones where the <option> does _not_ start with
- * the bang! character ('!').
- *
- * In their case, <option> is understood as an option for an xlator of
- * type <voltype>. Their effect is to copy over the volinfo->dict[<key>]
- * value to all graph nodes of type <voltype> (if such a value is set).
- *
- * You are free to add entries of this type, they will become functional
- * just by being present in the table.
- *
- * - Special entries where the <option> starts with the bang!.
- *
- * They are not applied to all graphs during generation, and you cannot
- * extend them in a trivial way which could be just picked up. Better
- * not touch them unless you know what you do.
- *
- * "NODOC" entries are not part of the public interface and are subject
- * to change at any time.
- *
- * Another kind of grouping for options, according to visibility:
- *
- * - Exported: one which is used in the code. These are characterized by
- * being used a macro as <key> (of the format VKEY_..., defined in
- * glusterd-volgen.h
- *
- * - Non-exported: the rest; these have string literal <keys>.
- *
- * Adhering to this policy, option name changes shall be one-liners.
- *
- */
-typedef enum { DOC, NO_DOC, GLOBAL_DOC, GLOBAL_NO_DOC } option_type_t;
-
-
-struct volopt_map_entry {
- char *key;
- char *voltype;
- char *option;
- char *value;
- option_type_t type;
- uint32_t flags;
-};
-
-static struct volopt_map_entry glusterd_volopt_map[] = {
-
- {"cluster.lookup-unhashed", "cluster/distribute", NULL, NULL, NO_DOC, 0 },
- {"cluster.min-free-disk", "cluster/distribute", NULL, NULL, NO_DOC, 0 },
- {"cluster.min-free-inodes", "cluster/distribute", NULL, NULL, NO_DOC, 0 },
- {"cluster.rebalance-stats", "cluster/distribute", NULL, NULL, NO_DOC, 0 },
- {"cluster.subvols-per-directory", "cluster/distribute", "directory-layout-spread", NULL, NO_DOC, 0 },
- {"cluster.readdir-optimize", "cluster/distribute", NULL, NULL, NO_DOC, 0 },
-
- {"cluster.entry-change-log", "cluster/replicate", NULL, NULL, NO_DOC, 0 },
- {"cluster.read-subvolume", "cluster/replicate", NULL, NULL, NO_DOC, 0 },
- {"cluster.read-subvolume-index", "cluster/replicate", NULL, NULL, NO_DOC, 0 },
- {"cluster.read-hash-mode", "cluster/replicate", NULL, NULL, NO_DOC, 0},
- {"cluster.background-self-heal-count", "cluster/replicate", NULL, NULL, NO_DOC, 0 },
- {"cluster.metadata-self-heal", "cluster/replicate", NULL, NULL, NO_DOC, 0 },
- {"cluster.data-self-heal", "cluster/replicate", NULL, NULL, NO_DOC, 0 },
- {"cluster.entry-self-heal", "cluster/replicate", NULL, NULL, NO_DOC, 0 },
- {"cluster.self-heal-daemon", "cluster/replicate", "!self-heal-daemon" , NULL, NO_DOC, 0 },
- {"cluster.heal-timeout", "cluster/replicate", "!heal-timeout" , NULL, NO_DOC, 0 },
- {"cluster.strict-readdir", "cluster/replicate", NULL, NULL, NO_DOC, 0 },
- {"cluster.self-heal-window-size", "cluster/replicate", "data-self-heal-window-size", NULL, DOC, 0},
- {"cluster.data-change-log", "cluster/replicate", NULL, NULL, NO_DOC, 0 },
- {"cluster.metadata-change-log", "cluster/replicate", NULL, NULL, NO_DOC, 0 },
- {"cluster.data-self-heal-algorithm", "cluster/replicate", "data-self-heal-algorithm", NULL,DOC, 0},
- {"cluster.eager-lock", "cluster/replicate", NULL, NULL, NO_DOC, 0 },
- {"cluster.quorum-type", "cluster/replicate", "quorum-type", NULL, NO_DOC, 0},
- {"cluster.quorum-count", "cluster/replicate", "quorum-count", NULL, NO_DOC, 0},
- {"cluster.choose-local", "cluster/replicate", NULL, NULL, DOC, 0},
- {"cluster.self-heal-readdir-size", "cluster/replicate", NULL, NULL, NO_DOC, 0},
-
- {"cluster.stripe-block-size", "cluster/stripe", "block-size", NULL, DOC, 0},
- {"cluster.stripe-coalesce", "cluster/stripe", "coalesce", NULL, DOC, 0},
-
- {VKEY_DIAG_LAT_MEASUREMENT, "debug/io-stats", "latency-measurement", "off", NO_DOC, 0},
- {"diagnostics.dump-fd-stats", "debug/io-stats", NULL, NULL, NO_DOC, 0},
- {VKEY_DIAG_CNT_FOP_HITS, "debug/io-stats", "count-fop-hits", "off", NO_DOC, 0},
-
- {"diagnostics.brick-log-level", "debug/io-stats", "!brick-log-level", NULL, DOC, 0},
- {"diagnostics.client-log-level", "debug/io-stats", "!client-log-level", NULL, DOC, 0},
- {"diagnostics.brick-sys-log-level", "debug/io-stats", "!sys-log-level", NULL, DOC, 0},
- {"diagnostics.client-sys-log-level", "debug/io-stats", "!sys-log-level", NULL, DOC, 0},
-
- {"performance.cache-max-file-size", "performance/io-cache", "max-file-size", NULL, DOC, 0},
- {"performance.cache-min-file-size", "performance/io-cache", "min-file-size", NULL, DOC, 0},
- {"performance.cache-refresh-timeout", "performance/io-cache", "cache-timeout", NULL, DOC, 0},
- {"performance.cache-priority", "performance/io-cache", "priority", NULL, DOC, 0},
- {"performance.cache-size", "performance/io-cache", NULL, NULL, NO_DOC, 0 },
- {"performance.cache-size", "performance/quick-read", NULL, NULL, NO_DOC, 0 },
- {"performance.flush-behind", "performance/write-behind", "flush-behind", NULL, DOC, 0},
- {"performance.md-cache-timeout", "performance/md-cache", "md-cache-timeout", NULL, DOC, 0},
-
- {"performance.io-thread-count", "performance/io-threads", "thread-count", NULL, DOC, 0},
- {"performance.high-prio-threads", "performance/io-threads", NULL, NULL, DOC, 0},
- {"performance.normal-prio-threads", "performance/io-threads", NULL, NULL, DOC, 0},
- {"performance.low-prio-threads", "performance/io-threads", NULL, NULL, DOC, 0},
- {"performance.least-prio-threads", "performance/io-threads", NULL, NULL, DOC, 0},
- {"performance.enable-least-priority", "performance/io-threads", NULL, NULL, DOC, 0},
- {"performance.disk-usage-limit", "performance/quota", NULL, NULL, NO_DOC, 0},
- {"performance.min-free-disk-limit", "performance/quota", NULL, NULL, NO_DOC, 0},
- {"performance.write-behind-window-size", "performance/write-behind", "cache-size", NULL, DOC},
- {"performance.strict-o-direct", "performance/write-behind", "strict-O_DIRECT", NULL, DOC},
- {"performance.strict-write-ordering", "performance/write-behind", "strict-write-ordering", NULL, DOC},
- {"performance.read-ahead-page-count", "performance/read-ahead", "page-count", NULL, DOC},
-
- {"network.frame-timeout", "protocol/client", NULL, NULL, NO_DOC, 0},
- {"network.ping-timeout", "protocol/client", NULL, NULL, NO_DOC, 0},
- {"network.tcp-window-size", "protocol/client", NULL, NULL, NO_DOC, 0},
- { "client.ssl", "protocol/client", "transport.socket.ssl-enabled", NULL, NO_DOC, 0},
-
- {"network.tcp-window-size", "protocol/server", NULL, NULL, NO_DOC, 0},
- {"network.inode-lru-limit", "protocol/server", NULL, NULL, NO_DOC, 0},
- {AUTH_ALLOW_MAP_KEY, "protocol/server", "!server-auth", "*", DOC, 0},
- {AUTH_REJECT_MAP_KEY, "protocol/server", "!server-auth", NULL, DOC, 0},
- {"transport.keepalive", "protocol/server", "transport.socket.keepalive", NULL, NO_DOC, 0},
- {"server.allow-insecure", "protocol/server", "rpc-auth-allow-insecure", NULL, NO_DOC, 0},
- { "server.ssl", "protocol/server", "transport.socket.ssl-enabled", NULL, NO_DOC, 0},
-
- {"performance.write-behind", "performance/write-behind", "!perf", "on", NO_DOC, 0},
- {"performance.read-ahead", "performance/read-ahead", "!perf", "on", NO_DOC, 0},
- {"performance.io-cache", "performance/io-cache", "!perf", "on", NO_DOC, 0},
- {"performance.quick-read", "performance/quick-read", "!perf", "on", NO_DOC, 0},
- {VKEY_PERF_STAT_PREFETCH, "performance/md-cache", "!perf", "on", NO_DOC, 0},
- {"performance.client-io-threads", "performance/io-threads", "!perf", "off", NO_DOC, 0},
-
- {"performance.nfs.write-behind", "performance/write-behind", "!nfsperf", "off", NO_DOC, 0},
- {"performance.nfs.read-ahead", "performance/read-ahead", "!nfsperf", "off", NO_DOC, 0},
- {"performance.nfs.io-cache", "performance/io-cache", "!nfsperf", "off", NO_DOC, 0},
- {"performance.nfs.quick-read", "performance/quick-read", "!nfsperf", "off", NO_DOC, 0},
- {"performance.nfs.stat-prefetch", "performance/md-cache", "!nfsperf", "off", NO_DOC, 0},
- {"performance.nfs.io-threads", "performance/io-threads", "!nfsperf", "off", NO_DOC, 0},
-
- {VKEY_MARKER_XTIME, "features/marker", "xtime", "off", NO_DOC, OPT_FLAG_FORCE},
- {VKEY_MARKER_XTIME, "features/marker", "!xtime", "off", NO_DOC, OPT_FLAG_FORCE},
- {"debug.trace", "debug/trace", "!debug","off", NO_DOC, 0},
- {"debug.error-gen", "debug/error-gen", "!debug","off", NO_DOC, 0},
-
- {"nfs.enable-ino32", "nfs/server", "nfs.enable-ino32", NULL, GLOBAL_DOC, 0},
- {"nfs.mem-factor", "nfs/server", "nfs.mem-factor", NULL, GLOBAL_DOC, 0},
- {"nfs.export-dirs", "nfs/server", "nfs3.export-dirs", NULL, GLOBAL_DOC, 0},
- {"nfs.export-volumes", "nfs/server", "nfs3.export-volumes", NULL, GLOBAL_DOC, 0},
- {"nfs.addr-namelookup", "nfs/server", "rpc-auth.addr.namelookup", NULL, GLOBAL_DOC, 0},
- {"nfs.dynamic-volumes", "nfs/server", "nfs.dynamic-volumes", NULL, GLOBAL_NO_DOC, 0},
- {"nfs.register-with-portmap", "nfs/server", "rpc.register-with-portmap", NULL, GLOBAL_DOC, 0},
- {"nfs.port", "nfs/server", "nfs.port", NULL, GLOBAL_DOC, 0},
-
- {"nfs.rpc-auth-unix", "nfs/server", "!rpc-auth.auth-unix.*", NULL, DOC, 0},
- {"nfs.rpc-auth-null", "nfs/server", "!rpc-auth.auth-null.*", NULL, DOC, 0},
- {"nfs.rpc-auth-allow", "nfs/server", "!rpc-auth.addr.*.allow", NULL, DOC, 0},
- {"nfs.rpc-auth-reject", "nfs/server", "!rpc-auth.addr.*.reject", NULL, DOC, 0},
- {"nfs.ports-insecure", "nfs/server", "!rpc-auth.ports.*.insecure", NULL, DOC, 0},
- {"nfs.transport-type", "nfs/server", "!nfs.transport-type", NULL, DOC, 0},
-
- {"nfs.trusted-sync", "nfs/server", "!nfs3.*.trusted-sync", NULL, DOC, 0},
- {"nfs.trusted-write", "nfs/server", "!nfs3.*.trusted-write", NULL, DOC, 0},
- {"nfs.volume-access", "nfs/server", "!nfs3.*.volume-access", NULL, DOC, 0},
- {"nfs.export-dir", "nfs/server", "!nfs3.*.export-dir", NULL, DOC, 0},
- {NFS_DISABLE_MAP_KEY, "nfs/server", "!nfs-disable", NULL, DOC, 0},
- {"nfs.nlm", "nfs/server", "nfs.nlm", NULL, GLOBAL_DOC, 0},
- {"nfs.mount-udp", "nfs/server", "nfs.mount-udp", NULL, GLOBAL_DOC, 0},
- {"nfs.server-aux-gids", "nfs/server", "nfs.server-aux-gids", NULL, NO_DOC, 0},
-
- {VKEY_FEATURES_QUOTA, "features/marker", "quota", "off", NO_DOC, OPT_FLAG_FORCE},
- {VKEY_FEATURES_LIMIT_USAGE, "features/quota", "limit-set", NULL, NO_DOC, 0},
- {"features.quota-timeout", "features/quota", "timeout", "0", DOC, 0},
- {"server.statedump-path", "protocol/server", "statedump-path", NULL, DOC, 0},
- {"features.lock-heal", "protocol/client", "lk-heal", NULL, NO_DOC, 0},
- {"features.lock-heal", "protocol/server", "lk-heal", NULL, DOC, 0},
- {"features.grace-timeout", "protocol/client", "grace-timeout", NULL, NO_DOC, 0},
- {"features.grace-timeout", "protocol/server", "grace-timeout", NULL, DOC, 0},
- {"features.read-only", "features/read-only", "!read-only", "off", DOC, 0},
- {"features.worm", "features/worm", "!worm", "off", DOC, 0},
- {"storage.linux-aio", "storage/posix", NULL, NULL, DOC, 0},
- {"storage.owner-uid", "storage/posix", "brick-uid", NULL, DOC, 0},
- {"storage.owner-gid", "storage/posix", "brick-gid", NULL, DOC, 0},
- {"config.memory-accounting", "configuration", "!config", NULL, DOC, 0},
- {"config.transport", "configuration", "!config", NULL, DOC, 0},
- {NULL, }
-};
-
-
+extern struct volopt_map_entry glusterd_volopt_map[];
/*********************************************
*
@@ -307,7 +103,6 @@ xlator_instantiate_va (const char *type, const char *format, va_list arg)
return NULL;
}
-#ifdef __not_used_as_of_now_
static xlator_t *
xlator_instantiate (const char *type, const char *format, ...)
{
@@ -320,7 +115,6 @@ xlator_instantiate (const char *type, const char *format, ...)
return xl;
}
-#endif
static int
volgen_xlator_link (xlator_t *pxl, xlator_t *cxl)
@@ -569,10 +363,9 @@ volopt_trie_section (int lvl, char **patt, char *word, char **hint, int hints)
GF_ASSERT (hints <= 2);
nodevec.cnt = hints;
ret = trie_measure_vec (trie, word, &nodevec);
- if (ret || !nodevec.nodes[0])
- trie_destroy (trie);
+ if (!ret && nodevec.nodes[0])
+ ret = process_nodevec (&nodevec, hint);
- ret = process_nodevec (&nodevec, hint);
trie_destroy (trie);
return ret;
@@ -656,7 +449,7 @@ struct opthandler_data {
};
static int
-process_option (dict_t *dict, char *key, data_t *value, void *param)
+process_option (char *key, data_t *value, void *param)
{
struct opthandler_data *odt = param;
struct volopt_map_entry vme = {0,};
@@ -668,6 +461,8 @@ process_option (dict_t *dict, char *key, data_t *value, void *param)
vme.key = key;
vme.voltype = odt->vme->voltype;
vme.option = odt->vme->option;
+ vme.op_version = odt->vme->op_version;
+
if (!vme.option) {
vme.option = strrchr (key, '.');
if (vme.option)
@@ -705,7 +500,7 @@ volgen_graph_set_options_generic (volgen_graph_t *graph, dict_t *dict,
data = dict_get (dict, vme->key);
if (data)
- process_option (dict, vme->key, data, &odt);
+ process_option (vme->key, data, &odt);
if (odt.rv)
return odt.rv;
@@ -719,8 +514,7 @@ volgen_graph_set_options_generic (volgen_graph_t *graph, dict_t *dict,
* in this context
*/
odt.data_t_fake = _gf_true;
- process_option (NULL, vme->key, (data_t *)vme->value,
- &odt);
+ process_option (vme->key, (data_t *)vme->value, &odt);
if (odt.rv)
return odt.rv;
}
@@ -780,33 +574,99 @@ optget_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,
return 0;
}
+static glusterd_server_xlator_t
+get_server_xlator (char *xlator)
+{
+ glusterd_server_xlator_t subvol = GF_XLATOR_NONE;
+
+ if (strcmp (xlator, "posix") == 0)
+ subvol = GF_XLATOR_POSIX;
+ if (strcmp (xlator, "acl") == 0)
+ subvol = GF_XLATOR_ACL;
+ if (strcmp (xlator, "locks") == 0)
+ subvol = GF_XLATOR_LOCKS;
+ if (strcmp (xlator, "io-threads") == 0)
+ subvol = GF_XLATOR_IOT;
+ if (strcmp (xlator, "index") == 0)
+ subvol = GF_XLATOR_INDEX;
+ if (strcmp (xlator, "marker") == 0)
+ subvol = GF_XLATOR_MARKER;
+ if (strcmp (xlator, "io-stats") == 0)
+ subvol = GF_XLATOR_IO_STATS;
+ if (strcmp (xlator, "bd") == 0)
+ subvol = GF_XLATOR_BD;
+
+ return subvol;
+}
+
+static glusterd_client_xlator_t
+get_client_xlator (char *xlator)
+{
+ glusterd_client_xlator_t subvol = GF_CLNT_XLATOR_NONE;
+
+ if (strcmp (xlator, "client") == 0)
+ subvol = GF_CLNT_XLATOR_FUSE;
+
+ return subvol;
+}
+
+static int
+debugxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,
+ void *param)
+{
+ char *volname = NULL;
+ gf_boolean_t enabled = _gf_false;
+
+ volname = param;
+
+ if (strcmp (vme->option, "!debug") != 0)
+ return 0;
+
+ if (!strcmp (vme->key , "debug.trace") ||
+ !strcmp (vme->key, "debug.error-gen")) {
+ if (get_server_xlator (vme->value) == GF_XLATOR_NONE &&
+ get_client_xlator (vme->value) == GF_CLNT_XLATOR_NONE)
+ return 0;
+ else
+ goto add_graph;
+ }
+
+ if (gf_string2boolean (vme->value, &enabled) == -1)
+ return -1;
+ if (!enabled)
+ return 0;
+
+add_graph:
+ if (volgen_graph_add (graph, vme->voltype, volname))
+ return 0;
+ else
+ return -1;
+}
+
int
check_and_add_debug_xl (volgen_graph_t *graph, dict_t *set_dict, char *volname,
char *xlname)
{
int ret = 0;
- xlator_t *xl = NULL;
char *value_str = NULL;
ret = dict_get_str (set_dict, "debug.trace", &value_str);
if (!ret) {
if (strcmp (xlname, value_str) == 0) {
- xl = volgen_graph_add (graph, "debug/trace", volname);
- if (!xl) {
- ret = -1;
+ ret = volgen_graph_set_options_generic (graph, set_dict, volname,
+ &debugxl_option_handler);
+ if (ret)
goto out;
- }
}
}
ret = dict_get_str (set_dict, "debug.error-gen", &value_str);
if (!ret) {
if (strcmp (xlname, value_str) == 0) {
- xl = volgen_graph_add (graph, "debug/error-gen", volname);
- if (!xl) {
- ret = -1;
+ ret = volgen_graph_set_options_generic (graph, set_dict, volname,
+ &debugxl_option_handler);
+ if (ret)
goto out;
- }
}
}
@@ -879,7 +739,7 @@ int
glusterd_volinfo_get_boolean (glusterd_volinfo_t *volinfo, char *key)
{
char *val = NULL;
- gf_boolean_t boo = _gf_false;
+ gf_boolean_t enabled = _gf_false;
int ret = 0;
ret = glusterd_volinfo_get (volinfo, key, &val);
@@ -887,14 +747,14 @@ glusterd_volinfo_get_boolean (glusterd_volinfo_t *volinfo, char *key)
return -1;
if (val)
- ret = gf_string2boolean (val, &boo);
+ ret = gf_string2boolean (val, &enabled);
if (ret) {
gf_log ("", GF_LOG_ERROR, "value for %s option is not valid", key);
return -1;
}
- return boo;
+ return enabled;
}
gf_boolean_t
@@ -987,6 +847,7 @@ glusterd_check_option_exists (char *key, char **completion)
struct volopt_map_entry vme = {0,};
struct volopt_map_entry *vmep = NULL;
int ret = 0;
+ xlator_t *this = THIS;
(void)vme;
(void)vmep;
@@ -995,7 +856,8 @@ glusterd_check_option_exists (char *key, char **completion)
if (completion) {
ret = option_complete (key, completion);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Out of memory");
+ gf_log (this->name, GF_LOG_ERROR,
+ "Out of memory");
return -1;
}
@@ -1021,7 +883,7 @@ glusterd_check_option_exists (char *key, char **completion)
trie:
ret = volopt_trie (key, completion);
if (ret) {
- gf_log ("", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"Some error occurred during keyword hinting");
}
@@ -1394,8 +1256,8 @@ static int
server_check_marker_off (volgen_graph_t *graph, struct volopt_map_entry *vme,
glusterd_volinfo_t *volinfo)
{
- gf_boolean_t bool = _gf_false;
- int ret = 0;
+ gf_boolean_t enabled = _gf_false;
+ int ret = 0;
GF_ASSERT (volinfo);
GF_ASSERT (vme);
@@ -1403,8 +1265,8 @@ server_check_marker_off (volgen_graph_t *graph, struct volopt_map_entry *vme,
if (strcmp (vme->option, "!xtime") != 0)
return 0;
- ret = gf_string2boolean (vme->value, &bool);
- if (ret || bool)
+ ret = gf_string2boolean (vme->value, &enabled);
+ if (ret || enabled)
goto out;
ret = glusterd_volinfo_get_boolean (volinfo, VKEY_MARKER_XTIME);
@@ -1415,10 +1277,10 @@ server_check_marker_off (volgen_graph_t *graph, struct volopt_map_entry *vme,
}
if (ret) {
- bool = _gf_false;
- ret = glusterd_check_gsync_running (volinfo, &bool);
+ enabled = _gf_false;
+ ret = glusterd_check_gsync_running (volinfo, &enabled);
- if (bool) {
+ if (enabled) {
gf_log ("", GF_LOG_WARNING, GEOREP" sessions active"
"for the volume %s, cannot disable marker "
,volinfo->volname);
@@ -1464,6 +1326,44 @@ sys_loglevel_option_handler (volgen_graph_t *graph,
}
static int
+logger_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,
+ void *param)
+{
+ char *role = NULL;
+ struct volopt_map_entry vme2 = {0,};
+
+ role = (char *) param;
+
+ if (strcmp (vme->option, "!logger") != 0 ||
+ !strstr (vme->key, role))
+ return 0;
+
+ memcpy (&vme2, vme, sizeof (vme2));
+ vme2.option = "logger";
+
+ return basic_option_handler (graph, &vme2, NULL);
+}
+
+static int
+log_format_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,
+ void *param)
+{
+ char *role = NULL;
+ struct volopt_map_entry vme2 = {0,};
+
+ role = (char *) param;
+
+ if (strcmp (vme->option, "!log-format") != 0 ||
+ !strstr (vme->key, role))
+ return 0;
+
+ memcpy (&vme2, vme, sizeof (vme2));
+ vme2.option = "log-format";
+
+ return basic_option_handler (graph, &vme2, NULL);
+}
+
+static int
volgen_graph_set_xl_options (volgen_graph_t *graph, dict_t *dict)
{
int32_t ret = -1;
@@ -1515,6 +1415,12 @@ server_spec_option_handler (volgen_graph_t *graph,
if (!ret)
ret = sys_loglevel_option_handler (graph, vme, "brick");
+ if (!ret)
+ ret = logger_option_handler (graph, vme, "brick");
+
+ if (!ret)
+ ret = log_format_option_handler (graph, vme, "brick");
+
return ret;
}
@@ -1537,29 +1443,341 @@ server_spec_extended_option_handler (volgen_graph_t *graph,
static void get_vol_tstamp_file (char *filename, glusterd_volinfo_t *volinfo);
+xlator_t *
+add_one_peer (volgen_graph_t *graph, glusterd_brickinfo_t *peer,
+ char *volname, uint16_t index)
+{
+ xlator_t *kid;
+
+ kid = volgen_graph_add_nolink (graph, "protocol/client",
+ "%s-client-%u", volname,
+ index++);
+ if (!kid) {
+ return NULL;
+ }
+
+ /* TBD: figure out where to get the proper transport list */
+ if (xlator_set_option(kid,"transport-type","socket")) {
+ return NULL;
+ }
+ if (xlator_set_option(kid,"remote-host",peer->hostname)) {
+ return NULL;
+ }
+ if (xlator_set_option(kid,"remote-subvolume",peer->path)) {
+ return NULL;
+ }
+ /* TBD: deal with RDMA, SSL */
+
+ return kid;
+}
+
+void
+assign_groups (glusterd_volinfo_t *volinfo)
+{
+ glusterd_brickinfo_t *brickinfo = NULL;
+ uint16_t group_num = 0;
+ int in_group = 0;
+ uuid_t tmp_uuid;
+
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ if (in_group == 0) {
+ uuid_generate(tmp_uuid);
+ }
+ brickinfo->group = group_num;
+ uuid_copy(brickinfo->nsr_uuid,tmp_uuid);
+ if (++in_group >= volinfo->replica_count) {
+ in_group = 0;
+ ++group_num;
+ }
+ }
+}
+
+int
+add_nsr_stuff (volgen_graph_t *graph, char *volname,
+ glusterd_brickinfo_t *brickinfo, glusterd_volinfo_t *volinfo,
+ char *changelog_basepath)
+{
+ xlator_t *me;
+ xlator_t *kid;
+ glusterd_brickinfo_t *peer;
+ uint16_t index = 0;
+ //uint32_t i=0;
+ char *leader_opt;
+ uint32_t replica_group_size = 1;
+ char dst[NSR_MAX_PATH_SIZE];
+ char local_path[NSR_MAX_PATH_SIZE];
+ char local_name[NSR_MAX_PATH_SIZE];
+ char hosts[NSR_MAX_PATH_SIZE * NSR_MAX_REPLICA_GROUP_SIZE];
+ char remote_names[NSR_MAX_REPLICA_GROUP_SIZE * NSR_MAX_PATH_SIZE];
+ char filepath[PATH_MAX] = {0,};
+ char lp[PATH_MAX] = {0,};
+ xlator_t *xl = NULL;
+ char s[256];
+ char transt[16] = {0,};
+ char auth[256];
+ char c_d[NSR_MAX_PATH_SIZE];
+ char *username = NULL, *password = NULL;
+ gf_boolean_t enable_recon = _gf_false;
+ static uint32_t nsr_port = 27000;
+
+ if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr.recon") > 0) {
+ enable_recon = _gf_true;
+ }
+
+ volgen_graph_t ng = {0,};
+ char path[PATH_MAX] = {0,};
+ char *ptr = NULL, *this = NULL, *that = NULL;
+ glusterd_conf_t *priv = NULL;
+
+
+ priv = THIS->private;
+ remote_names[0] = '\0';
+ that = gf_strdup (brickinfo->hostname);
+ this = gf_strdup (brickinfo->path);
+ ptr = strchr (this, '/');
+ while (ptr) {
+ *ptr = '-';
+ ptr = strchr (this, '/');
+ }
+ GLUSTERD_GET_VOLUME_DIR (path, volinfo, priv);
+ snprintf (dst, PATH_MAX,
+ "%s/%s/%s:%s",
+ path,
+ GLUSTERD_BRICK_INFO_DIR,
+ that,
+ this);
+
+ /* Create the NSR xlator, but defer linkage for now. */
+ me = xlator_instantiate ("cluster/nsr", "%s-nsr", volname);
+ if (!me || volgen_xlator_link(me,first_of(graph))) {
+ return -1;
+ }
+
+ strcpy(local_name, brickinfo->hostname);
+ strcpy(local_path, brickinfo->hostname);
+ strcat(local_name, ":");
+ strcat(local_name, brickinfo->path);
+ strcpy(hosts, brickinfo->hostname);
+
+ peer = list_prev (brickinfo, &volinfo->bricks,
+ glusterd_brickinfo_t, brick_list);
+ /* Check leader status while we have this pointer in hand. */
+ leader_opt = (!peer || (peer->group != brickinfo->group)) ? "yes"
+ : "no";
+ if (xlator_set_option(me,"vol-name",volname))
+ return -1;
+ if (xlator_set_option(me,"my-name",local_name))
+ return -1;
+ if (xlator_set_option(me,"leader",leader_opt))
+ return -1;
+ if (xlator_set_option(me,"subvol-uuid",
+ uuid_utoa(brickinfo->nsr_uuid))) {
+ return -1;
+ }
+
+#define FILL_REMOTE_NAMES { \
+ strcat(remote_names, \
+ peer->hostname); \
+ strcat(remote_names, \
+ ":"); \
+ strcat(remote_names, \
+ peer->path); \
+ strcat(remote_names, \
+ ","); \
+ strcat(hosts, ","); \
+ strcat(hosts, \
+ peer->hostname); \
+ replica_group_size++; \
+}
+
+ /* Now get on with the show. */
+ while (peer) {
+ if (peer->group != brickinfo->group) {
+ break;
+ }
+ gf_log ("glusterd", GF_LOG_INFO,
+ "%s:%s needs client for %s:%s",
+ brickinfo->hostname, brickinfo->path,
+ peer->hostname, peer->path);
+ kid = add_one_peer (graph, peer, volname, index++);
+ if (!kid || volgen_xlator_link(me,kid)) {
+ return -1;
+ }
+ FILL_REMOTE_NAMES;
+ peer = list_prev (peer, &volinfo->bricks,
+ glusterd_brickinfo_t, brick_list);
+ }
+
+ peer = list_next (brickinfo, &volinfo->bricks,
+ glusterd_brickinfo_t, brick_list);
+ while (peer) {
+ if (peer->group != brickinfo->group) {
+ break;
+ }
+ gf_log ("glusterd", GF_LOG_INFO,
+ "%s:%s needs client for %s:%s",
+ brickinfo->hostname, brickinfo->path,
+ peer->hostname, peer->path);
+ kid = add_one_peer (graph, peer, volname, index++);
+ if (!kid || volgen_xlator_link(me,kid)) {
+ return -1;
+ }
+ FILL_REMOTE_NAMES;
+ peer = list_next (peer, &volinfo->bricks,
+ glusterd_brickinfo_t, brick_list);
+ }
+
+ // to remove the final ","
+ if (strlen(remote_names)) {
+ remote_names[strlen(remote_names) - 1] = '\0';
+ }
+ if (xlator_set_option(me,"etcd-servers",hosts))
+ return -1;
+
+ // Finish linkage to client file
+ glusterfs_graph_set_first(&graph->graph,me);
+
+ if (enable_recon == _gf_false)
+ return 0;
+
+ /* Now fill in the various files required for reconciliation */
+ snprintf (filepath, PATH_MAX,
+ "%s-nsr-recon.vol",
+ dst);
+ gf_log ("glusterd", GF_LOG_INFO,
+ "writing nsr recon volfile in %s\n",
+ filepath);
+#if 0
+ strcpy(lp, local_name);
+#else
+ strcpy(lp, brickinfo->path);
+#endif
+ strcat(lp,"/recon");
+ bzero(&ng, sizeof(ng));
+ xl = volgen_graph_add_as (&ng, "cluster/nsr_recon",lp);
+ if (!xl)
+ return -1;
+ sprintf(s,"%d",replica_group_size);
+ if (xlator_set_option(xl, "replica-group-size", s) == -1)
+ return -1;
+ if (xlator_set_option(xl, "local-member", local_name) == -1)
+ return -1;
+ if (xlator_set_option(xl, "replica-group-members", remote_names) == -1)
+ return -1;
+ if (xlator_set_option(xl,"vol-name",volname))
+ return -1;
+ if (xlator_set_option(xl,"changelog-dir",changelog_basepath))
+ return -1;
+ if (xlator_set_option(xl,"base-dir",brickinfo->path))
+ return -1;
+
+ xl = volgen_graph_add (&ng, "protocol/server", lp);
+ if (!xl)
+ return -1;
+ get_vol_transport_type (volinfo, transt);
+ if(xlator_set_option (xl, "transport-type", transt) == -1)
+ return -1;
+ sprintf(s,"%d",nsr_port);
+ if(xlator_set_option (xl, "transport.socket.listen-port", s) == -1)
+ return -1;
+ strcpy(auth, "auth.addr.");
+ strcat(auth, lp);
+ strcat(auth, ".allow");
+ if(xlator_set_option (xl, auth, "*") == -1)
+ return -1;
+ if(xlator_set_option (xl, "rpc-auth.auth-null", "off") == -1)
+ return -1;
+ if(xlator_set_option (xl, "rpc-auth.auth-unix", "off") == -1)
+ return -1;
+ if(xlator_set_option (xl, "rpc-auth.auth-glusterfs", "off") == -1)
+ return -1;
+ if(volgen_write_volfile(&ng, filepath) == -1)
+ return -1;
+
+ bzero(&ng, sizeof(ng));
+ kid = volgen_graph_add_nolink (&ng, "protocol/client",
+ "%s-client-%u", lp, 0);
+ if (!kid)
+ return -1;
+ if (xlator_set_option(kid,"remote-host",brickinfo->hostname))
+ return -1;
+#if 0
+ strcpy(lp, brickinfo->path);
+ strcat(lp,"/recon");
+#endif
+ if (xlator_set_option(kid,"remote-subvolume",lp))
+ return -1;
+ if(xlator_set_option (kid, "transport-type", transt) == -1)
+ return -1;
+ sprintf(s,"%d",nsr_port++);
+ if(xlator_set_option (kid, "remote-port", s) == -1)
+ return -1;
+ snprintf (c_d, PATH_MAX,
+ "%s/%s/con:%s:%s",
+ path,
+ GLUSTERD_BRICK_INFO_DIR,
+ that, this);
+ if (volgen_write_volfile(&ng, c_d))
+ return -1;
+
+ bzero(&ng, sizeof(ng));
+ kid = volgen_graph_add_nolink (&ng, "protocol/client",
+ "%s-client-%u", lp, 0);
+ if (!kid)
+ return -1;
+ if (xlator_set_option(kid,"remote-host",brickinfo->hostname))
+ return -1;
+ if (xlator_set_option(kid,"remote-subvolume",brickinfo->path))
+ return -1;
+ if(xlator_set_option (kid, "transport-type", transt) == -1)
+ return -1;
+ username = glusterd_auth_get_username (volinfo);
+ password = glusterd_auth_get_password (volinfo);
+ if(xlator_set_option (kid, "username", username) == -1)
+ return -1;
+ if(xlator_set_option (kid, "password", password) == -1)
+ return -1;
+ snprintf (c_d, PATH_MAX,
+ "%s/%s/data:%s:%s",
+ path,
+ GLUSTERD_BRICK_INFO_DIR, that,
+ this);
+ if (volgen_write_volfile(&ng, c_d))
+ return -1;
+
+ return 0;
+
+}
+
static int
server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
dict_t *set_dict, void *param)
{
- char *volname = NULL;
- char *path = NULL;
- int pump = 0;
- xlator_t *xl = NULL;
- xlator_t *txl = NULL;
- xlator_t *rbxl = NULL;
- char transt[16] = {0,};
- char *ptranst = NULL;
- char volume_id[64] = {0,};
- char tstamp_file[PATH_MAX] = {0,};
- int ret = 0;
- char *xlator = NULL;
- char *loglevel = NULL;
- char *username = NULL;
- char *password = NULL;
- char index_basepath[PATH_MAX] = {0};
- char key[1024] = {0};
-
- path = param;
+ char *volname = NULL;
+ char *path = NULL;
+ int pump = 0;
+ xlator_t *xl = NULL;
+ xlator_t *txl = NULL;
+ xlator_t *rbxl = NULL;
+ char transt[16] = {0,};
+ char *ptranst = NULL;
+ char volume_id[64] = {0,};
+ char tstamp_file[PATH_MAX] = {0,};
+ int ret = 0;
+ char *xlator = NULL;
+ char *loglevel = NULL;
+ char *username = NULL;
+ char *password = NULL;
+ char index_basepath[PATH_MAX] = {0};
+ char key[1024] = {0};
+ glusterd_brickinfo_t *brickinfo = NULL;
+ char changelog_basepath[PATH_MAX] = {0,};
+ gf_boolean_t quota_enabled = _gf_true;
+ gf_boolean_t pgfid_feat = _gf_false;
+ char *value = NULL;
+
+ brickinfo = param;
+ path = brickinfo->path;
volname = volinfo->volname;
get_vol_transport_type (volinfo, transt);
@@ -1575,6 +1793,22 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
}
}
+ ret = glusterd_volinfo_get (volinfo, VKEY_FEATURES_QUOTA, &value);
+ if (value) {
+ ret = gf_string2boolean (value, &quota_enabled);
+ if (ret)
+ goto out;
+ }
+
+ ret = glusterd_volinfo_get (volinfo,
+ "update-link-count-parent",
+ &value);
+ if (value) {
+ ret = gf_string2boolean (value, &pgfid_feat);
+ if (ret)
+ goto out;
+ }
+
xl = volgen_graph_add (graph, "storage/posix", volname);
if (!xl)
return -1;
@@ -1588,9 +1822,59 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
if (ret)
return -1;
- ret = check_and_add_debug_xl (graph, set_dict, volname, "posix");
+ if (quota_enabled || pgfid_feat)
+ xlator_set_option (xl, "update-link-count-parent",
+ "on");
+
+ ret = check_and_add_debug_xl (graph, set_dict, volname,
+ "posix");
if (ret)
return -1;
+#ifdef HAVE_BD_XLATOR
+ if (*brickinfo->vg != '\0') {
+ /* Now add BD v2 xlator if volume is BD type */
+ xl = volgen_graph_add (graph, "storage/bd", volname);
+ if (!xl)
+ return -1;
+
+ ret = xlator_set_option (xl, "device", "vg");
+ if (ret)
+ return -1;
+ ret = xlator_set_option (xl, "export", brickinfo->vg);
+ if (ret)
+ return -1;
+
+ ret = check_and_add_debug_xl (graph, set_dict, volname, "bd");
+ if (ret)
+ return -1;
+
+ }
+#endif
+
+ xl = volgen_graph_add (graph, "features/changelog", volname);
+ if (!xl)
+ return -1;
+
+ ret = xlator_set_option (xl, "changelog-brick", path);
+ if (ret)
+ return -1;
+
+ snprintf (changelog_basepath, sizeof (changelog_basepath),
+ "%s/%s", path, ".glusterfs/changelogs");
+ ret = xlator_set_option (xl, "changelog-dir", changelog_basepath);
+ if (ret)
+ return -1;
+
+ if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+ ret = xlator_set_option (xl, "encoding", "ascii");
+ if (ret)
+ return -1;
+ }
+
+ ret = check_and_add_debug_xl (graph, set_dict, volname, "changelog");
+ if (ret)
+ return -1;
+
xl = volgen_graph_add (graph, "features/access-control", volname);
if (!xl)
@@ -1616,6 +1900,10 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
if (ret)
return -1;
+ xl = volgen_graph_add (graph, "features/barrier", volname);
+ if (!xl)
+ return -1;
+
ret = dict_get_int32 (volinfo->dict, "enable-pump", &pump);
if (ret == -ENOENT)
ret = pump = 0;
@@ -1666,9 +1954,19 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
return -1;
}
- xl = volgen_graph_add (graph, "features/index", volname);
- if (!xl)
- return -1;
+ /* TBD: conditionalize on NSR being enabled */
+ if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+ ret = add_nsr_stuff (graph, volname, brickinfo, volinfo,
+ changelog_basepath);
+ if (ret) {
+ return -1;
+ }
+ }
+ else {
+ xl = volgen_graph_add (graph, "features/index", volname);
+ if (!xl)
+ return -1;
+ }
snprintf (index_basepath, sizeof (index_basepath), "%s/%s",
path, ".glusterfs/indices");
@@ -1698,7 +1996,21 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
if (ret)
return -1;
- if (dict_get_str_boolean (set_dict, "features.read-only", 0) &&
+ xl = volgen_graph_add (graph, "features/quota", volname);
+ if (!xl)
+ return -1;
+ ret = xlator_set_option (xl, "volume-uuid", volname);
+ if (ret)
+ return -1;
+
+ ret = glusterd_volinfo_get (volinfo, VKEY_FEATURES_QUOTA, &value);
+ if (value) {
+ ret = xlator_set_option (xl, "server-quota", value);
+ if (ret)
+ return -1;
+ }
+
+ if (dict_get_str_boolean (set_dict, "features.read-only", 0) &&
dict_get_str_boolean (set_dict, "features.worm",0)) {
gf_log (THIS->name, GF_LOG_ERROR,
"read-only and worm cannot be set together");
@@ -1707,7 +2019,8 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
}
/* Check for read-only volume option, and add it to the graph */
- if (dict_get_str_boolean (set_dict, "features.read-only", 0)) {
+ if (dict_get_str_boolean (set_dict, "features.read-only", 0)
+ || volinfo -> is_snap_volume) {
xl = volgen_graph_add (graph, "features/read-only", volname);
if (!xl) {
ret = -1;
@@ -1724,6 +2037,21 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
}
}
+ /* Check for compress volume option, and add it to the graph on server side */
+ ret = dict_get_str_boolean (set_dict, "network.compression", 0);
+ if (ret == -1)
+ goto out;
+ if (ret) {
+ xl = volgen_graph_add (graph, "features/cdc", volname);
+ if (!xl) {
+ ret = -1;
+ goto out;
+ }
+ ret = xlator_set_option (xl, "mode", "server");
+ if (ret)
+ goto out;
+ }
+
xl = volgen_graph_add_as (graph, "debug/io-stats", path);
if (!xl)
return -1;
@@ -1735,6 +2063,16 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
if (ret)
return -1;
+ /*In the case of running multiple glusterds on a single machine,
+ * we should ensure that bricks don't listen on all IPs on that
+ * machine and break the IP based separation being brought about.*/
+ if (dict_get (THIS->options, "transport.socket.bind-address")) {
+ ret = xlator_set_option (xl, "transport.socket.bind-address",
+ brickinfo->hostname);
+ if (ret)
+ return -1;
+ }
+
if (username) {
memset (key, 0, sizeof (key));
snprintf (key, sizeof (key), "auth.login.%s.allow", path);
@@ -1767,9 +2105,9 @@ server_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
/* builds a graph for server role , with option overrides in mod_dict */
static int
build_server_graph (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
- dict_t *mod_dict, char *path)
+ dict_t *mod_dict, glusterd_brickinfo_t *brickinfo)
{
- return build_graph_generic (graph, volinfo, mod_dict, path,
+ return build_graph_generic (graph, volinfo, mod_dict, brickinfo,
&server_graph_builder);
}
@@ -1777,10 +2115,11 @@ static int
perfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,
void *param)
{
- char *volname = NULL;
gf_boolean_t enabled = _gf_false;
+ glusterd_volinfo_t *volinfo = NULL;
- volname = param;
+ GF_ASSERT (param);
+ volinfo = param;
if (strcmp (vme->option, "!perf") != 0)
return 0;
@@ -1790,7 +2129,13 @@ perfxl_option_handler (volgen_graph_t *graph, struct volopt_map_entry *vme,
if (!enabled)
return 0;
- if (volgen_graph_add (graph, vme->voltype, volname))
+ /* Check op-version before adding the 'open-behind' xlator in the graph
+ */
+ if (!strcmp (vme->key, "performance.open-behind") &&
+ (vme->op_version > volinfo->client_op_version))
+ return 0;
+
+ if (volgen_graph_add (graph, vme->voltype, volinfo->volname))
return 0;
else
return -1;
@@ -1955,7 +2300,7 @@ xml_add_volset_element (xmlTextWriterPtr writer, const char *name,
#endif
static int
-get_key_from_volopt ( struct volopt_map_entry *vme, char **key)
+_get_xlator_opt_key_from_vme ( struct volopt_map_entry *vme, char **key)
{
int ret = 0;
@@ -1998,11 +2343,22 @@ get_key_from_volopt ( struct volopt_map_entry *vme, char **key)
return ret;
}
+static void
+_free_xlator_opt_key (char *key)
+{
+ GF_ASSERT (key);
+
+ if (!strcmp (key, AUTH_ALLOW_OPT_KEY) ||
+ !strcmp (key, AUTH_REJECT_OPT_KEY) ||
+ !strcmp (key, NFS_DISABLE_OPT_KEY))
+ GF_FREE (key);
+
+ return;
+}
+
int
glusterd_get_volopt_content (dict_t * ctx, gf_boolean_t xml_out)
{
-
- char *xlator_type = NULL;
void *dl_handle = NULL;
volume_opt_list_t vol_opt_handle = {{0},};
char *key = NULL;
@@ -2010,9 +2366,9 @@ glusterd_get_volopt_content (dict_t * ctx, gf_boolean_t xml_out)
int ret = -1;
char *def_val = NULL;
char *descr = NULL;
- char output_string[16384] = {0, };
+ char output_string[51200] = {0, };
char *output = NULL;
- char tmp_str[1024] = {0, };
+ char tmp_str[2048] = {0, };
#if (HAVE_LIB_XML)
xmlTextWriterPtr writer = NULL;
xmlBufferPtr buf = NULL;
@@ -2028,46 +2384,69 @@ glusterd_get_volopt_content (dict_t * ctx, gf_boolean_t xml_out)
for (vme = &glusterd_volopt_map[0]; vme->key; vme++) {
- if ( ( vme->type == NO_DOC) || (vme->type == GLOBAL_NO_DOC) )
+ if ((vme->type == NO_DOC) || (vme->type == GLOBAL_NO_DOC))
continue;
- if (get_key_from_volopt (vme, &key))
- goto out; /*Some error while getin key*/
+ if (vme->description) {
+ descr = vme->description;
+ def_val = vme->value;
+ } else {
+ if (_get_xlator_opt_key_from_vme (vme, &key)) {
+ gf_log ("glusterd", GF_LOG_DEBUG, "Failed to "
+ "get %s key from volume option entry",
+ vme->key);
+ goto out; /*Some error while geting key*/
+ }
- if (!xlator_type || strcmp (vme->voltype, xlator_type)){
ret = xlator_volopt_dynload (vme->voltype,
&dl_handle,
&vol_opt_handle);
+
if (ret) {
- dl_handle = NULL;
- continue;
+ gf_log ("glusterd", GF_LOG_DEBUG,
+ "xlator_volopt_dynload error(%d)", ret);
+ ret = 0;
+ goto cont;
}
- }
- ret = xlator_option_info_list (&vol_opt_handle, key,
- &def_val, &descr);
- if (ret) /*Swallow Error i.e if option not found*/
- continue;
+ ret = xlator_option_info_list (&vol_opt_handle, key,
+ &def_val, &descr);
+ if (ret) { /*Swallow Error i.e if option not found*/
+ gf_log ("glusterd", GF_LOG_DEBUG,
+ "Failed to get option for %s key", key);
+ ret = 0;
+ goto cont;
+ }
+ }
if (xml_out) {
#if (HAVE_LIB_XML)
if (xml_add_volset_element (writer,vme->key,
- def_val, descr))
- goto out;
+ def_val, descr)) {
+ ret = -1;
+ goto cont;
+ }
#else
gf_log ("glusterd", GF_LOG_ERROR, "Libxml not present");
#endif
} else {
- snprintf (tmp_str, 1024, "Option: %s\nDefault "
+ snprintf (tmp_str, sizeof (tmp_str), "Option: %s\nDefault "
"Value: %s\nDescription: %s\n\n",
vme->key, def_val, descr);
strcat (output_string, tmp_str);
}
-
- if (!strcmp (key, AUTH_ALLOW_OPT_KEY) ||
- !strcmp (key, AUTH_REJECT_OPT_KEY) ||
- !strcmp (key, NFS_DISABLE_OPT_KEY))
- GF_FREE (key);
+cont:
+ if (dl_handle) {
+ dlclose (dl_handle);
+ dl_handle = NULL;
+ vol_opt_handle.given_opt = NULL;
+ }
+ if (key) {
+ _free_xlator_opt_key (key);
+ key = NULL;
+ }
+ if (ret)
+ goto out;
}
#if (HAVE_LIB_XML)
@@ -2094,7 +2473,7 @@ glusterd_get_volopt_content (dict_t * ctx, gf_boolean_t xml_out)
}
ret = dict_set_dynstr (ctx, "help-str", output);
- out:
+out:
gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
@@ -2144,7 +2523,7 @@ volgen_graph_build_clients (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
list_for_each_entry (brick, &volinfo->bricks, brick_list) {
ret = -1;
xl = volgen_graph_add_nolink (graph, "protocol/client",
- "%s-client-%d", volname, i);
+ "%s", brick->brick_id);
if (!xl)
goto out;
ret = xlator_set_option (xl, "remote-host", brick->hostname);
@@ -2350,18 +2729,46 @@ out:
static int
volgen_graph_build_dht_cluster (volgen_graph_t *graph,
- glusterd_volinfo_t *volinfo, size_t child_count)
+ glusterd_volinfo_t *volinfo, size_t child_count,
+ gf_boolean_t is_quotad)
{
int32_t clusters = 0;
int ret = -1;
char *decommissioned_children = NULL;
xlator_t *dht = NULL;
+ char *voltype = "cluster/distribute";
+ char *name_fmt = NULL;
+
+ /* NUFA and Switch section */
+ if (dict_get_str_boolean (volinfo->dict, "cluster.nufa", 0) &&
+ dict_get_str_boolean (volinfo->dict, "cluster.switch", 0)) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "nufa and switch cannot be set together");
+ ret = -1;
+ goto out;
+ }
+
+ /* Check for NUFA volume option, and change the voltype */
+ if (dict_get_str_boolean (volinfo->dict, "cluster.nufa", 0))
+ voltype = "cluster/nufa";
+
+ /* Check for switch volume option, and change the voltype */
+ if (dict_get_str_boolean (volinfo->dict, "cluster.switch", 0))
+ voltype = "cluster/switch";
+
+ if (is_quotad)
+ name_fmt = "%s";
+ else
+ name_fmt = "%s-dht";
clusters = volgen_graph_build_clusters (graph, volinfo,
- "cluster/distribute", "%s-dht",
- child_count, child_count);
+ voltype,
+ name_fmt,
+ child_count,
+ child_count);
if (clusters < 0)
goto out;
+
dht = first_of (graph);
ret = _graph_get_decommissioned_children (dht, volinfo,
&decommissioned_children);
@@ -2381,10 +2788,11 @@ out:
static int
volume_volgen_graph_build_clusters (volgen_graph_t *graph,
- glusterd_volinfo_t *volinfo)
+ glusterd_volinfo_t *volinfo,
+ gf_boolean_t is_quotad)
{
- char *replicate_args[] = {"cluster/replicate",
- "%s-replicate-%d"};
+ char *replicate_type = "cluster/replicate";
+ char *replicate_fmt = "%s-replicate-%d";
char *stripe_args[] = {"cluster/stripe",
"%s-stripe-%d"};
int rclusters = 0;
@@ -2398,12 +2806,16 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph,
if (volinfo->dist_leaf_count == 1)
goto build_distribute;
+ if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+ replicate_type = "cluster/nsrc";
+ }
+
/* All other cases, it will have one or the other cluster type */
switch (volinfo->type) {
case GF_CLUSTER_TYPE_REPLICATE:
clusters = volgen_graph_build_clusters (graph, volinfo,
- replicate_args[0],
- replicate_args[1],
+ replicate_type,
+ replicate_fmt,
volinfo->brick_count,
volinfo->replica_count);
if (clusters < 0)
@@ -2423,8 +2835,8 @@ volume_volgen_graph_build_clusters (volgen_graph_t *graph,
if (volinfo->replica_count == 0)
goto out;
clusters = volgen_graph_build_clusters (graph, volinfo,
- replicate_args[0],
- replicate_args[1],
+ replicate_type,
+ replicate_fmt,
volinfo->brick_count,
volinfo->replica_count);
if (clusters < 0)
@@ -2454,7 +2866,7 @@ build_distribute:
}
ret = volgen_graph_build_dht_cluster (graph, volinfo,
- dist_count);
+ dist_count, is_quotad);
if (ret)
goto out;
@@ -2463,29 +2875,74 @@ out:
return ret;
}
+static int client_graph_set_perf_options(volgen_graph_t *graph,
+ glusterd_volinfo_t *volinfo,
+ dict_t *set_dict)
+{
+ data_t *tmp_data = NULL;
+ char *volname = NULL;
+
+ /*
+ * Logic to make sure NFS doesn't have performance translators by
+ * default for a volume
+ */
+ volname = volinfo->volname;
+ tmp_data = dict_get (set_dict, "nfs-volume-file");
+ if (!tmp_data)
+ return volgen_graph_set_options_generic(graph, set_dict,
+ volinfo,
+ &perfxl_option_handler);
+ else
+ return volgen_graph_set_options_generic(graph, set_dict,
+ volname,
+ &nfsperfxl_option_handler);
+}
+
static int
client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
dict_t *set_dict, void *param)
{
- int ret = 0;
- xlator_t *xl = NULL;
- char *volname = NULL;
- data_t *tmp_data = NULL;
+ int ret = 0;
+ xlator_t *xl = NULL;
+ char *volname = NULL;
+ glusterd_conf_t *conf = THIS->private;
+ char *tmp = NULL;
+ gf_boolean_t var = _gf_false;
+ gf_boolean_t ob = _gf_false;
+ xlator_t *this = THIS;
+
+ GF_ASSERT (this);
+ GF_ASSERT (conf);
volname = volinfo->volname;
ret = volgen_graph_build_clients (graph, volinfo, set_dict, param);
if (ret)
goto out;
- ret = volume_volgen_graph_build_clusters (graph, volinfo);
- if (ret)
+ ret = volume_volgen_graph_build_clusters (graph, volinfo, _gf_false);
+ if (ret == -1)
goto out;
- ret = glusterd_volinfo_get_boolean (volinfo, VKEY_FEATURES_QUOTA);
+ /* Check for compress volume option, and add it to the graph on client side */
+ ret = dict_get_str_boolean (set_dict, "network.compression", 0);
+ if (ret == -1)
+ goto out;
+ if (ret) {
+ xl = volgen_graph_add (graph, "features/cdc", volname);
+ if (!xl) {
+ ret = -1;
+ goto out;
+ }
+ ret = xlator_set_option (xl, "mode", "client");
+ if (ret)
+ goto out;
+ }
+
+ ret = glusterd_volinfo_get_boolean (volinfo, "features.encryption");
if (ret == -1)
goto out;
if (ret) {
- xl = volgen_graph_add (graph, "features/quota", volname);
+ xl = volgen_graph_add (graph, "encryption/crypt", volname);
if (!xl) {
ret = -1;
@@ -2493,16 +2950,121 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
}
}
- /* Logic to make sure NFS doesn't have performance translators by
- default for a volume */
- tmp_data = dict_get (set_dict, "nfs-volume-file");
- if (!tmp_data)
- ret = volgen_graph_set_options_generic (graph, set_dict, volname,
- &perfxl_option_handler);
- else
- ret = volgen_graph_set_options_generic (graph, set_dict, volname,
- &nfsperfxl_option_handler);
+ if (conf->op_version == GD_OP_VERSION_MIN) {
+ ret = glusterd_volinfo_get_boolean (volinfo,
+ VKEY_FEATURES_QUOTA);
+ if (ret == -1)
+ goto out;
+ if (ret) {
+ xl = volgen_graph_add (graph, "features/quota",
+ volname);
+ if (!xl) {
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+
+
+ ret = glusterd_volinfo_get_boolean (volinfo, "features.file-snapshot");
+ if (ret == -1)
+ goto out;
+ if (ret) {
+ xl = volgen_graph_add (graph, "features/qemu-block", volname);
+
+ if (!xl) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ /* Do not allow changing read-after-open option if root-squash is
+ enabled.
+ */
+ ret = dict_get_str (set_dict, "performance.read-after-open", &tmp);
+ if (!ret) {
+ ret = dict_get_str (volinfo->dict, "server.root-squash", &tmp);
+ if (!ret) {
+ ob = _gf_false;
+ ret = gf_string2boolean (tmp, &ob);
+ if (!ret && ob) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "root-squash is enabled. Please turn it"
+ " off to change read-after-open "
+ "option");
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+
+ /* open behind causes problems when root-squash is enabled
+ (by allowing reads to happen even though the squashed user
+ does not have permissions to do so) as it fakes open to be
+ successful and later sends reads on anonymous fds. So when
+ root-squash is enabled, open-behind's option to read after
+ open is done is also enabled.
+ */
+ ret = dict_get_str (set_dict, "server.root-squash", &tmp);
+ if (!ret) {
+ ret = gf_string2boolean (tmp, &var);
+ if (ret)
+ goto out;
+
+ if (var) {
+ ret = dict_get_str (volinfo->dict,
+ "performance.read-after-open",
+ &tmp);
+ if (!ret) {
+ ret = gf_string2boolean (tmp, &ob);
+ /* go ahead with turning read-after-open on
+ even if string2boolean conversion fails,
+ OR if read-after-open option is turned off
+ */
+ if (ret || !ob)
+ ret = dict_set_str (set_dict,
+ "performance.read-after-open",
+ "yes");
+ } else {
+ ret = dict_set_str (set_dict,
+ "performance.read-after-open",
+ "yes");
+ }
+ } else {
+ /* When root-squash has to be turned off, open-behind's
+ read-after-open option should be reset to what was
+ there before root-squash was turned on. If the option
+ cannot be found in volinfo's dict, it means that
+ option was not set before turning on root-squash.
+ */
+ ob = _gf_false;
+ ret = dict_get_str (volinfo->dict,
+ "performance.read-after-open",
+ &tmp);
+ if (!ret) {
+ ret = gf_string2boolean (tmp, &ob);
+
+ if (!ret && ob) {
+ ret = dict_set_str (set_dict,
+ "performance.read-after-open",
+ "yes");
+ }
+ }
+ /* consider operation is failure only if read-after-open
+ option is enabled and could not set into set_dict
+ */
+ if (!ob)
+ ret = 0;
+ }
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "setting "
+ "open behind option as part of root "
+ "squash failed");
+ goto out;
+ }
+ }
+ ret = client_graph_set_perf_options(graph, volinfo, set_dict);
if (ret)
goto out;
@@ -2521,14 +3083,28 @@ client_graph_builder (volgen_graph_t *graph, glusterd_volinfo_t *volinfo,
&loglevel_option_handler);
if (ret)
- gf_log (THIS->name, GF_LOG_WARNING, "changing client log level"
+ gf_log (this->name, GF_LOG_WARNING, "changing client log level"
" failed");
ret = volgen_graph_set_options_generic (graph, set_dict, "client",
&sys_loglevel_option_handler);
if (ret)
- gf_log (THIS->name, GF_LOG_WARNING, "changing client syslog "
+ gf_log (this->name, GF_LOG_WARNING, "changing client syslog "
"level failed");
+
+ ret = volgen_graph_set_options_generic (graph, set_dict, "client",
+ &logger_option_handler);
+
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "changing client logger"
+ " failed");
+
+ ret = volgen_graph_set_options_generic (graph, set_dict, "client",
+ &log_format_option_handler);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "changing client log format"
+ " failed");
+
out:
return ret;
}
@@ -2865,16 +3441,31 @@ build_shd_graph (volgen_graph_t *graph, dict_t *mod_dict)
&loglevel_option_handler);
if (ret)
- gf_log (THIS->name, GF_LOG_WARNING, "changing loglevel "
+ gf_log (this->name, GF_LOG_WARNING, "changing loglevel "
"of self-heal daemon failed");
ret = volgen_graph_set_options_generic (graph, set_dict,
"client",
&sys_loglevel_option_handler);
if (ret)
- gf_log (THIS->name, GF_LOG_WARNING, "changing syslog "
+ gf_log (this->name, GF_LOG_WARNING, "changing syslog "
"level of self-heal daemon failed");
+ ret = volgen_graph_set_options_generic (graph, set_dict,
+ "client",
+ &logger_option_handler);
+
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "changing logger "
+ "of self-heal daemon failed");
+
+ ret = volgen_graph_set_options_generic (graph, set_dict,
+ "client",
+ &log_format_option_handler);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "changing log "
+ "format level of self-heal daemon failed");
+
ret = dict_reset (set_dict);
if (ret)
goto out;
@@ -2900,6 +3491,8 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict)
char *skey = NULL;
int ret = 0;
char nfs_xprt[16] = {0,};
+ char *volname = NULL;
+ data_t *data = NULL;
this = THIS;
GF_ASSERT (this);
@@ -2925,6 +3518,10 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict)
if (ret)
goto out;
+ ret = xlator_set_option (nfsxl, "nfs.drc", "on");
+ if (ret)
+ goto out;
+
list_for_each_entry (voliter, &priv->volumes, vol_list) {
if (voliter->status != GLUSTERD_STATUS_STARTED)
continue;
@@ -2962,7 +3559,7 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict)
else
get_transport_type (voliter, voliter->dict, nfs_xprt, _gf_true);
- ret = dict_set_str (set_dict, VKEY_PERF_STAT_PREFETCH, "off");
+ ret = dict_set_str (set_dict, "performance.stat-prefetch", "off");
if (ret)
goto out;
@@ -2985,6 +3582,12 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict)
if (ret)
goto out;
+ if (mod_dict && (data = dict_get (mod_dict, "volume-name"))) {
+ volname = data->data;
+ if (strcmp (volname, voliter->volname) == 0)
+ dict_copy (mod_dict, set_dict);
+ }
+
ret = build_client_graph (&cgraph, voliter, set_dict);
if (ret)
goto out;
@@ -3031,9 +3634,6 @@ build_nfs_graph (volgen_graph_t *graph, dict_t *mod_dict)
return ret;
}
-
-
-
/****************************
*
* Volume generation interface
@@ -3067,16 +3667,21 @@ glusterd_is_valid_volfpath (char *volname, char *brick)
glusterd_brickinfo_t *brickinfo = NULL;
glusterd_volinfo_t *volinfo = NULL;
int32_t ret = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
ret = glusterd_brickinfo_new_from_brick (brick, &brickinfo);
if (ret) {
- gf_log ("", GF_LOG_WARNING, "brick path validation failed");
+ gf_log (this->name, GF_LOG_WARNING, "Failed to create brickinfo"
+ " for brick %s", brick );
ret = 0;
goto out;
}
ret = glusterd_volinfo_new (&volinfo);
if (ret) {
- gf_log ("", GF_LOG_WARNING, "brick path validation failed");
+ gf_log (this->name, GF_LOG_WARNING, "Failed to create volinfo");
ret = 0;
goto out;
}
@@ -3089,7 +3694,7 @@ out:
if (brickinfo)
glusterd_brickinfo_delete (brickinfo);
if (volinfo)
- glusterd_volinfo_delete (volinfo);
+ glusterd_volinfo_unref (volinfo);
return ret;
}
@@ -3106,7 +3711,7 @@ glusterd_generate_brick_volfile (glusterd_volinfo_t *volinfo,
get_brick_filepath (filename, volinfo, brickinfo);
- ret = build_server_graph (&graph, volinfo, NULL, brickinfo->path);
+ ret = build_server_graph (&graph, volinfo, NULL, brickinfo);
if (!ret)
ret = volgen_write_volfile (&graph, filename);
@@ -3115,7 +3720,100 @@ glusterd_generate_brick_volfile (glusterd_volinfo_t *volinfo,
return ret;
}
+static int
+build_quotad_graph (volgen_graph_t *graph, dict_t *mod_dict)
+{
+ volgen_graph_t cgraph = {0};
+ glusterd_volinfo_t *voliter = NULL;
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ dict_t *set_dict = NULL;
+ int ret = 0;
+ xlator_t *quotad_xl = NULL;
+ char *skey = NULL;
+
+ this = THIS;
+ priv = this->private;
+
+ set_dict = dict_new ();
+ if (!set_dict) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ quotad_xl = volgen_graph_add_as (graph, "features/quotad", "quotad");
+ if (!quotad_xl) {
+ ret = -1;
+ goto out;
+ }
+
+ list_for_each_entry (voliter, &priv->volumes, vol_list) {
+ if (voliter->status != GLUSTERD_STATUS_STARTED)
+ continue;
+
+ if (1 != glusterd_is_volume_quota_enabled (voliter))
+ continue;
+
+ ret = dict_set_uint32 (set_dict, "trusted-client",
+ GF_CLIENT_TRUSTED);
+ if (ret)
+ goto out;
+
+ dict_copy (voliter->dict, set_dict);
+ if (mod_dict)
+ dict_copy (mod_dict, set_dict);
+
+ ret = gf_asprintf(&skey, "%s.volume-id", voliter->volname);
+ if (ret == -1) {
+ gf_log("", GF_LOG_ERROR, "Out of memory");
+ goto out;
+ }
+ ret = xlator_set_option(quotad_xl, skey, voliter->volname);
+ GF_FREE(skey);
+ if (ret)
+ goto out;
+ memset (&cgraph, 0, sizeof (cgraph));
+ ret = volgen_graph_build_clients (&cgraph, voliter, set_dict,
+ NULL);
+ if (ret)
+ goto out;
+
+ ret = volume_volgen_graph_build_clusters (&cgraph, voliter,
+ _gf_true);
+ if (ret) {
+ ret = -1;
+ goto out;
+ }
+
+ if (mod_dict) {
+ dict_copy (mod_dict, set_dict);
+ ret = volgen_graph_set_options_generic (&cgraph, set_dict,
+ voliter,
+ basic_option_handler);
+ } else {
+ ret = volgen_graph_set_options_generic (&cgraph,
+ voliter->dict,
+ voliter,
+ basic_option_handler);
+ }
+ if (ret)
+ goto out;
+
+ ret = volgen_graph_merge_sub (graph, &cgraph, 1);
+ if (ret)
+ goto out;
+
+ ret = dict_reset (set_dict);
+ if (ret)
+ goto out;
+ }
+
+out:
+ if (set_dict)
+ dict_unref (set_dict);
+ return ret;
+}
static void
get_vol_tstamp_file (char *filename, glusterd_volinfo_t *volinfo)
@@ -3166,6 +3864,10 @@ generate_brick_volfiles (glusterd_volinfo_t *volinfo)
}
}
+ if (glusterd_volinfo_get_boolean(volinfo,"cluster.nsr") > 0) {
+ assign_groups(volinfo);
+ }
+
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
gf_log ("", GF_LOG_DEBUG,
"Found a brick - %s:%s", brickinfo->hostname,
@@ -3217,7 +3919,7 @@ enumerate_transport_reqs (gf_transport_type type, char **types)
}
}
-static int
+int
generate_client_volfiles (glusterd_volinfo_t *volinfo,
glusterd_client_type_t client_type)
{
@@ -3262,6 +3964,8 @@ generate_client_volfiles (glusterd_volinfo_t *volinfo,
out:
if (dict)
dict_unref (dict);
+
+ gf_log ("", GF_LOG_TRACE, "Returning %d", ret);
return ret;
}
@@ -3397,7 +4101,6 @@ glusterd_check_nfs_volfile_identical (gf_boolean_t *identical)
GF_ASSERT (this);
GF_ASSERT (identical);
-
conf = this->private;
glusterd_get_nodesvc_volfile ("nfs", conf->workdir,
@@ -3435,6 +4138,67 @@ out:
}
int
+glusterd_check_nfs_topology_identical (gf_boolean_t *identical)
+{
+ char nfsvol[PATH_MAX] = {0,};
+ char tmpnfsvol[PATH_MAX] = {0,};
+ glusterd_conf_t *conf = NULL;
+ xlator_t *this = THIS;
+ int ret = -1;
+ int tmpclean = 0;
+ int tmpfd = -1;
+
+ if ((!identical) || (!this) || (!this->private))
+ goto out;
+
+ conf = (glusterd_conf_t *) this->private;
+
+ /* Fetch the original NFS volfile */
+ glusterd_get_nodesvc_volfile ("nfs", conf->workdir,
+ nfsvol, sizeof (nfsvol));
+
+ /* Create the temporary NFS volfile */
+ snprintf (tmpnfsvol, sizeof (tmpnfsvol), "/tmp/gnfs-XXXXXX");
+ tmpfd = mkstemp (tmpnfsvol);
+ if (tmpfd < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Unable to create temp file %s: (%s)",
+ tmpnfsvol, strerror (errno));
+ goto out;
+ }
+
+ tmpclean = 1; /* SET the flag to unlink() tmpfile */
+
+ ret = glusterd_create_global_volfile (build_nfs_graph,
+ tmpnfsvol, NULL);
+ if (ret)
+ goto out;
+
+ /* Compare the topology of volfiles */
+ ret = glusterd_check_topology_identical (nfsvol, tmpnfsvol,
+ identical);
+out:
+ if (tmpfd >= 0)
+ close (tmpfd);
+ if (tmpclean)
+ unlink (tmpnfsvol);
+ return ret;
+}
+
+int
+glusterd_create_quotad_volfile ()
+{
+ char filepath[PATH_MAX] = {0,};
+ glusterd_conf_t *conf = THIS->private;
+
+ glusterd_get_nodesvc_volfile ("quotad", conf->workdir,
+ filepath, sizeof (filepath));
+ return glusterd_create_global_volfile (build_quotad_graph,
+ filepath, NULL);
+}
+
+
+int
glusterd_delete_volfile (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t *brickinfo)
{
@@ -3491,6 +4255,9 @@ validate_nfsopts (glusterd_volinfo_t *volinfo,
char transport_type[16] = {0,};
char *tt = NULL;
char err_str[4096] = {0,};
+ xlator_t *this = THIS;
+
+ GF_ASSERT (this);
graph.errstr = op_errstr;
@@ -3501,7 +4268,7 @@ validate_nfsopts (glusterd_volinfo_t *volinfo,
snprintf (err_str, sizeof (err_str), "Changing nfs "
"transport type is allowed only for volumes "
"of transport type tcp,rdma");
- gf_log ("", GF_LOG_ERROR, "%s", err_str);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
*op_errstr = gf_strdup (err_str);
ret = -1;
goto out;
@@ -3515,6 +4282,12 @@ validate_nfsopts (glusterd_volinfo_t *volinfo,
}
}
+ ret = dict_set_str (val_dict, "volume-name", volinfo->volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set volume name");
+ goto out;
+ }
+
ret = build_nfs_graph (&graph, val_dict);
if (!ret)
ret = graph_reconf_validateopt (&graph.graph, op_errstr);
@@ -3522,60 +4295,12 @@ validate_nfsopts (glusterd_volinfo_t *volinfo,
volgen_graph_free (&graph);
out:
- gf_log ("glusterd", GF_LOG_DEBUG, "Returning %d", ret);
+ if (dict_get (val_dict, "volume-name"))
+ dict_del (val_dict, "volume-name");
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
-int
-validate_wb_eagerlock (glusterd_volinfo_t *volinfo, dict_t *val_dict,
- char **op_errstr)
-{
- int ret = -1;
- gf_boolean_t wb_val = _gf_false;
- gf_boolean_t el_val = _gf_false;
- char msg[2048] = {0};
- char *wb_key = NULL;
- char *el_key = NULL;
-
- wb_key = "performance.write-behind";
- el_key = "cluster.eager-lock";
- ret = dict_get_str_boolean (val_dict, wb_key, -1);
- if (ret < 0)
- goto check_eager_lock;
- wb_val = ret;
- ret = glusterd_volinfo_get_boolean (volinfo, el_key);
- if (ret < 0)
- goto out;
- el_val = ret;
- goto done;
-
-check_eager_lock:
- ret = dict_get_str_boolean (val_dict, el_key, -1);
- if (ret < 0) {
- ret = 0; //Keys of intereset to this fn are not present.
- goto out;
- }
- el_val = ret;
- ret = glusterd_volinfo_get_boolean (volinfo, wb_key);
- if (ret < 0)
- goto out;
- wb_val = ret;
- goto done;
-
-done:
- ret = 0;
- if (!wb_val && el_val) {
- ret = -1;
- snprintf (msg, sizeof (msg), "%s off and %s on is not "
- "valid configuration", wb_key, el_key);
- gf_log ("glusterd", GF_LOG_ERROR, "%s", msg);
- if (op_errstr)
- *op_errstr = gf_strdup (msg);
- goto out;
- }
-out:
- return ret;
-}
int
validate_clientopts (glusterd_volinfo_t *volinfo,
@@ -3601,7 +4326,7 @@ validate_clientopts (glusterd_volinfo_t *volinfo,
int
validate_brickopts (glusterd_volinfo_t *volinfo,
- char *brickinfo_path,
+ glusterd_brickinfo_t *brickinfo,
dict_t *val_dict,
char **op_errstr)
{
@@ -3612,7 +4337,7 @@ validate_brickopts (glusterd_volinfo_t *volinfo,
graph.errstr = op_errstr;
- ret = build_server_graph (&graph, volinfo, val_dict, brickinfo_path);
+ ret = build_server_graph (&graph, volinfo, val_dict, brickinfo);
if (!ret)
ret = graph_reconf_validateopt (&graph.graph, op_errstr);
@@ -3634,7 +4359,7 @@ glusterd_validate_brickreconf (glusterd_volinfo_t *volinfo,
gf_log ("", GF_LOG_DEBUG,
"Validating %s", brickinfo->hostname);
- ret = validate_brickopts (volinfo, brickinfo->path, val_dict,
+ ret = validate_brickopts (volinfo, brickinfo, val_dict,
op_errstr);
if (ret)
goto out;
@@ -3736,10 +4461,6 @@ glusterd_validate_reconfopts (glusterd_volinfo_t *volinfo, dict_t *val_dict,
goto out;
}
- ret = validate_wb_eagerlock (volinfo, val_dict, op_errstr);
- if (ret)
- goto out;
-
ret = validate_clientopts (volinfo, val_dict, op_errstr);
if (ret) {
gf_log ("", GF_LOG_DEBUG,
@@ -3765,3 +4486,112 @@ out:
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
+
+static struct volopt_map_entry *
+_gd_get_vmep (char *key) {
+ char *completion = NULL;
+ struct volopt_map_entry *vmep = NULL;
+ int ret = 0;
+
+ COMPLETE_OPTION ((char *)key, completion, ret);
+ for (vmep = glusterd_volopt_map; vmep->key; vmep++) {
+ if (strcmp (vmep->key, key) == 0)
+ return vmep;
+ }
+
+ return NULL;
+}
+
+uint32_t
+glusterd_get_op_version_for_key (char *key)
+{
+ struct volopt_map_entry *vmep = NULL;
+
+ GF_ASSERT (key);
+
+ vmep = _gd_get_vmep (key);
+ if (vmep)
+ return vmep->op_version;
+
+ return 0;
+}
+
+gf_boolean_t
+gd_is_client_option (char *key)
+{
+ struct volopt_map_entry *vmep = NULL;
+
+ GF_ASSERT (key);
+
+ vmep = _gd_get_vmep (key);
+ if (vmep && (vmep->flags & OPT_FLAG_CLIENT_OPT))
+ return _gf_true;
+
+ return _gf_false;
+}
+
+gf_boolean_t
+gd_is_xlator_option (char *key)
+{
+ struct volopt_map_entry *vmep = NULL;
+
+ GF_ASSERT (key);
+
+ vmep = _gd_get_vmep (key);
+ if (vmep && (vmep->flags & OPT_FLAG_XLATOR_OPT))
+ return _gf_true;
+
+ return _gf_false;
+}
+
+volume_option_type_t
+_gd_get_option_type (char *key)
+{
+ struct volopt_map_entry *vmep = NULL;
+ void *dl_handle = NULL;
+ volume_opt_list_t vol_opt_list = {{0},};
+ int ret = -1;
+ volume_option_t *opt = NULL;
+ char *xlopt_key = NULL;
+ volume_option_type_t opt_type = GF_OPTION_TYPE_MAX;
+
+ GF_ASSERT (key);
+
+ vmep = _gd_get_vmep (key);
+
+ if (vmep) {
+ INIT_LIST_HEAD (&vol_opt_list.list);
+ ret = xlator_volopt_dynload (vmep->voltype, &dl_handle,
+ &vol_opt_list);
+ if (ret)
+ goto out;
+
+ if (_get_xlator_opt_key_from_vme (vmep, &xlopt_key))
+ goto out;
+
+ opt = xlator_volume_option_get_list (&vol_opt_list, xlopt_key);
+ _free_xlator_opt_key (xlopt_key);
+
+ if (opt)
+ opt_type = opt->type;
+ }
+
+out:
+ if (dl_handle) {
+ dlclose (dl_handle);
+ dl_handle = NULL;
+ }
+
+ return opt_type;
+}
+
+gf_boolean_t
+gd_is_boolean_option (char *key)
+{
+ GF_ASSERT (key);
+
+ if (GF_OPTION_TYPE_BOOL == _gd_get_option_type (key))
+ return _gf_true;
+
+ return _gf_false;
+}
diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.h b/xlators/mgmt/glusterd/src/glusterd-volgen.h
index f8279f6c9..f4703c288 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volgen.h
+++ b/xlators/mgmt/glusterd/src/glusterd-volgen.h
@@ -22,9 +22,23 @@
#define VKEY_DIAG_CNT_FOP_HITS "diagnostics.count-fop-hits"
#define VKEY_DIAG_LAT_MEASUREMENT "diagnostics.latency-measurement"
#define VKEY_FEATURES_LIMIT_USAGE "features.limit-usage"
+#define VKEY_FEATURES_SOFT_LIMIT "features.soft-limit"
#define VKEY_MARKER_XTIME GEOREP".indexing"
+#define VKEY_MARKER_XTIME_FORCE GEOREP".ignore-pid-check"
+#define VKEY_CHANGELOG "changelog.changelog"
#define VKEY_FEATURES_QUOTA "features.quota"
-#define VKEY_PERF_STAT_PREFETCH "performance.stat-prefetch"
+
+#define AUTH_ALLOW_MAP_KEY "auth.allow"
+#define AUTH_REJECT_MAP_KEY "auth.reject"
+#define NFS_DISABLE_MAP_KEY "nfs.disable"
+#define AUTH_ALLOW_OPT_KEY "auth.addr.*.allow"
+#define AUTH_REJECT_OPT_KEY "auth.addr.*.reject"
+#define NFS_DISABLE_OPT_KEY "nfs.*.disable"
+
+// TBD - bring this from a common conf file
+#define NSR_MAX_REPLICA_GROUP_SIZE 8
+#define NSR_MAX_PATH_SIZE (1024 + PATH_MAX)
+#define NSR_CONF_PATH "/var/lib/glusterd/nsr/"
typedef enum {
GF_CLIENT_TRUSTED,
@@ -53,9 +67,54 @@ typedef enum {
typedef enum gd_volopt_flags_ {
OPT_FLAG_NONE,
- OPT_FLAG_FORCE = 1,
+ OPT_FLAG_FORCE = 0x01, // option needs force to be reset
+ OPT_FLAG_XLATOR_OPT = 0x02, // option enables/disables xlators
+ OPT_FLAG_CLIENT_OPT = 0x04, // option affects clients
} gd_volopt_flags_t;
+typedef enum {
+ GF_XLATOR_POSIX = 0,
+ GF_XLATOR_ACL,
+ GF_XLATOR_LOCKS,
+ GF_XLATOR_IOT,
+ GF_XLATOR_INDEX,
+ GF_XLATOR_MARKER,
+ GF_XLATOR_IO_STATS,
+ GF_XLATOR_BD,
+ GF_XLATOR_NONE,
+} glusterd_server_xlator_t;
+
+/* As of now debug xlators can be loaded only below fuse in the client
+ * graph via cli. More xlators can be added below when the cli option
+ * for adding debug xlators anywhere in the client graph has to be made
+ * available.
+ */
+typedef enum {
+ GF_CLNT_XLATOR_FUSE = 0,
+ GF_CLNT_XLATOR_NONE,
+} glusterd_client_xlator_t;
+
+typedef enum { DOC, NO_DOC, GLOBAL_DOC, GLOBAL_NO_DOC } option_type_t;
+
+typedef int (*vme_option_validation) (dict_t *dict, char *key, char *value,
+ char **op_errstr);
+
+struct volopt_map_entry {
+ char *key;
+ char *voltype;
+ char *option;
+ char *value;
+ option_type_t type;
+ uint32_t flags;
+ uint32_t op_version;
+ char *description;
+ vme_option_validation validate_fn;
+ /* If client_option is true, the option affects clients.
+ * this is used to calculate client-op-version of volumes
+ */
+ //gf_boolean_t client_option;
+};
+
int glusterd_create_rb_volfiles (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t *brickinfo);
@@ -67,9 +126,14 @@ void glusterd_get_shd_filepath (char *filename);
int glusterd_create_nfs_volfile ();
int glusterd_create_shd_volfile ();
+int glusterd_create_quotad_volfile ();
int glusterd_delete_volfile (glusterd_volinfo_t *volinfo,
glusterd_brickinfo_t *brickinfo);
+int
+glusterd_delete_snap_volfile (glusterd_volinfo_t *volinfo,
+ glusterd_volinfo_t *snap_volinfo,
+ glusterd_brickinfo_t *brickinfo);
int glusterd_volinfo_get (glusterd_volinfo_t *volinfo, char *key, char **value);
int glusterd_volinfo_get_boolean (glusterd_volinfo_t *volinfo, char *key);
@@ -83,9 +147,33 @@ glusterd_check_voloption_flags (char *key, int32_t flags);
gf_boolean_t
glusterd_is_valid_volfpath (char *volname, char *brick);
int generate_brick_volfiles (glusterd_volinfo_t *volinfo);
+int generate_snap_brick_volfiles (glusterd_volinfo_t *volinfo,
+ glusterd_volinfo_t *snap_volinfo);
+int generate_client_volfiles (glusterd_volinfo_t *volinfo,
+ glusterd_client_type_t client_type);
+int
+generate_snap_client_volfiles (glusterd_volinfo_t *actual_volinfo,
+ glusterd_volinfo_t *snap_volinfo,
+ glusterd_client_type_t client_type,
+ gf_boolean_t vol_restore);
int glusterd_get_volopt_content (dict_t *dict, gf_boolean_t xml_out);
char*
glusterd_get_trans_type_rb (gf_transport_type ttype);
int
glusterd_check_nfs_volfile_identical (gf_boolean_t *identical);
+int
+glusterd_check_nfs_topology_identical (gf_boolean_t *identical);
+
+uint32_t
+glusterd_get_op_version_for_key (char *key);
+
+gf_boolean_t
+gd_is_client_option (char *key);
+
+gf_boolean_t
+gd_is_xlator_option (char *key);
+
+gf_boolean_t
+gd_is_boolean_option (char *key);
+
#endif
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
index d42694353..504aeb839 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
@@ -12,6 +12,10 @@
#include "config.h"
#endif
+#ifdef HAVE_BD_XLATOR
+#include <lvm2app.h>
+#endif
+
#include "common-utils.h"
#include "syscall.h"
#include "cli1-xdr.h"
@@ -26,20 +30,18 @@
#define glusterd_op_start_volume_args_get(dict, volname, flags) \
glusterd_op_stop_volume_args_get (dict, volname, flags)
+extern int
+_get_slave_status (dict_t *this, char *key, data_t *value, void *data);
+
int
-glusterd_handle_create_volume (rpcsvc_request_t *req)
+__glusterd_handle_create_volume (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
dict_t *dict = NULL;
- glusterd_brickinfo_t *brickinfo = NULL;
- char *brick = NULL;
char *bricks = NULL;
char *volname = NULL;
int brick_count = 0;
- char *tmpptr = NULL;
- int i = 0;
- char *brick_list = NULL;
void *cli_rsp = NULL;
char err_str[2048] = {0,};
gf_cli_rsp rsp = {0,};
@@ -48,28 +50,26 @@ glusterd_handle_create_volume (rpcsvc_request_t *req)
char *trans_type = NULL;
uuid_t volume_id = {0,};
uuid_t tmp_uuid = {0};
- glusterd_volinfo_t tmpvolinfo = {{0},};
int32_t type = 0;
char *username = NULL;
char *password = NULL;
GF_ASSERT (req);
- INIT_LIST_HEAD (&tmpvolinfo.bricks);
-
this = THIS;
GF_ASSERT(this);
ret = -1;
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
- //failed to decode msg;
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
req->rpc_err = GARBAGE_ARGS;
- snprintf (err_str, sizeof (err_str), "Garbage args received");
+ snprintf (err_str, sizeof (err_str), "Failed to decode request "
+ "received from cli");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
- gf_log ("glusterd", GF_LOG_INFO, "Received create volume req");
+ gf_log (this->name, GF_LOG_DEBUG, "Received create volume req");
if (cli_req.dict.dict_len) {
/* Unserialize the dictionary */
@@ -79,11 +79,11 @@ glusterd_handle_create_volume (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log ("glusterd", GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to "
"unserialize req-buffer to dictionary");
snprintf (err_str, sizeof (err_str), "Unable to decode "
- "the buffer");
+ "the command");
goto out;
} else {
dict->extra_stdfree = cli_req.dict.dict_val;
@@ -93,46 +93,52 @@ glusterd_handle_create_volume (rpcsvc_request_t *req)
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
snprintf (err_str, sizeof (err_str), "Unable to get volume "
"name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
if ((ret = glusterd_check_volume_exists (volname))) {
- snprintf(err_str, 2048, "Volume %s already exists", volname);
- gf_log ("glusterd", GF_LOG_ERROR, "%s", err_str);
+ snprintf (err_str, sizeof (err_str), "Volume %s already exists",
+ volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
ret = dict_get_int32 (dict, "count", &brick_count);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get count");
- snprintf (err_str, sizeof (err_str), "Unable to get volume "
- "brick count");
+ snprintf (err_str, sizeof (err_str), "Unable to get brick count"
+ " for volume %s", volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
ret = dict_get_int32 (dict, "type", &type);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get type");
- snprintf (err_str, sizeof (err_str), "Unable to get volume "
- "type");
+ snprintf (err_str, sizeof (err_str), "Unable to get type of "
+ "volume %s", volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
ret = dict_get_str (dict, "transport", &trans_type);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get transport-type");
- snprintf (err_str, sizeof (err_str), "Unable to get volume "
- "transport-type");
+ snprintf (err_str, sizeof (err_str), "Unable to get "
+ "transport-type of volume %s", volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
ret = dict_get_str (dict, "bricks", &bricks);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get bricks");
- snprintf (err_str, sizeof (err_str), "Unable to get volume "
- "bricks");
+ snprintf (err_str, sizeof (err_str), "Unable to get bricks for "
+ "volume %s", volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
+ goto out;
+ }
+
+ if (!dict_get (dict, "force")) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get 'force' flag");
goto out;
}
@@ -140,99 +146,82 @@ glusterd_handle_create_volume (rpcsvc_request_t *req)
free_ptr = gf_strdup (uuid_utoa (volume_id));
ret = dict_set_dynstr (dict, "volume-id", free_ptr);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "unable to set volume-id");
snprintf (err_str, sizeof (err_str), "Unable to set volume "
- "id");
+ "id of volume %s", volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
free_ptr = NULL;
- if (bricks) {
- brick_list = gf_strdup (bricks);
- free_ptr = brick_list;
- }
-
- while ( i < brick_count) {
- i++;
- brick= strtok_r (brick_list, " \n", &tmpptr);
- brick_list = tmpptr;
- ret = glusterd_brickinfo_new_from_brick (brick, &brickinfo);
- if (ret) {
- snprintf (err_str, sizeof (err_str), "Unable to get "
- "brick info from brick %s", brick);
- goto out;
- }
-
- ret = glusterd_new_brick_validate (brick, brickinfo, err_str,
- sizeof (err_str));
- if (ret)
- goto out;
-
- list_add_tail (&brickinfo->brick_list, &tmpvolinfo.bricks);
- brickinfo = NULL;
- }
-
/* generate internal username and password */
uuid_generate (tmp_uuid);
username = gf_strdup (uuid_utoa (tmp_uuid));
ret = dict_set_dynstr (dict, "internal-username", username);
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set username for "
+ "volume %s", volname);
goto out;
+ }
uuid_generate (tmp_uuid);
password = gf_strdup (uuid_utoa (tmp_uuid));
ret = dict_set_dynstr (dict, "internal-password", password);
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set password for "
+ "volume %s", volname);
goto out;
+ }
- ret = glusterd_op_begin (req, GD_OP_CREATE_VOLUME, dict);
+ ret = glusterd_op_begin_synctask (req, GD_OP_CREATE_VOLUME, dict);
out:
if (ret) {
rsp.op_ret = -1;
rsp.op_errno = 0;
if (err_str[0] == '\0')
- snprintf (err_str, sizeof (err_str), "Operation failed");
+ snprintf (err_str, sizeof (err_str),
+ "Operation failed");
rsp.op_errstr = err_str;
cli_rsp = &rsp;
glusterd_to_cli (req, cli_rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_cli_rsp, dict);
- if (dict)
- dict_unref (dict);
-
ret = 0; //Client response sent, prevent second response
}
GF_FREE(free_ptr);
- glusterd_volume_brickinfos_delete (&tmpvolinfo);
- if (brickinfo)
- glusterd_brickinfo_delete (brickinfo);
-
- glusterd_friend_sm ();
- glusterd_op_sm ();
-
return ret;
}
int
-glusterd_handle_cli_start_volume (rpcsvc_request_t *req)
+glusterd_handle_create_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_create_volume);
+}
+
+int
+__glusterd_handle_cli_start_volume (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
char *volname = NULL;
dict_t *dict = NULL;
glusterd_op_t cli_op = GD_OP_START_VOLUME;
+ char errstr[2048] = {0,};
xlator_t *this = NULL;
this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
- //failed to decode msg;
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
+ snprintf (errstr, sizeof (errstr), "Failed to decode message "
+ "received from cli");
req->rpc_err = GARBAGE_ARGS;
+ gf_log (this->name, sizeof (errstr), "%s", errstr);
goto out;
}
@@ -244,20 +233,23 @@ glusterd_handle_cli_start_volume (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to "
"unserialize req-buffer to dictionary");
+ snprintf (errstr, sizeof (errstr), "Unable to decode "
+ "the command");
goto out;
}
}
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "dict get failed");
+ snprintf (errstr, sizeof (errstr), "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", errstr);
goto out;
}
- gf_log (this->name, GF_LOG_INFO, "Received start vol req"
+ gf_log (this->name, GF_LOG_DEBUG, "Received start vol req"
" for volume %s", volname);
ret = glusterd_op_begin_synctask (req, GD_OP_START_VOLUME, dict);
@@ -266,18 +258,24 @@ out:
free (cli_req.dict.dict_val); //its malloced by xdr
if (ret) {
+ if(errstr[0] == '\0')
+ snprintf (errstr, sizeof (errstr), "Operation failed");
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
- dict, "operation failed");
- if (dict)
- dict_unref (dict);
+ dict, errstr);
}
return ret;
}
+int
+glusterd_handle_cli_start_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cli_start_volume);
+}
int
-glusterd_handle_cli_stop_volume (rpcsvc_request_t *req)
+__glusterd_handle_cli_stop_volume (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
@@ -285,14 +283,18 @@ glusterd_handle_cli_stop_volume (rpcsvc_request_t *req)
dict_t *dict = NULL;
glusterd_op_t cli_op = GD_OP_STOP_VOLUME;
xlator_t *this = NULL;
+ char err_str[2048] = {0,};
this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
- //failed to decode msg;
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
+ snprintf (err_str, sizeof (err_str), "Failed to decode message "
+ "received from cli");
req->rpc_err = GARBAGE_ARGS;
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
if (cli_req.dict.dict_len) {
@@ -306,6 +308,8 @@ glusterd_handle_cli_stop_volume (rpcsvc_request_t *req)
gf_log (this->name, GF_LOG_ERROR,
"failed to "
"unserialize req-buffer to dictionary");
+ snprintf (err_str, sizeof (err_str), "Unable to decode "
+ "the command");
goto out;
}
}
@@ -313,45 +317,59 @@ glusterd_handle_cli_stop_volume (rpcsvc_request_t *req)
ret = dict_get_str (dict, "volname", &dup_volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
+ snprintf (err_str, sizeof (err_str), "Failed to get volume "
+ "name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
- gf_log (this->name, GF_LOG_INFO, "Received stop vol req "
+ gf_log (this->name, GF_LOG_DEBUG, "Received stop vol req "
"for volume %s", dup_volname);
- ret = glusterd_op_begin (req, GD_OP_STOP_VOLUME, dict);
+ ret = glusterd_op_begin_synctask (req, GD_OP_STOP_VOLUME, dict);
out:
free (cli_req.dict.dict_val); //its malloced by xdr
- glusterd_friend_sm ();
- glusterd_op_sm ();
-
if (ret) {
+ if (err_str[0] == '\0')
+ snprintf (err_str, sizeof (err_str),
+ "Operation failed");
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
- dict, "operation failed");
- if (dict)
- dict_unref (dict);
+ dict, err_str);
}
return ret;
}
int
-glusterd_handle_cli_delete_volume (rpcsvc_request_t *req)
+glusterd_handle_cli_stop_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cli_stop_volume);
+}
+
+int
+__glusterd_handle_cli_delete_volume (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,},};
glusterd_op_t cli_op = GD_OP_DELETE_VOLUME;
dict_t *dict = NULL;
char *volname = NULL;
+ char err_str[2048]= {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
- //failed to decode msg;
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
+ snprintf (err_str, sizeof (err_str), "Failed to decode request "
+ "received from cli");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
req->rpc_err = GARBAGE_ARGS;
goto out;
}
@@ -364,44 +382,52 @@ glusterd_handle_cli_delete_volume (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to "
"unserialize req-buffer to dictionary");
+ snprintf (err_str, sizeof (err_str), "Unable to decode "
+ "the command");
goto out;
}
}
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
- "Failed to get volname");
+ snprintf (err_str, sizeof (err_str), "Failed to get volume "
+ "name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
req->rpc_err = GARBAGE_ARGS;
goto out;
}
- gf_log ("glusterd", GF_LOG_INFO, "Received delete vol req"
+ gf_log (this->name, GF_LOG_DEBUG, "Received delete vol req"
"for volume %s", volname);
- ret = glusterd_op_begin (req, GD_OP_DELETE_VOLUME, dict);
+ ret = glusterd_op_begin_synctask (req, GD_OP_DELETE_VOLUME, dict);
out:
free (cli_req.dict.dict_val); //its malloced by xdr
- glusterd_friend_sm ();
- glusterd_op_sm ();
-
if (ret) {
+ if (err_str[0] == '\0')
+ snprintf (err_str, sizeof (err_str),
+ "Operation failed");
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
- dict, "operation failed");
- if (dict)
- dict_unref (dict);
+ dict, err_str);
}
return ret;
}
int
-glusterd_handle_cli_heal_volume (rpcsvc_request_t *req)
+glusterd_handle_cli_delete_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cli_delete_volume);
+}
+
+int
+__glusterd_handle_cli_heal_volume (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
@@ -410,18 +436,19 @@ glusterd_handle_cli_heal_volume (rpcsvc_request_t *req)
char *volname = NULL;
glusterd_volinfo_t *volinfo = NULL;
xlator_t *this = NULL;
- char *op_errstr = NULL;
+ char op_errstr[2048] = {0,};
GF_ASSERT (req);
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto out;
}
this = THIS;
+ GF_ASSERT (this);
if (cli_req.dict.dict_len) {
/* Unserialize the dictionary */
@@ -434,6 +461,8 @@ glusterd_handle_cli_heal_volume (rpcsvc_request_t *req)
gf_log (this->name, GF_LOG_ERROR,
"failed to "
"unserialize req-buffer to dictionary");
+ snprintf (op_errstr, sizeof (op_errstr),
+ "Unable to decode the command");
goto out;
} else {
dict->extra_stdfree = cli_req.dict.dict_val;
@@ -442,8 +471,9 @@ glusterd_handle_cli_heal_volume (rpcsvc_request_t *req)
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "failed to get volname");
- gf_asprintf (&op_errstr, "Unable to find volume name");
+ snprintf (op_errstr, sizeof (op_errstr), "Unable to find "
+ "volume name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", op_errstr);
goto out;
}
@@ -452,7 +482,9 @@ glusterd_handle_cli_heal_volume (rpcsvc_request_t *req)
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- gf_asprintf (&op_errstr, "Volume %s does not exist", volname);
+ snprintf (op_errstr, sizeof (op_errstr),
+ "Volume %s does not exist", volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", op_errstr);
goto out;
}
@@ -464,27 +496,29 @@ glusterd_handle_cli_heal_volume (rpcsvc_request_t *req)
if (ret)
goto out;
- ret = glusterd_op_begin (req, GD_OP_HEAL_VOLUME, dict);
+ ret = glusterd_op_begin_synctask (req, GD_OP_HEAL_VOLUME, dict);
out:
- glusterd_friend_sm ();
- glusterd_op_sm ();
-
if (ret) {
- if (!op_errstr)
- op_errstr = gf_strdup ("operation failed");
+ if (op_errstr[0] == '\0')
+ snprintf (op_errstr, sizeof (op_errstr),
+ "operation failed");
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
dict, op_errstr);
- if (dict)
- dict_unref (dict);
- GF_FREE (op_errstr);
}
return ret;
}
int
-glusterd_handle_cli_statedump_volume (rpcsvc_request_t *req)
+glusterd_handle_cli_heal_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cli_heal_volume);
+}
+
+int
+__glusterd_handle_cli_statedump_volume (rpcsvc_request_t *req)
{
int32_t ret = -1;
gf_cli_req cli_req = {{0,}};
@@ -493,12 +527,20 @@ glusterd_handle_cli_statedump_volume (rpcsvc_request_t *req)
dict_t *dict = NULL;
int32_t option_cnt = 0;
glusterd_op_t cli_op = GD_OP_STATEDUMP_VOLUME;
+ char err_str[2048] = {0,};
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
GF_ASSERT (req);
ret = -1;
- if (!xdr_to_generic (req->msg[0], &cli_req,
- (xdrproc_t)xdr_gf_cli_req)) {
+ ret = xdr_to_generic (req->msg[0], &cli_req, (xdrproc_t)xdr_gf_cli_req);
+ if (ret < 0) {
req->rpc_err = GARBAGE_ARGS;
goto out;
}
@@ -510,50 +552,166 @@ glusterd_handle_cli_statedump_volume (rpcsvc_request_t *req)
cli_req.dict.dict_len,
&dict);
if (ret < 0) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to "
"unserialize req-buffer to dictionary");
+ snprintf (err_str, sizeof (err_str), "Unable to "
+ "decode the command");
goto out;
}
}
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "failed to get volname");
+ snprintf (err_str, sizeof (err_str), "Unable to get the volume "
+ "name");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
ret = dict_get_str (dict, "options", &options);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "failed to get options");
+ snprintf (err_str, sizeof (err_str), "Unable to get options");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
ret = dict_get_int32 (dict, "option_cnt", &option_cnt);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR, "failed to get option cnt");
+ snprintf (err_str , sizeof (err_str), "Unable to get option "
+ "count");
+ gf_log (this->name, GF_LOG_ERROR, "%s", err_str);
goto out;
}
+ if (priv->op_version == GD_OP_VERSION_MIN &&
+ strstr (options, "quotad")) {
+ snprintf (err_str, sizeof (err_str), "The cluster is operating "
+ "at op-version 1. Taking quotad's statedump is "
+ "disallowed in this state");
+ ret = -1;
+ goto out;
+ }
- gf_log ("glusterd", GF_LOG_INFO, "Received statedump request for "
+ gf_log (this->name, GF_LOG_INFO, "Received statedump request for "
"volume %s with options %s", volname, options);
- ret = glusterd_op_begin (req, GD_OP_STATEDUMP_VOLUME, dict);
+ ret = glusterd_op_begin_synctask (req, GD_OP_STATEDUMP_VOLUME, dict);
out:
if (ret) {
+ if (err_str[0] == '\0')
+ snprintf (err_str, sizeof (err_str),
+ "Operation failed");
ret = glusterd_op_send_cli_response (cli_op, ret, 0, req,
- dict, "Operation failed");
- if (dict)
- dict_unref (dict);
+ dict, err_str);
}
free (cli_req.dict.dict_val);
- glusterd_friend_sm ();
- glusterd_op_sm();
return ret;
}
+int
+glusterd_handle_cli_statedump_volume (rpcsvc_request_t *req)
+{
+ return glusterd_big_locked_handler (req,
+ __glusterd_handle_cli_statedump_volume);
+}
+
+#ifdef HAVE_BD_XLATOR
+/*
+ * Validates if given VG in the brick exists or not. Also checks if VG has
+ * GF_XATTR_VOL_ID_KEY tag set to avoid using same VG for multiple bricks.
+ * Tag is checked only during glusterd_op_stage_create_volume. Tag is set during
+ * glusterd_validate_and_create_brickpath().
+ * @brick - brick info, @check_tag - check for VG tag or not
+ * @msg - Error message to return to caller
+ */
+int
+glusterd_is_valid_vg (glusterd_brickinfo_t *brick, int check_tag, char *msg)
+{
+ lvm_t handle = NULL;
+ vg_t vg = NULL;
+ char *vg_name = NULL;
+ int retval = 0;
+ char *p = NULL;
+ char *ptr = NULL;
+ struct dm_list *dm_lvlist = NULL;
+ struct dm_list *dm_seglist = NULL;
+ struct lvm_lv_list *lv_list = NULL;
+ struct lvm_property_value prop = {0, };
+ struct lvm_lvseg_list *seglist = NULL;
+ struct dm_list *taglist = NULL;
+ struct lvm_str_list *strl = NULL;
+
+ handle = lvm_init (NULL);
+ if (!handle) {
+ sprintf (msg, "lvm_init failed, could not validate vg");
+ return -1;
+ }
+ if (*brick->vg == '\0') { /* BD xlator has vg in brick->path */
+ p = gf_strdup (brick->path);
+ vg_name = strtok_r (p, "/", &ptr);
+ } else
+ vg_name = brick->vg;
+
+ vg = lvm_vg_open (handle, vg_name, "r", 0);
+ if (!vg) {
+ sprintf (msg, "no such vg: %s", vg_name);
+ retval = -1;
+ goto out;
+ }
+ if (!check_tag)
+ goto next;
+
+ taglist = lvm_vg_get_tags (vg);
+ if (!taglist)
+ goto next;
+
+ dm_list_iterate_items (strl, taglist) {
+ if (!strncmp(strl->str, GF_XATTR_VOL_ID_KEY,
+ strlen (GF_XATTR_VOL_ID_KEY))) {
+ sprintf (msg, "VG %s is already part of"
+ " a brick", vg_name);
+ retval = -1;
+ goto out;
+ }
+ }
+next:
+
+ brick->caps = CAPS_BD | CAPS_OFFLOAD_COPY | CAPS_OFFLOAD_SNAPSHOT;
+
+ dm_lvlist = lvm_vg_list_lvs (vg);
+ if (!dm_lvlist)
+ goto out;
+
+ dm_list_iterate_items (lv_list, dm_lvlist) {
+ dm_seglist = lvm_lv_list_lvsegs (lv_list->lv);
+ dm_list_iterate_items (seglist, dm_seglist) {
+ prop = lvm_lvseg_get_property (seglist->lvseg,
+ "segtype");
+ if (!prop.is_valid || !prop.value.string)
+ continue;
+ if (!strcmp (prop.value.string, "thin-pool")) {
+ brick->caps |= CAPS_THIN;
+ gf_log (THIS->name, GF_LOG_INFO, "Thin Pool "
+ "\"%s\" will be used for thin LVs",
+ lvm_lv_get_name (lv_list->lv));
+ break;
+ }
+ }
+ }
+
+ retval = 0;
+out:
+ if (vg)
+ lvm_vg_close (vg);
+ lvm_quit (handle);
+ if (p)
+ GF_FREE (p);
+ return retval;
+}
+#endif
+
/* op-sm */
int
glusterd_op_stage_create_volume (dict_t *dict, char **op_errstr)
@@ -569,73 +727,68 @@ glusterd_op_stage_create_volume (dict_t *dict, char **op_errstr)
int32_t i = 0;
char *brick = NULL;
char *tmpptr = NULL;
- char cmd_str[1024];
xlator_t *this = NULL;
glusterd_conf_t *priv = NULL;
char msg[2048] = {0};
uuid_t volume_uuid;
char *volume_uuid_str;
+ gf_boolean_t is_force = _gf_false;
this = THIS;
- if (!this) {
- gf_log ("glusterd", GF_LOG_ERROR,
- "this is NULL");
- goto out;
- }
-
+ GF_ASSERT (this);
priv = this->private;
- if (!priv) {
- gf_log ("glusterd", GF_LOG_ERROR,
- "priv is NULL");
- goto out;
- }
+ GF_ASSERT (priv);
ret = dict_get_str (dict, "volname", &volname);
-
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
exists = glusterd_check_volume_exists (volname);
-
if (exists) {
snprintf (msg, sizeof (msg), "Volume %s already exists",
volname);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
ret = -1;
goto out;
} else {
ret = 0;
}
+
ret = dict_get_int32 (dict, "count", &brick_count);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get count");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get brick count "
+ "for volume %s", volname);
goto out;
}
+
ret = dict_get_str (dict, "volume-id", &volume_uuid_str);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume id");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume id of "
+ "volume %s", volname);
goto out;
}
+
ret = uuid_parse (volume_uuid_str, volume_uuid);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to parse volume id");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to parse volume id of"
+ " volume %s", volname);
goto out;
}
ret = dict_get_str (dict, "bricks", &bricks);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get bricks");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get bricks for "
+ "volume %s", volname);
goto out;
}
+ is_force = dict_get_str_boolean (dict, "force", _gf_false);
+
if (bricks) {
brick_list = gf_strdup (bricks);
if (!brick_list) {
ret = -1;
- gf_log ("", GF_LOG_ERROR, "Out of memory");
goto out;
} else {
free_ptr = brick_list;
@@ -651,9 +804,6 @@ glusterd_op_stage_create_volume (dict_t *dict, char **op_errstr)
!glusterd_is_valid_volfpath (volname, brick)) {
snprintf (msg, sizeof (msg), "brick path %s is too "
"long.", brick);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
-
ret = -1;
goto out;
}
@@ -661,20 +811,30 @@ glusterd_op_stage_create_volume (dict_t *dict, char **op_errstr)
ret = glusterd_brickinfo_new_from_brick (brick, &brick_info);
if (ret)
goto out;
- snprintf (cmd_str, 1024, "%s", brick_info->path);
+
+ ret = glusterd_new_brick_validate (brick, brick_info, msg,
+ sizeof (msg));
+ if (ret)
+ goto out;
+
ret = glusterd_resolve_brick (brick_info);
if (ret) {
- gf_log ("glusterd", GF_LOG_ERROR, "cannot resolve "
- "brick: %s:%s", brick_info->hostname,
- brick_info->path);
+ gf_log (this->name, GF_LOG_ERROR, FMTSTR_RESOLVE_BRICK,
+ brick_info->hostname, brick_info->path);
goto out;
}
if (!uuid_compare (brick_info->uuid, MY_UUID)) {
- ret = glusterd_brick_create_path (brick_info->hostname,
- brick_info->path,
- volume_uuid,
- op_errstr);
+#ifdef HAVE_BD_XLATOR
+ if (brick_info->vg[0]) {
+ ret = glusterd_is_valid_vg (brick_info, 1, msg);
+ if (ret)
+ goto out;
+ }
+#endif
+ ret = glusterd_validate_and_create_brickpath (brick_info,
+ volume_uuid, op_errstr,
+ is_force);
if (ret)
goto out;
brick_list = tmpptr;
@@ -686,7 +846,12 @@ out:
GF_FREE (free_ptr);
if (brick_info)
glusterd_brickinfo_delete (brick_info);
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+
+ if (msg[0] != '\0') {
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+ *op_errstr = gf_strdup (msg);
+ }
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -695,19 +860,23 @@ int
glusterd_op_stop_volume_args_get (dict_t *dict, char** volname, int *flags)
{
int ret = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
if (!dict || !volname || !flags)
goto out;
ret = dict_get_str (dict, "volname", volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
ret = dict_get_int32 (dict, "flags", flags);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get flags");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get flags");
goto out;
}
out:
@@ -754,16 +923,18 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr)
gf_boolean_t exists = _gf_false;
glusterd_volinfo_t *volinfo = NULL;
glusterd_brickinfo_t *brickinfo = NULL;
- char msg[2048];
+ char msg[2048] = {0,};
glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+ uuid_t volume_id = {0,};
+ char volid[50] = {0,};
+ char xattr_volid[50] = {0,};
+ int caps = 0;
- priv = THIS->private;
- if (!priv) {
- gf_log ("glusterd", GF_LOG_ERROR,
- "priv is NULL");
- ret = -1;
- goto out;
- }
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
ret = glusterd_op_start_volume_args_get (dict, &volname, &flags);
if (ret)
@@ -772,48 +943,110 @@ glusterd_op_stage_start_volume (dict_t *dict, char **op_errstr)
exists = glusterd_check_volume_exists (volname);
if (!exists) {
- snprintf (msg, sizeof (msg), "Volume %s does not exist", volname);
- gf_log ("", GF_LOG_ERROR, "%s",
- msg);
- *op_errstr = gf_strdup (msg);
+ snprintf (msg, sizeof (msg), FMTSTR_CHECK_VOL_EXISTS, volname);
ret = -1;
- } else {
- ret = 0;
+ goto out;
}
ret = glusterd_volinfo_find (volname, &volinfo);
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, FMTSTR_CHECK_VOL_EXISTS,
+ volname);
goto out;
+ }
ret = glusterd_validate_volume_id (dict, volinfo);
if (ret)
goto out;
+ if (!(flags & GF_CLI_FLAG_OP_FORCE)) {
+ if (glusterd_is_volume_started (volinfo)) {
+ snprintf (msg, sizeof (msg), "Volume %s already "
+ "started", volname);
+ ret = -1;
+ goto out;
+ }
+ }
+
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
ret = glusterd_resolve_brick (brickinfo);
if (ret) {
- gf_log ("", GF_LOG_ERROR,
- "Unable to resolve brick %s:%s",
+ gf_log (this->name, GF_LOG_ERROR, FMTSTR_RESOLVE_BRICK,
brickinfo->hostname, brickinfo->path);
goto out;
}
- if (!(flags & GF_CLI_FLAG_OP_FORCE)) {
- if (glusterd_is_volume_started (volinfo)) {
- snprintf (msg, sizeof (msg), "Volume %s already"
- " started", volname);
- gf_log ("glusterd", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
- ret = -1;
+ if (uuid_compare (brickinfo->uuid, MY_UUID))
+ continue;
+
+ ret = gf_lstat_dir (brickinfo->path, NULL);
+ if (ret && (flags & GF_CLI_FLAG_OP_FORCE)) {
+ continue;
+ } else if (ret) {
+ snprintf (msg, sizeof (msg), "Failed to find "
+ "brick directory %s for volume %s. "
+ "Reason : %s", brickinfo->path,
+ volname, strerror (errno));
+ goto out;
+ }
+ ret = sys_lgetxattr (brickinfo->path, GF_XATTR_VOL_ID_KEY,
+ volume_id, 16);
+ if (ret < 0 && (!(flags & GF_CLI_FLAG_OP_FORCE))) {
+ snprintf (msg, sizeof (msg), "Failed to get "
+ "extended attribute %s for brick dir %s. "
+ "Reason : %s", GF_XATTR_VOL_ID_KEY,
+ brickinfo->path, strerror (errno));
+ ret = -1;
+ goto out;
+ } else if (ret < 0) {
+ ret = sys_lsetxattr (brickinfo->path,
+ GF_XATTR_VOL_ID_KEY,
+ volinfo->volume_id, 16,
+ XATTR_CREATE);
+ if (ret) {
+ snprintf (msg, sizeof (msg), "Failed to set "
+ "extended attribute %s on %s. Reason: "
+ "%s", GF_XATTR_VOL_ID_KEY,
+ brickinfo->path, strerror (errno));
goto out;
+ } else {
+ continue;
}
}
+ if (uuid_compare (volinfo->volume_id, volume_id)) {
+ snprintf (msg, sizeof (msg), "Volume id mismatch for "
+ "brick %s:%s. Expected volume id %s, "
+ "volume id %s found", brickinfo->hostname,
+ brickinfo->path,
+ uuid_utoa_r (volinfo->volume_id, volid),
+ uuid_utoa_r (volume_id, xattr_volid));
+ ret = -1;
+ goto out;
+ }
+#ifdef HAVE_BD_XLATOR
+ if (brickinfo->vg[0])
+ caps = CAPS_BD | CAPS_THIN |
+ CAPS_OFFLOAD_COPY | CAPS_OFFLOAD_SNAPSHOT;
+ /* Check for VG/thin pool if its BD volume */
+ if (brickinfo->vg[0]) {
+ ret = glusterd_is_valid_vg (brickinfo, 0, msg);
+ if (ret)
+ goto out;
+ /* if anyone of the brick does not have thin support,
+ disable it for entire volume */
+ caps &= brickinfo->caps;
+ } else
+ caps = 0;
+#endif
}
+ volinfo->caps = caps;
ret = 0;
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
-
+ if (ret && (msg[0] != '\0')) {
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+ *op_errstr = gf_strdup (msg);
+ }
return ret;
}
@@ -827,7 +1060,11 @@ glusterd_op_stage_stop_volume (dict_t *dict, char **op_errstr)
gf_boolean_t is_run = _gf_false;
glusterd_volinfo_t *volinfo = NULL;
char msg[2048] = {0};
+ xlator_t *this = NULL;
+ gsync_status_param_t param = {0,};
+ this = THIS;
+ GF_ASSERT (this);
ret = glusterd_op_stop_volume_args_get (dict, &volname, &flags);
if (ret)
@@ -836,18 +1073,18 @@ glusterd_op_stage_stop_volume (dict_t *dict, char **op_errstr)
exists = glusterd_check_volume_exists (volname);
if (!exists) {
- snprintf (msg, sizeof (msg), "Volume %s does not exist", volname);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
+ snprintf (msg, sizeof (msg), FMTSTR_CHECK_VOL_EXISTS, volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
ret = -1;
goto out;
- } else {
- ret = 0;
}
ret = glusterd_volinfo_find (volname, &volinfo);
- if (ret)
+ if (ret) {
+ snprintf (msg, sizeof (msg), FMTSTR_CHECK_VOL_EXISTS, volname);
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
goto out;
+ }
ret = glusterd_validate_volume_id (dict, volinfo);
if (ret)
@@ -860,25 +1097,37 @@ glusterd_op_stage_stop_volume (dict_t *dict, char **op_errstr)
if (_gf_false == glusterd_is_volume_started (volinfo)) {
snprintf (msg, sizeof(msg), "Volume %s "
"is not in the started state", volname);
- gf_log ("", GF_LOG_ERROR, "Volume %s "
- "has not been started", volname);
- *op_errstr = gf_strdup (msg);
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
ret = -1;
goto out;
}
ret = glusterd_check_gsync_running (volinfo, &is_run);
if (ret && (is_run == _gf_false))
- gf_log ("", GF_LOG_WARNING, "Unable to get the status"
+ gf_log (this->name, GF_LOG_WARNING, "Unable to get the status"
" of active "GEOREP" session");
- if (is_run) {
- gf_log ("", GF_LOG_WARNING, GEOREP" sessions active"
+
+ param.volinfo = volinfo;
+ ret = dict_foreach (volinfo->gsync_slaves, _get_slave_status, &param);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "_get_slave_satus failed");
+ snprintf (msg, sizeof(msg), GEOREP" Unable to get the status "
+ "of active "GEOREP" session for the volume '%s'.\n"
+ "Please check the log file for more info. Use "
+ "'force' option to ignore and stop the volume.",
+ volname);
+ ret = -1;
+ goto out;
+ }
+
+ if (is_run && param.is_active) {
+ gf_log (this->name, GF_LOG_WARNING, GEOREP" sessions active"
"for the volume %s ", volname);
snprintf (msg, sizeof(msg), GEOREP" sessions are active "
"for the volume '%s'.\nUse 'volume "GEOREP" "
"status' command for more info. Use 'force' "
"option to ignore and stop the volume.",
volname);
- *op_errstr = gf_strdup (msg);
ret = -1;
goto out;
}
@@ -887,8 +1136,8 @@ glusterd_op_stage_stop_volume (dict_t *dict, char **op_errstr)
snprintf (msg, sizeof (msg), "Replace brick is in progress on "
"volume %s. Please retry after replace-brick "
"operation is committed or aborted", volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
+ gf_log (this->name, GF_LOG_WARNING, "replace-brick in progress "
+ "on volume %s", volname);
ret = -1;
goto out;
}
@@ -896,23 +1145,22 @@ glusterd_op_stage_stop_volume (dict_t *dict, char **op_errstr)
if (glusterd_is_defrag_on (volinfo)) {
snprintf (msg, sizeof(msg), "rebalance session is "
"in progress for the volume '%s'", volname);
- gf_log (THIS->name, GF_LOG_WARNING, "%s", msg);
- *op_errstr = gf_strdup (msg);
+ gf_log (this->name, GF_LOG_WARNING, "%s", msg);
ret = -1;
goto out;
}
- if (volinfo->rb_status != GF_RB_STATUS_NONE) {
+ if (volinfo->rep_brick.rb_status != GF_RB_STATUS_NONE) {
snprintf (msg, sizeof(msg), "replace-brick session is "
"in progress for the volume '%s'", volname);
- gf_log (THIS->name, GF_LOG_WARNING, "%s", msg);
- *op_errstr = gf_strdup (msg);
+ gf_log (this->name, GF_LOG_WARNING, "%s", msg);
ret = -1;
goto out;
}
out:
-
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ if (msg[0] != 0)
+ *op_errstr = gf_strdup (msg);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -925,21 +1173,20 @@ glusterd_op_stage_delete_volume (dict_t *dict, char **op_errstr)
gf_boolean_t exists = _gf_false;
glusterd_volinfo_t *volinfo = NULL;
char msg[2048] = {0};
+ xlator_t *this = NULL;
- ret = dict_get_str (dict, "volname", &volname);
+ this = THIS;
+ GF_ASSERT (this);
+ ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
exists = glusterd_check_volume_exists (volname);
-
if (!exists) {
- snprintf (msg, sizeof (msg), "Volume %s does not exist",
- volname);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
+ snprintf (msg, sizeof (msg), FMTSTR_CHECK_VOL_EXISTS, volname);
ret = -1;
goto out;
} else {
@@ -947,9 +1194,10 @@ glusterd_op_stage_delete_volume (dict_t *dict, char **op_errstr)
}
ret = glusterd_volinfo_find (volname, &volinfo);
-
- if (ret)
+ if (ret) {
+ snprintf (msg, sizeof (msg), FMTSTR_CHECK_VOL_EXISTS, volname);
goto out;
+ }
ret = glusterd_validate_volume_id (dict, volinfo);
if (ret)
@@ -959,8 +1207,16 @@ glusterd_op_stage_delete_volume (dict_t *dict, char **op_errstr)
snprintf (msg, sizeof (msg), "Volume %s has been started."
"Volume needs to be stopped before deletion.",
volname);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
+ ret = -1;
+ goto out;
+ }
+
+ if (volinfo->snap_count > 0 || !list_empty(&volinfo->snap_volumes)) {
+ snprintf (msg, sizeof (msg), "Cannot delete Volume %s ,"
+ "as it has %"PRIu64" snapshots. "
+ "To delete the volume, "
+ "first delete all the snapshots under it.",
+ volname, volinfo->snap_count);
ret = -1;
goto out;
}
@@ -968,7 +1224,11 @@ glusterd_op_stage_delete_volume (dict_t *dict, char **op_errstr)
ret = 0;
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ if (msg[0] != '\0') {
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
+ *op_errstr = gf_strdup (msg);
+ }
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -984,11 +1244,13 @@ glusterd_op_stage_heal_volume (dict_t *dict, char **op_errstr)
glusterd_conf_t *priv = NULL;
dict_t *opt_dict = NULL;
gf_xl_afr_op_t heal_op = GF_AFR_OP_INVALID;
+ xlator_t *this = NULL;
- priv = THIS->private;
+ this = THIS;
+ priv = this->private;
if (!priv) {
ret = -1;
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"priv is NULL");
goto out;
}
@@ -1003,7 +1265,7 @@ glusterd_op_stage_heal_volume (dict_t *dict, char **op_errstr)
if (ret) {
ret = -1;
snprintf (msg, sizeof (msg), "Volume %s does not exist", volname);
- gf_log (THIS->name, GF_LOG_ERROR, "%s", msg);
+ gf_log (this->name, GF_LOG_ERROR, "%s", msg);
*op_errstr = gf_strdup (msg);
goto out;
}
@@ -1017,7 +1279,7 @@ glusterd_op_stage_heal_volume (dict_t *dict, char **op_errstr)
snprintf (msg, sizeof (msg), "Volume %s is not of type "
"replicate", volname);
*op_errstr = gf_strdup (msg);
- gf_log (THIS->name, GF_LOG_WARNING, "%s", msg);
+ gf_log (this->name, GF_LOG_WARNING, "%s", msg);
goto out;
}
@@ -1043,29 +1305,37 @@ glusterd_op_stage_heal_volume (dict_t *dict, char **op_errstr)
snprintf (msg, sizeof (msg), "Self-heal-daemon is "
"disabled. Heal will not be triggered on volume %s",
volname);
- gf_log (THIS->name, GF_LOG_WARNING, "%s", msg);
- *op_errstr = gf_strdup (msg);
- goto out;
- }
-
- if (!glusterd_nodesvc_is_running ("glustershd")) {
- ret = -1;
- snprintf (msg, sizeof (msg), "Self-heal daemon is not "
- "running. Check self-heal daemon log file.");
+ gf_log (this->name, GF_LOG_WARNING, "%s", msg);
*op_errstr = gf_strdup (msg);
- gf_log (THIS->name, GF_LOG_WARNING, "%s", msg);
goto out;
}
ret = dict_get_int32 (dict, "heal-op", (int32_t*)&heal_op);
if (ret || (heal_op == GF_AFR_OP_INVALID)) {
ret = -1;
- snprintf (msg, sizeof (msg), "Invalid heal-op");
- *op_errstr = gf_strdup (msg);
- gf_log (THIS->name, GF_LOG_WARNING, "%s", msg);
+ *op_errstr = gf_strdup("Invalid heal-op");
+ gf_log (this->name, GF_LOG_WARNING, "%s", "Invalid heal-op");
goto out;
}
+ switch (heal_op) {
+ case GF_AFR_OP_INDEX_SUMMARY:
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT:
+ case GF_AFR_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
+ break;
+ default:
+ if (!glusterd_is_nodesvc_online("glustershd")){
+ ret = -1;
+ *op_errstr = gf_strdup ("Self-heal daemon is "
+ "not running. Check self-heal "
+ "daemon log file.");
+ gf_log (this->name, GF_LOG_WARNING, "%s",
+ "Self-heal daemon is not running."
+ "Check self-heal daemon log file.");
+ goto out;
+ }
+ }
+
ret = 0;
out:
gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
@@ -1083,6 +1353,13 @@ glusterd_op_stage_statedump_volume (dict_t *dict, char **op_errstr)
gf_boolean_t is_running = _gf_false;
glusterd_volinfo_t *volinfo = NULL;
char msg[2408] = {0,};
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
ret = glusterd_op_statedump_volume_args_get (dict, &volname, &options,
&option_cnt);
@@ -1091,10 +1368,7 @@ glusterd_op_stage_statedump_volume (dict_t *dict, char **op_errstr)
ret = glusterd_volinfo_find (volname, &volinfo);
if (ret) {
- snprintf (msg, sizeof(msg), "Volume %s does not exist",
- volname);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
+ snprintf (msg, sizeof(msg), FMTSTR_CHECK_VOL_EXISTS, volname);
goto out;
}
@@ -1104,16 +1378,31 @@ glusterd_op_stage_statedump_volume (dict_t *dict, char **op_errstr)
is_running = glusterd_is_volume_started (volinfo);
if (!is_running) {
- snprintf (msg, sizeof(msg), "Volume %s is not in a started"
+ snprintf (msg, sizeof(msg), "Volume %s is not in the started"
" state", volname);
- gf_log ("", GF_LOG_ERROR, "%s", msg);
- *op_errstr = gf_strdup (msg);
ret = -1;
goto out;
}
+ if (priv->op_version == GD_OP_VERSION_MIN &&
+ strstr (options, "quotad")) {
+ snprintf (msg, sizeof (msg), "The cluster is operating "
+ "at op-version 1. Taking quotad's statedump is "
+ "disallowed in this state");
+ ret = -1;
+ goto out;
+ }
+ if ((strstr (options, "quotad")) &&
+ (!glusterd_is_volume_quota_enabled (volinfo))) {
+ snprintf (msg, sizeof (msg), "Quota is not enabled on "
+ "volume %s", volname);
+ ret = -1;
+ goto out;
+ }
out:
- gf_log ("", GF_LOG_DEBUG, "Returning %d", ret);
+ if (ret && msg[0] != '\0')
+ *op_errstr = gf_strdup (msg);
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
return ret;
}
@@ -1187,7 +1476,6 @@ out:
return ret;
}
-
int
glusterd_op_create_volume (dict_t *dict, char **op_errstr)
{
@@ -1209,6 +1497,9 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
char *str = NULL;
char *username = NULL;
char *password = NULL;
+ int caps = 0;
+ int brickid = 0;
+ char msg[1024] __attribute__((unused)) = {0, };
this = THIS;
GF_ASSERT (this);
@@ -1220,15 +1511,14 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "Unable to allocate memory");
+ "Unable to allocate memory for volinfo");
goto out;
}
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
@@ -1237,13 +1527,15 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
ret = dict_get_int32 (dict, "type", &volinfo->type);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Unable to get type");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get type of volume"
+ " %s", volname);
goto out;
}
ret = dict_get_int32 (dict, "count", &volinfo->brick_count);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Unable to get count");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get brick count of"
+ " volume %s", volname);
goto out;
}
@@ -1257,7 +1549,8 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
ret = dict_get_str (dict, "bricks", &bricks);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Unable to get bricks");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get bricks for "
+ "volume %s", volname);
goto out;
}
@@ -1269,28 +1562,44 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
if (GF_CLUSTER_TYPE_REPLICATE == volinfo->type) {
ret = dict_get_int32 (dict, "replica-count",
&volinfo->replica_count);
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "replica count for volume %s", volname);
goto out;
+ }
} else if (GF_CLUSTER_TYPE_STRIPE == volinfo->type) {
ret = dict_get_int32 (dict, "stripe-count",
&volinfo->stripe_count);
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get stripe"
+ " count for volume %s", volname);
goto out;
+ }
} else if (GF_CLUSTER_TYPE_STRIPE_REPLICATE == volinfo->type) {
ret = dict_get_int32 (dict, "stripe-count",
&volinfo->stripe_count);
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get stripe"
+ " count for volume %s", volname);
goto out;
+ }
ret = dict_get_int32 (dict, "replica-count",
&volinfo->replica_count);
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to get "
+ "replica count for volume %s", volname);
goto out;
+ }
}
/* dist-leaf-count is the count of brick nodes for a given
subvolume of distribute */
- volinfo->dist_leaf_count = (volinfo->stripe_count *
- volinfo->replica_count);
+ volinfo->dist_leaf_count = glusterd_get_dist_leaf_count (volinfo);
+
+ /* subvol_count is the count of number of subvolumes present
+ for a given distribute volume */
+ volinfo->subvol_count = (volinfo->brick_count /
+ volinfo->dist_leaf_count);
/* Keep sub-count same as earlier, for the sake of backward
compatibility */
@@ -1300,27 +1609,28 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
ret = dict_get_str (dict, "transport", &trans_type);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "Unable to get transport");
+ "Unable to get transport type of volume %s", volname);
goto out;
}
ret = dict_get_str (dict, "volume-id", &str);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "Unable to get volume-id");
+ "Unable to get volume-id of volume %s", volname);
goto out;
}
ret = uuid_parse (str, volinfo->volume_id);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "unable to parse uuid %s", str);
+ "unable to parse uuid %s of volume %s", str, volname);
goto out;
}
ret = dict_get_str (dict, "internal-username", &username);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "unable to get internal username");
+ "unable to get internal username of volume %s",
+ volname);
goto out;
}
glusterd_auth_set_username (volinfo, username);
@@ -1328,7 +1638,8 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
ret = dict_get_str (dict, "internal-password", &password);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
- "unable to get internal password");
+ "unable to get internal password of volume %s",
+ volname);
goto out;
}
glusterd_auth_set_password (volinfo, password);
@@ -1351,20 +1662,55 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
if (count)
brick = strtok_r (brick_list+1, " \n", &saveptr);
+ caps = CAPS_BD | CAPS_THIN | CAPS_OFFLOAD_COPY | CAPS_OFFLOAD_SNAPSHOT;
+ brickid = glusterd_get_next_available_brickid (volinfo);
+ if (brickid < 0)
+ goto out;
while ( i <= count) {
ret = glusterd_brickinfo_new_from_brick (brick, &brickinfo);
if (ret)
goto out;
+ GLUSTERD_ASSIGN_BRICKID_TO_BRICKINFO (brickinfo, volinfo,
+ brickid++);
+
ret = glusterd_resolve_brick (brickinfo);
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, FMTSTR_RESOLVE_BRICK,
+ brickinfo->hostname, brickinfo->path);
goto out;
+ }
+
+#ifdef HAVE_BD_XLATOR
+ if (!uuid_compare (brickinfo->uuid, MY_UUID)) {
+ if (brickinfo->vg[0]) {
+ ret = glusterd_is_valid_vg (brickinfo, 0, msg);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "%s",
+ msg);
+ goto out;
+ }
+
+ /* if anyone of the brick does not have thin
+ support, disable it for entire volume */
+ caps &= brickinfo->caps;
+
+
+ } else
+ caps = 0;
+ }
+#endif
+
list_add_tail (&brickinfo->brick_list, &volinfo->bricks);
brick = strtok_r (NULL, " \n", &saveptr);
i++;
}
+ gd_update_volume_op_versions (volinfo);
+
+ volinfo->caps = caps;
+
ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
if (ret) {
glusterd_store_delete_volume (volinfo);
@@ -1378,19 +1724,15 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr)
goto out;
}
- ret = glusterd_volume_compute_cksum (volinfo);
- if (ret) {
- *op_errstr = gf_strdup ("Failed to compute checksum of volume");
- goto out;
- }
-
- volinfo->defrag_status = 0;
- list_add_tail (&volinfo->vol_list, &priv->volumes);
+ volinfo->rebal.defrag_status = 0;
+ list_add_order (&volinfo->vol_list, &priv->volumes,
+ glusterd_compare_volume_name);
vol_added = _gf_true;
+
out:
GF_FREE(free_ptr);
if (!vol_added && volinfo)
- glusterd_volinfo_delete (volinfo);
+ glusterd_volinfo_unref (volinfo);
return ret;
}
@@ -1402,18 +1744,28 @@ glusterd_op_start_volume (dict_t *dict, char **op_errstr)
int flags = 0;
glusterd_volinfo_t *volinfo = NULL;
glusterd_brickinfo_t *brickinfo = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
ret = glusterd_op_start_volume_args_get (dict, &volname, &flags);
if (ret)
goto out;
ret = glusterd_volinfo_find (volname, &volinfo);
-
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, FMTSTR_CHECK_VOL_EXISTS,
+ volname);
goto out;
+ }
+
list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
ret = glusterd_brick_start (volinfo, brickinfo, _gf_true);
- if (ret)
+ /* If 'force' try to start all bricks regardless of success or
+ * failure
+ */
+ if (!(flags & GF_CLI_FLAG_OP_FORCE) && ret)
goto out;
}
@@ -1426,10 +1778,78 @@ glusterd_op_start_volume (dict_t *dict, char **op_errstr)
ret = glusterd_nodesvcs_handle_graph_change (volinfo);
out:
- gf_log ("", GF_LOG_DEBUG, "returning %d ", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "returning %d ", ret);
return ret;
}
+int
+glusterd_stop_volume (glusterd_volinfo_t *volinfo)
+{
+ int ret = -1;
+ glusterd_brickinfo_t *brickinfo = NULL;
+ char mountdir[PATH_MAX] = {0,};
+ runner_t runner = {0,};
+ char pidfile[PATH_MAX] = {0,};
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+
+ GF_VALIDATE_OR_GOTO (this->name, volinfo, out);
+
+ list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
+ ret = glusterd_brick_stop (volinfo, brickinfo, _gf_false);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to stop "
+ "brick (%s)", brickinfo->path);
+ goto out;
+ }
+ }
+
+ glusterd_set_volume_status (volinfo, GLUSTERD_STATUS_STOPPED);
+
+ ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to store volinfo of "
+ "%s volume", volinfo->volname);
+ goto out;
+ }
+
+ /* If quota auxiliary mount is present, unmount it */
+ GLUSTERFS_GET_AUX_MOUNT_PIDFILE (pidfile, volinfo->volname);
+
+ if (!gf_is_service_running (pidfile, NULL)) {
+ gf_log (this->name, GF_LOG_DEBUG, "Aux mount of volume %s "
+ "absent", volinfo->volname);
+ } else {
+ GLUSTERD_GET_QUOTA_AUX_MOUNT_PATH (mountdir, volinfo->volname,
+ "/");
+
+ runinit (&runner);
+ runner_add_args (&runner, "umount",
+
+ #if GF_LINUX_HOST_OS
+ "-l",
+ #endif
+ mountdir, NULL);
+ ret = runner_run_reuse (&runner);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "umount on %s failed, "
+ "reason : %s", mountdir, strerror (errno));
+
+ runner_end (&runner);
+ }
+
+ ret = glusterd_nodesvcs_handle_graph_change (volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to notify graph "
+ "change for %s volume", volinfo->volname);
+ goto out;
+ }
+
+out:
+ return ret;
+}
int
glusterd_op_stop_volume (dict_t *dict)
@@ -1438,30 +1858,28 @@ glusterd_op_stop_volume (dict_t *dict)
int flags = 0;
char *volname = NULL;
glusterd_volinfo_t *volinfo = NULL;
- glusterd_brickinfo_t *brickinfo = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
ret = glusterd_op_stop_volume_args_get (dict, &volname, &flags);
if (ret)
goto out;
ret = glusterd_volinfo_find (volname, &volinfo);
-
- if (ret)
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, FMTSTR_CHECK_VOL_EXISTS,
+ volname);
goto out;
-
- list_for_each_entry (brickinfo, &volinfo->bricks, brick_list) {
- ret = glusterd_brick_stop (volinfo, brickinfo, _gf_false);
- if (ret)
- goto out;
}
- glusterd_set_volume_status (volinfo, GLUSTERD_STATUS_STOPPED);
-
- ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_INCREMENT);
- if (ret)
+ ret = glusterd_stop_volume (volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to stop %s volume",
+ volname);
goto out;
-
- ret = glusterd_nodesvcs_handle_graph_change (volinfo);
+ }
out:
return ret;
}
@@ -1482,18 +1900,24 @@ glusterd_op_delete_volume (dict_t *dict)
ret = dict_get_str (dict, "volname", &volname);
if (ret) {
- gf_log ("", GF_LOG_ERROR, "Unable to get volume name");
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
goto out;
}
ret = glusterd_volinfo_find (volname, &volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, FMTSTR_CHECK_VOL_EXISTS,
+ volname);
+ goto out;
+ }
+ ret = glusterd_remove_auxiliary_mount (volname);
if (ret)
goto out;
ret = glusterd_delete_volume (volinfo);
out:
- gf_log ("", GF_LOG_DEBUG, "returning %d", ret);
+ gf_log (this->name, GF_LOG_DEBUG, "returning %d", ret);
return ret;
}
@@ -1529,6 +1953,12 @@ glusterd_op_statedump_volume (dict_t *dict, char **op_errstr)
ret = glusterd_nfs_statedump (options, option_cnt, op_errstr);
if (ret)
goto out;
+
+ } else if (strstr (options, "quotad")) {
+ ret = glusterd_quotad_statedump (options, option_cnt,
+ op_errstr);
+ if (ret)
+ goto out;
} else {
list_for_each_entry (brickinfo, &volinfo->bricks,
brick_list) {
@@ -1611,7 +2041,9 @@ glusterd_clearlocks_unmount (glusterd_volinfo_t *volinfo, char *mntpt)
runner_add_args (&runner, "/bin/umount", "-f", NULL);
runner_argprintf (&runner, "%s", mntpt);
+ synclock_unlock (&priv->big_lock);
ret = runner_run (&runner);
+ synclock_lock (&priv->big_lock);
if (ret) {
ret = 0;
gf_log ("", GF_LOG_DEBUG,
@@ -1683,7 +2115,9 @@ glusterd_clearlocks_mount (glusterd_volinfo_t *volinfo, char **xl_opts,
}
runner_argprintf (&runner, "%s", mntpt);
+ synclock_unlock (&priv->big_lock);
ret = runner_run (&runner);
+ synclock_lock (&priv->big_lock);
if (ret) {
gf_log (THIS->name, GF_LOG_DEBUG,
"Could not start glusterfs");
@@ -1747,7 +2181,7 @@ out:
}
int
-glusterd_op_clearlocks_volume (dict_t *dict, char **op_errstr)
+glusterd_op_clearlocks_volume (dict_t *dict, char **op_errstr, dict_t *rsp_dict)
{
int32_t ret = -1;
int i = 0;
@@ -1762,7 +2196,6 @@ glusterd_op_clearlocks_volume (dict_t *dict, char **op_errstr)
char result[PATH_MAX] = {0,};
char *mntpt = NULL;
char **xl_opts = NULL;
- dict_t *ctx = NULL;
glusterd_volinfo_t *volinfo = NULL;
ret = dict_get_str (dict, "volname", &volname);
@@ -1851,14 +2284,8 @@ glusterd_op_clearlocks_volume (dict_t *dict, char **op_errstr)
goto umount;
}
- ctx = glusterd_op_get_ctx ();
- if (!ctx)
- /*Impossible. Only originator glusterd can
- * come here. */
- goto umount;
-
free_ptr = gf_strdup(result);
- if (dict_set_dynstr (ctx, "lk-summary", free_ptr)) {
+ if (dict_set_dynstr (rsp_dict, "lk-summary", free_ptr)) {
GF_FREE (free_ptr);
snprintf (msg, sizeof (msg), "Failed to set clear-locks "
"result");
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
new file mode 100644
index 000000000..1374e82cd
--- /dev/null
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -0,0 +1,1558 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterd-volgen.h"
+#include "glusterd-utils.h"
+
+static int
+check_dict_key_value (dict_t *dict, char *key, char *value)
+{
+ glusterd_conf_t *priv = NULL;
+ int ret = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ if (!dict) {
+ gf_log (this->name, GF_LOG_ERROR, "Received Empty Dict.");
+ ret = -1;
+ goto out;
+ }
+
+ if (!key) {
+ gf_log (this->name, GF_LOG_ERROR, "Received Empty Key.");
+ ret = -1;
+ goto out;
+ }
+
+ if (!value) {
+ gf_log (this->name, GF_LOG_ERROR, "Received Empty Value.");
+ ret = -1;
+ goto out;
+ }
+
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+
+ return ret;
+}
+
+static int
+get_volname_volinfo (dict_t *dict, char **volname, glusterd_volinfo_t **volinfo)
+{
+ glusterd_conf_t *priv = NULL;
+ int ret = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = dict_get_str (dict, "volname", volname);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to get volume name");
+ goto out;
+ }
+
+ ret = glusterd_volinfo_find (*volname, volinfo);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to allocate memory");
+ goto out;
+ }
+
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+
+ return ret;
+}
+
+static int
+validate_cache_max_min_size (dict_t *dict, char *key, char *value,
+ char **op_errstr)
+{
+ char *current_max_value = NULL;
+ char *current_min_value = NULL;
+ char errstr[2048] = "";
+ char *volname = NULL;
+ glusterd_conf_t *priv = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ int ret = 0;
+ uint64_t max_value = 0;
+ uint64_t min_value = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = check_dict_key_value (dict, key, value);
+ if (ret)
+ goto out;
+
+ ret = get_volname_volinfo (dict, &volname, &volinfo);
+ if (ret)
+ goto out;
+
+ if ((!strcmp (key, "performance.cache-min-file-size")) ||
+ (!strcmp (key, "cache-min-file-size"))) {
+ glusterd_volinfo_get (volinfo,
+ "performance.cache-max-file-size",
+ &current_max_value);
+ if (current_max_value) {
+ gf_string2bytesize_uint64 (current_max_value, &max_value);
+ gf_string2bytesize_uint64 (value, &min_value);
+ current_min_value = value;
+ }
+ } else if ((!strcmp (key, "performance.cache-max-file-size")) ||
+ (!strcmp (key, "cache-max-file-size"))) {
+ glusterd_volinfo_get (volinfo,
+ "performance.cache-min-file-size",
+ &current_min_value);
+ if (current_min_value) {
+ gf_string2bytesize_uint64 (current_min_value, &min_value);
+ gf_string2bytesize_uint64 (value, &max_value);
+ current_max_value = value;
+ }
+ }
+
+ if (min_value > max_value) {
+ snprintf (errstr, sizeof (errstr),
+ "cache-min-file-size (%s) is greater than "
+ "cache-max-file-size (%s)",
+ current_min_value, current_max_value);
+ gf_log (this->name, GF_LOG_ERROR, "%s", errstr);
+ *op_errstr = gf_strdup (errstr);
+ ret = -1;
+ goto out;
+ }
+
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+
+ return ret;
+}
+
+static int
+validate_quota (dict_t *dict, char *key, char *value,
+ char **op_errstr)
+{
+ char errstr[2048] = "";
+ char *volname = NULL;
+ glusterd_conf_t *priv = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ int ret = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = check_dict_key_value (dict, key, value);
+ if (ret)
+ goto out;
+
+ ret = get_volname_volinfo (dict, &volname, &volinfo);
+ if (ret)
+ goto out;
+
+ ret = glusterd_volinfo_get_boolean (volinfo, VKEY_FEATURES_QUOTA);
+ if (ret == -1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "failed to get the quota status");
+ goto out;
+ }
+
+ if (ret == _gf_false) {
+ snprintf (errstr, sizeof (errstr),
+ "Cannot set %s. Enable quota first.", key);
+ gf_log (this->name, GF_LOG_ERROR, "%s", errstr);
+ *op_errstr = gf_strdup (errstr);
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+
+ return ret;
+}
+
+static int
+validate_stripe (dict_t *dict, char *key, char *value, char **op_errstr)
+{
+ char errstr[2048] = "";
+ char *volname = NULL;
+ glusterd_conf_t *priv = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ int ret = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = check_dict_key_value (dict, key, value);
+ if (ret)
+ goto out;
+
+ ret = get_volname_volinfo (dict, &volname, &volinfo);
+ if (ret)
+ goto out;
+
+ if (volinfo->stripe_count == 1) {
+ snprintf (errstr, sizeof (errstr),
+ "Cannot set %s for a non-stripe volume.", key);
+ gf_log (this->name, GF_LOG_ERROR, "%s", errstr);
+ *op_errstr = gf_strdup (errstr);
+ ret = -1;
+ goto out;
+ }
+
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+
+ return ret;
+}
+
+static int
+validate_subvols_per_directory (dict_t *dict, char *key, char *value,
+ char **op_errstr)
+{
+ char errstr[2048] = "";
+ char *volname = NULL;
+ glusterd_conf_t *priv = NULL;
+ glusterd_volinfo_t *volinfo = NULL;
+ int ret = 0;
+ int subvols = 0;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ ret = check_dict_key_value (dict, key, value);
+ if (ret)
+ goto out;
+
+ ret = get_volname_volinfo (dict, &volname, &volinfo);
+ if (ret)
+ goto out;
+
+ subvols = atoi(value);
+
+ /* Checking if the subvols-per-directory exceed the total
+ number of subvolumes. */
+ if (subvols > volinfo->subvol_count) {
+ snprintf (errstr, sizeof(errstr),
+ "subvols-per-directory(%d) is greater "
+ "than the number of subvolumes(%d).",
+ subvols, volinfo->subvol_count);
+ gf_log (this->name, GF_LOG_ERROR,
+ "%s.", errstr);
+ *op_errstr = gf_strdup (errstr);
+ ret = -1;
+ goto out;
+ }
+
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "Returning %d", ret);
+
+ return ret;
+}
+
+
+/* dispatch table for VOLUME SET
+ * -----------------------------
+ *
+ * Format of entries:
+ *
+ * First field is the <key>, for the purpose of looking it up
+ * in volume dictionary. Each <key> is of the format "<domain>.<specifier>".
+ *
+ * Second field is <voltype>.
+ *
+ * Third field is <option>, if its unset, it's assumed to be
+ * the same as <specifier>.
+ *
+ * Fourth field is <value>. In this context they are used to specify
+ * a default. That is, even the volume dict doesn't have a value,
+ * we procced as if the default value were set for it.
+ *
+ * Fifth field is <doctype>, which decides if the option is public and available
+ * in "set help" or not. "NO_DOC" entries are not part of the public interface
+ * and are subject to change at any time. This also decides if an option is
+ * global (apllies to all volumes) or normal (applies to only specified volume).
+ *
+ * Sixth field is <flags>.
+ *
+ * Seventh field is <op-version>.
+ *
+ * Eight field is description of option: If NULL, tried to fetch from
+ * translator code's xlator_options table.
+ *
+ * Nineth field is validation function: If NULL, xlator's option specific
+ * validation will be tried, otherwise tried at glusterd code itself.
+ *
+ * There are two type of entries: basic and special.
+ *
+ * - Basic entries are the ones where the <option> does _not_ start with
+ * the bang! character ('!').
+ *
+ * In their case, <option> is understood as an option for an xlator of
+ * type <voltype>. Their effect is to copy over the volinfo->dict[<key>]
+ * value to all graph nodes of type <voltype> (if such a value is set).
+ *
+ * You are free to add entries of this type, they will become functional
+ * just by being present in the table.
+ *
+ * - Special entries where the <option> starts with the bang!.
+ *
+ * They are not applied to all graphs during generation, and you cannot
+ * extend them in a trivial way which could be just picked up. Better
+ * not touch them unless you know what you do.
+ *
+ *
+ * Another kind of grouping for options, according to visibility:
+ *
+ * - Exported: one which is used in the code. These are characterized by
+ * being used a macro as <key> (of the format VKEY_..., defined in
+ * glusterd-volgen.h
+ *
+ * - Non-exported: the rest; these have string literal <keys>.
+ *
+ * Adhering to this policy, option name changes shall be one-liners.
+ *
+ */
+
+struct volopt_map_entry glusterd_volopt_map[] = {
+ /* DHT xlator options */
+ { .key = "cluster.lookup-unhashed",
+ .voltype = "cluster/distribute",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.min-free-disk",
+ .voltype = "cluster/distribute",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.min-free-inodes",
+ .voltype = "cluster/distribute",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.rebalance-stats",
+ .voltype = "cluster/distribute",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.subvols-per-directory",
+ .voltype = "cluster/distribute",
+ .option = "directory-layout-spread",
+ .op_version = 2,
+ .validate_fn = validate_subvols_per_directory,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.readdir-optimize",
+ .voltype = "cluster/distribute",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.rsync-hash-regex",
+ .voltype = "cluster/distribute",
+ .type = NO_DOC,
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.extra-hash-regex",
+ .voltype = "cluster/distribute",
+ .type = NO_DOC,
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.dht-xattr-name",
+ .voltype = "cluster/distribute",
+ .option = "xattr-name",
+ .type = NO_DOC,
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* NUFA xlator options (Distribute special case) */
+ { .key = "cluster.nufa",
+ .voltype = "cluster/distribute",
+ .option = "!nufa",
+ .type = NO_DOC,
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.local-volume-name",
+ .voltype = "cluster/nufa",
+ .option = "local-volume-name",
+ .type = NO_DOC,
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* Switch xlator options (Distribute special case) */
+ { .key = "cluster.switch",
+ .voltype = "cluster/distribute",
+ .option = "!switch",
+ .type = NO_DOC,
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.switch-pattern",
+ .voltype = "cluster/switch",
+ .option = "pattern.switch.case",
+ .type = NO_DOC,
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* AFR xlator options */
+ { .key = "cluster.entry-change-log",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.read-subvolume",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.read-subvolume-index",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.read-hash-mode",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.background-self-heal-count",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.metadata-self-heal",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.data-self-heal",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.entry-self-heal",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.self-heal-daemon",
+ .voltype = "cluster/replicate",
+ .option = "!self-heal-daemon",
+ .op_version = 1
+ },
+ { .key = "cluster.heal-timeout",
+ .voltype = "cluster/replicate",
+ .option = "!heal-timeout",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.strict-readdir",
+ .voltype = "cluster/replicate",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.self-heal-window-size",
+ .voltype = "cluster/replicate",
+ .option = "data-self-heal-window-size",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.data-change-log",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.metadata-change-log",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.data-self-heal-algorithm",
+ .voltype = "cluster/replicate",
+ .option = "data-self-heal-algorithm",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.eager-lock",
+ .voltype = "cluster/replicate",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.quorum-type",
+ .voltype = "cluster/replicate",
+ .option = "quorum-type",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.quorum-count",
+ .voltype = "cluster/replicate",
+ .option = "quorum-count",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.choose-local",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.self-heal-readdir-size",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.post-op-delay-secs",
+ .voltype = "cluster/replicate",
+ .type = NO_DOC,
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.readdir-failover",
+ .voltype = "cluster/replicate",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.ensure-durability",
+ .voltype = "cluster/replicate",
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* Stripe xlator options */
+ { .key = "cluster.stripe-block-size",
+ .voltype = "cluster/stripe",
+ .option = "block-size",
+ .op_version = 1,
+ .validate_fn = validate_stripe,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "cluster.stripe-coalesce",
+ .voltype = "cluster/stripe",
+ .option = "coalesce",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* IO-stats xlator options */
+ { .key = VKEY_DIAG_LAT_MEASUREMENT,
+ .voltype = "debug/io-stats",
+ .option = "latency-measurement",
+ .value = "off",
+ .op_version = 1
+ },
+ { .key = "diagnostics.dump-fd-stats",
+ .voltype = "debug/io-stats",
+ .op_version = 1
+ },
+ { .key = VKEY_DIAG_CNT_FOP_HITS,
+ .voltype = "debug/io-stats",
+ .option = "count-fop-hits",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1
+ },
+ { .key = "diagnostics.brick-log-level",
+ .voltype = "debug/io-stats",
+ .option = "!brick-log-level",
+ .op_version = 1
+ },
+ { .key = "diagnostics.client-log-level",
+ .voltype = "debug/io-stats",
+ .option = "!client-log-level",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "diagnostics.brick-sys-log-level",
+ .voltype = "debug/io-stats",
+ .option = "!sys-log-level",
+ .op_version = 1
+ },
+ { .key = "diagnostics.client-sys-log-level",
+ .voltype = "debug/io-stats",
+ .option = "!sys-log-level",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "diagnostics.brick-logger",
+ .voltype = "debug/io-stats",
+ .option = "!logger",
+ .op_version = 4
+ },
+ { .key = "diagnostics.client-logger",
+ .voltype = "debug/io-stats",
+ .option = "!logger",
+ .op_version = 4,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "diagnostics.brick-log-format",
+ .voltype = "debug/io-stats",
+ .option = "!log-format",
+ .op_version = 4
+ },
+ { .key = "diagnostics.client-log-format",
+ .voltype = "debug/io-stats",
+ .option = "!log-format",
+ .op_version = 4,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* IO-cache xlator options */
+ { .key = "performance.cache-max-file-size",
+ .voltype = "performance/io-cache",
+ .option = "max-file-size",
+ .op_version = 1,
+ .validate_fn = validate_cache_max_min_size,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.cache-min-file-size",
+ .voltype = "performance/io-cache",
+ .option = "min-file-size",
+ .op_version = 1,
+ .validate_fn = validate_cache_max_min_size,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.cache-refresh-timeout",
+ .voltype = "performance/io-cache",
+ .option = "cache-timeout",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.cache-priority",
+ .voltype = "performance/io-cache",
+ .option = "priority",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.cache-size",
+ .voltype = "performance/io-cache",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* IO-threads xlator options */
+ { .key = "performance.io-thread-count",
+ .voltype = "performance/io-threads",
+ .option = "thread-count",
+ .op_version = 1
+ },
+ { .key = "performance.high-prio-threads",
+ .voltype = "performance/io-threads",
+ .op_version = 1
+ },
+ { .key = "performance.normal-prio-threads",
+ .voltype = "performance/io-threads",
+ .op_version = 1
+ },
+ { .key = "performance.low-prio-threads",
+ .voltype = "performance/io-threads",
+ .op_version = 1
+ },
+ { .key = "performance.least-prio-threads",
+ .voltype = "performance/io-threads",
+ .op_version = 1
+ },
+ { .key = "performance.enable-least-priority",
+ .voltype = "performance/io-threads",
+ .op_version = 1
+ },
+ { .key = "performance.least-rate-limit",
+ .voltype = "performance/io-threads",
+ .op_version = 2
+ },
+
+ /* Other perf xlators' options */
+ { .key = "performance.cache-size",
+ .voltype = "performance/quick-read",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.flush-behind",
+ .voltype = "performance/write-behind",
+ .option = "flush-behind",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.write-behind-window-size",
+ .voltype = "performance/write-behind",
+ .option = "cache-size",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.strict-o-direct",
+ .voltype = "performance/write-behind",
+ .option = "strict-O_DIRECT",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.strict-write-ordering",
+ .voltype = "performance/write-behind",
+ .option = "strict-write-ordering",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.lazy-open",
+ .voltype = "performance/open-behind",
+ .option = "lazy-open",
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.read-after-open",
+ .voltype = "performance/open-behind",
+ .option = "read-after-open",
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.read-ahead-page-count",
+ .voltype = "performance/read-ahead",
+ .option = "page-count",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.md-cache-timeout",
+ .voltype = "performance/md-cache",
+ .option = "md-cache-timeout",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* Crypt xlator options */
+
+ { .key = "features.encryption",
+ .voltype = "encryption/crypt",
+ .option = "!feat",
+ .value = "off",
+ .op_version = 3,
+ .description = "enable/disable client-side encryption for "
+ "the volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+
+ { .key = "encryption.master-key",
+ .voltype = "encryption/crypt",
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "encryption.data-key-size",
+ .voltype = "encryption/crypt",
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "encryption.block-size",
+ .voltype = "encryption/crypt",
+ .op_version = 3,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* Client xlator options */
+ { .key = "network.frame-timeout",
+ .voltype = "protocol/client",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "network.ping-timeout",
+ .voltype = "protocol/client",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "network.tcp-window-size",
+ .voltype = "protocol/client",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "features.lock-heal",
+ .voltype = "protocol/client",
+ .option = "lk-heal",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "features.grace-timeout",
+ .voltype = "protocol/client",
+ .option = "grace-timeout",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "client.ssl",
+ .voltype = "protocol/client",
+ .option = "transport.socket.ssl-enabled",
+ .type = NO_DOC,
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "network.remote-dio",
+ .voltype = "protocol/client",
+ .option = "filter-O_DIRECT",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* Server xlator options */
+ { .key = "network.tcp-window-size",
+ .voltype = "protocol/server",
+ .op_version = 1
+ },
+ { .key = "network.inode-lru-limit",
+ .voltype = "protocol/server",
+ .op_version = 1
+ },
+ { .key = AUTH_ALLOW_MAP_KEY,
+ .voltype = "protocol/server",
+ .option = "!server-auth",
+ .value = "*",
+ .op_version = 1
+ },
+ { .key = AUTH_REJECT_MAP_KEY,
+ .voltype = "protocol/server",
+ .option = "!server-auth",
+ .op_version = 1
+ },
+ { .key = "transport.keepalive",
+ .voltype = "protocol/server",
+ .option = "transport.socket.keepalive",
+ .type = NO_DOC,
+ .op_version = 1
+ },
+ { .key = "server.allow-insecure",
+ .voltype = "protocol/server",
+ .option = "rpc-auth-allow-insecure",
+ .type = NO_DOC,
+ .op_version = 1
+ },
+ { .key = "server.root-squash",
+ .voltype = "protocol/server",
+ .option = "root-squash",
+ .op_version = 2
+ },
+ { .key = "server.anonuid",
+ .voltype = "protocol/server",
+ .option = "anonuid",
+ .op_version = 3
+ },
+ { .key = "server.anongid",
+ .voltype = "protocol/server",
+ .option = "anongid",
+ .op_version = 3
+ },
+ { .key = "server.statedump-path",
+ .voltype = "protocol/server",
+ .option = "statedump-path",
+ .op_version = 1
+ },
+ { .key = "server.outstanding-rpc-limit",
+ .voltype = "protocol/server",
+ .option = "rpc.outstanding-rpc-limit",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
+ { .key = "features.lock-heal",
+ .voltype = "protocol/server",
+ .option = "lk-heal",
+ .type = NO_DOC,
+ .op_version = 1
+ },
+ { .key = "features.grace-timeout",
+ .voltype = "protocol/server",
+ .option = "grace-timeout",
+ .type = NO_DOC,
+ .op_version = 1
+ },
+ { .key = "server.ssl",
+ .voltype = "protocol/server",
+ .option = "transport.socket.ssl-enabled",
+ .type = NO_DOC,
+ .op_version = 2
+ },
+ { .key = "cluster.nsr",
+ .voltype = "cluster/nsr",
+ .option = "!nsr",
+ .op_version = 3,
+ .description = "enable NSR instead of AFR for replication",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "cluster.nsr.recon",
+ .voltype = "cluster/nsr",
+ .op_version = 3,
+ .description = "enable NSR reconciliation",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "cluster.nsr.quorum-percent",
+ .voltype = "cluster/nsr",
+ .option = "quorum-percent",
+ .op_version = 3,
+ .description = "percent of rep_count-1 bricks that must be up"
+ },
+
+ /* Performance xlators enable/disbable options */
+ { .key = "performance.write-behind",
+ .voltype = "performance/write-behind",
+ .option = "!perf",
+ .value = "on",
+ .op_version = 1,
+ .description = "enable/disable write-behind translator in the "
+ "volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.read-ahead",
+ .voltype = "performance/read-ahead",
+ .option = "!perf",
+ .value = "on",
+ .op_version = 1,
+ .description = "enable/disable read-ahead translator in the volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.readdir-ahead",
+ .voltype = "performance/readdir-ahead",
+ .option = "!perf",
+ .value = "off",
+ .op_version = 3,
+ .description = "enable/disable readdir-ahead translator in the volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+
+ { .key = "performance.io-cache",
+ .voltype = "performance/io-cache",
+ .option = "!perf",
+ .value = "on",
+ .op_version = 1,
+ .description = "enable/disable io-cache translator in the volume.",
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "performance.quick-read",
+ .voltype = "performance/quick-read",
+ .option = "!perf",
+ .value = "on",
+ .op_version = 1,
+ .description = "enable/disable quick-read translator in the volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+
+ },
+ { .key = "performance.open-behind",
+ .voltype = "performance/open-behind",
+ .option = "!perf",
+ .value = "on",
+ .op_version = 2,
+ .description = "enable/disable open-behind translator in the volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+
+ },
+ { .key = "performance.stat-prefetch",
+ .voltype = "performance/md-cache",
+ .option = "!perf",
+ .value = "on",
+ .op_version = 1,
+ .description = "enable/disable meta-data caching translator in the "
+ "volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.client-io-threads",
+ .voltype = "performance/io-threads",
+ .option = "!perf",
+ .value = "off",
+ .op_version = 1,
+ .description = "enable/disable io-threads translator in the client "
+ "graph of volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.nfs.write-behind",
+ .voltype = "performance/write-behind",
+ .option = "!nfsperf",
+ .value = "on",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.nfs.read-ahead",
+ .voltype = "performance/read-ahead",
+ .option = "!nfsperf",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.nfs.io-cache",
+ .voltype = "performance/io-cache",
+ .option = "!nfsperf",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.nfs.quick-read",
+ .voltype = "performance/quick-read",
+ .option = "!nfsperf",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.nfs.stat-prefetch",
+ .voltype = "performance/md-cache",
+ .option = "!nfsperf",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.nfs.io-threads",
+ .voltype = "performance/io-threads",
+ .option = "!nfsperf",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "performance.force-readdirp",
+ .voltype = "performance/md-cache",
+ .option = "force-readdirp",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+
+ /* Feature translators */
+ { .key = "features.file-snapshot",
+ .voltype = "features/qemu-block",
+ .option = "!feat",
+ .value = "off",
+ .op_version = 3,
+ .description = "enable/disable file-snapshot feature in the "
+ "volume.",
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+
+#ifdef HAVE_LIB_Z
+ /* Compressor-decompressor xlator options
+ * defaults used from xlator/features/compress/src/cdc.h
+ */
+ { .key = "network.compression",
+ .voltype = "features/cdc",
+ .option = "!feat",
+ .value = "off",
+ .op_version = 3,
+ .description = "enable/disable network compression translator",
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "network.compression.window-size",
+ .voltype = "features/cdc",
+ .option = "window-size",
+ .op_version = 3
+ },
+ { .key = "network.compression.mem-level",
+ .voltype = "features/cdc",
+ .option = "mem-level",
+ .op_version = 3
+ },
+ { .key = "network.compression.min-size",
+ .voltype = "features/cdc",
+ .option = "min-size",
+ .op_version = 3
+ },
+ { .key = "network.compression.compression-level",
+ .voltype = "features/cdc",
+ .option = "compression-level",
+ .op_version = 3
+ },
+ { .key = "network.compression.debug",
+ .voltype = "features/cdc",
+ .option = "debug",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+#endif
+
+ /* Quota xlator options */
+ { .key = VKEY_FEATURES_LIMIT_USAGE,
+ .voltype = "features/quota",
+ .option = "limit-set",
+ .type = NO_DOC,
+ .op_version = 1,
+ },
+ {
+ .key = "features.quota-timeout",
+ .voltype = "features/quota",
+ .option = "timeout",
+ .value = "0",
+ .op_version = 1,
+ .validate_fn = validate_quota,
+ },
+ { .key = "features.default-soft-limit",
+ .voltype = "features/quota",
+ .option = "default-soft-limit",
+ .type = NO_DOC,
+ .op_version = 3,
+ },
+ { .key = "features.soft-timeout",
+ .voltype = "features/quota",
+ .option = "soft-timeout",
+ .type = NO_DOC,
+ .op_version = 3,
+ },
+ { .key = "features.hard-timeout",
+ .voltype = "features/quota",
+ .option = "hard-timeout",
+ .type = NO_DOC,
+ .op_version = 3,
+ },
+ { .key = "features.alert-time",
+ .voltype = "features/quota",
+ .option = "alert-time",
+ .type = NO_DOC,
+ .op_version = 3,
+ },
+ { .key = "features.quota-deem-statfs",
+ .voltype = "features/quota",
+ .option = "deem-statfs",
+ .value = "off",
+ .type = DOC,
+ .op_version = 2,
+ .validate_fn = validate_quota,
+ },
+
+ /* Marker xlator options */
+ { .key = VKEY_MARKER_XTIME,
+ .voltype = "features/marker",
+ .option = "xtime",
+ .value = "off",
+ .type = NO_DOC,
+ .flags = OPT_FLAG_FORCE,
+ .op_version = 1
+ },
+ { .key = VKEY_MARKER_XTIME,
+ .voltype = "features/marker",
+ .option = "!xtime",
+ .value = "off",
+ .type = NO_DOC,
+ .flags = OPT_FLAG_FORCE,
+ .op_version = 1
+ },
+ { .key = VKEY_MARKER_XTIME_FORCE,
+ .voltype = "features/marker",
+ .option = "gsync-force-xtime",
+ .value = "off",
+ .type = NO_DOC,
+ .flags = OPT_FLAG_FORCE,
+ .op_version = 2
+ },
+ { .key = VKEY_MARKER_XTIME_FORCE,
+ .voltype = "features/marker",
+ .option = "!gsync-force-xtime",
+ .value = "off",
+ .type = NO_DOC,
+ .flags = OPT_FLAG_FORCE,
+ .op_version = 2
+ },
+ { .key = VKEY_FEATURES_QUOTA,
+ .voltype = "features/marker",
+ .option = "quota",
+ .value = "off",
+ .type = NO_DOC,
+ .flags = OPT_FLAG_FORCE,
+ .op_version = 1
+ },
+
+ /* Debug xlators options */
+ { .key = "debug.trace",
+ .voltype = "debug/trace",
+ .option = "!debug",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "debug.log-history",
+ .voltype = "debug/trace",
+ .option = "log-history",
+ .type = NO_DOC,
+ .op_version = 2
+ },
+ { .key = "debug.log-file",
+ .voltype = "debug/trace",
+ .option = "log-file",
+ .type = NO_DOC,
+ .op_version = 2
+ },
+ { .key = "debug.exclude-ops",
+ .voltype = "debug/trace",
+ .option = "exclude-ops",
+ .type = NO_DOC,
+ .op_version = 2
+ },
+ { .key = "debug.include-ops",
+ .voltype = "debug/trace",
+ .option = "include-ops",
+ .type = NO_DOC,
+ .op_version = 2
+ },
+ { .key = "debug.error-gen",
+ .voltype = "debug/error-gen",
+ .option = "!debug",
+ .value = "off",
+ .type = NO_DOC,
+ .op_version = 1,
+ .flags = OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "debug.error-failure",
+ .voltype = "debug/error-gen",
+ .option = "failure",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+ { .key = "debug.error-number",
+ .voltype = "debug/error-gen",
+ .option = "error-no",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+ { .key = "debug.random-failure",
+ .voltype = "debug/error-gen",
+ .option = "random-failure",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+ { .key = "debug.error-fops",
+ .voltype = "debug/error-gen",
+ .option = "enable",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+
+
+ /* NFS xlator options */
+ { .key = "nfs.enable-ino32",
+ .voltype = "nfs/server",
+ .option = "nfs.enable-ino32",
+ .type = GLOBAL_DOC,
+ .op_version = 1
+ },
+ { .key = "nfs.mem-factor",
+ .voltype = "nfs/server",
+ .option = "nfs.mem-factor",
+ .type = GLOBAL_DOC,
+ .op_version = 1
+ },
+ { .key = "nfs.export-dirs",
+ .voltype = "nfs/server",
+ .option = "nfs3.export-dirs",
+ .type = GLOBAL_DOC,
+ .op_version = 1
+ },
+ { .key = "nfs.export-volumes",
+ .voltype = "nfs/server",
+ .option = "nfs3.export-volumes",
+ .type = GLOBAL_DOC,
+ .op_version = 1
+ },
+ { .key = "nfs.addr-namelookup",
+ .voltype = "nfs/server",
+ .option = "rpc-auth.addr.namelookup",
+ .type = GLOBAL_DOC,
+ .op_version = 1
+ },
+ { .key = "nfs.dynamic-volumes",
+ .voltype = "nfs/server",
+ .option = "nfs.dynamic-volumes",
+ .type = GLOBAL_DOC,
+ .op_version = 1
+ },
+ { .key = "nfs.register-with-portmap",
+ .voltype = "nfs/server",
+ .option = "rpc.register-with-portmap",
+ .type = GLOBAL_DOC,
+ .op_version = 1
+ },
+ { .key = "nfs.outstanding-rpc-limit",
+ .voltype = "nfs/server",
+ .option = "rpc.outstanding-rpc-limit",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
+ { .key = "nfs.port",
+ .voltype = "nfs/server",
+ .option = "nfs.port",
+ .type = GLOBAL_DOC,
+ .op_version = 1
+ },
+ { .key = "nfs.rpc-auth-unix",
+ .voltype = "nfs/server",
+ .option = "!rpc-auth.auth-unix.*",
+ .op_version = 1
+ },
+ { .key = "nfs.rpc-auth-null",
+ .voltype = "nfs/server",
+ .option = "!rpc-auth.auth-null.*",
+ .op_version = 1
+ },
+ { .key = "nfs.rpc-auth-allow",
+ .voltype = "nfs/server",
+ .option = "!rpc-auth.addr.*.allow",
+ .op_version = 1
+ },
+ { .key = "nfs.rpc-auth-reject",
+ .voltype = "nfs/server",
+ .option = "!rpc-auth.addr.*.reject",
+ .op_version = 1
+ },
+ { .key = "nfs.ports-insecure",
+ .voltype = "nfs/server",
+ .option = "!rpc-auth.ports.*.insecure",
+ .op_version = 1
+ },
+ { .key = "nfs.transport-type",
+ .voltype = "nfs/server",
+ .option = "!nfs.transport-type",
+ .value = "tcp",
+ .op_version = 1,
+ .description = "Specifies the nfs transport type. Valid "
+ "transport types are 'tcp' and 'rdma'."
+ },
+ { .key = "nfs.trusted-sync",
+ .voltype = "nfs/server",
+ .option = "!nfs3.*.trusted-sync",
+ .op_version = 1
+ },
+ { .key = "nfs.trusted-write",
+ .voltype = "nfs/server",
+ .option = "!nfs3.*.trusted-write",
+ .op_version = 1
+ },
+ { .key = "nfs.volume-access",
+ .voltype = "nfs/server",
+ .option = "!nfs3.*.volume-access",
+ .op_version = 1
+ },
+ { .key = "nfs.export-dir",
+ .voltype = "nfs/server",
+ .option = "!nfs3.*.export-dir",
+ .op_version = 1
+ },
+ { .key = NFS_DISABLE_MAP_KEY,
+ .voltype = "nfs/server",
+ .option = "!nfs-disable",
+ .op_version = 1
+ },
+ { .key = "nfs.nlm",
+ .voltype = "nfs/server",
+ .option = "nfs.nlm",
+ .type = GLOBAL_DOC,
+ .op_version = 1
+ },
+ { .key = "nfs.acl",
+ .voltype = "nfs/server",
+ .option = "nfs.acl",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
+ { .key = "nfs.mount-udp",
+ .voltype = "nfs/server",
+ .option = "nfs.mount-udp",
+ .type = GLOBAL_DOC,
+ .op_version = 1
+ },
+ { .key = "nfs.mount-rmtab",
+ .voltype = "nfs/server",
+ .option = "nfs.mount-rmtab",
+ .type = GLOBAL_DOC,
+ .op_version = 1
+ },
+ { .key = "nfs.rpc-statd",
+ .voltype = "nfs/server",
+ .option = "nfs.rpc-statd",
+ .type = NO_DOC,
+ .op_version = 4,
+ },
+ { .key = "nfs.log-level",
+ .voltype = "nfs/server",
+ .option = "nfs.log-level",
+ .type = NO_DOC,
+ .op_version = 4,
+ },
+ { .key = "nfs.server-aux-gids",
+ .voltype = "nfs/server",
+ .option = "nfs.server-aux-gids",
+ .type = NO_DOC,
+ .op_version = 2
+ },
+ { .key = "nfs.drc",
+ .voltype = "nfs/server",
+ .option = "nfs.drc",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
+ { .key = "nfs.drc-size",
+ .voltype = "nfs/server",
+ .option = "nfs.drc-size",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
+ { .key = "nfs.read-size",
+ .voltype = "nfs/server",
+ .option = "nfs3.read-size",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
+ { .key = "nfs.write-size",
+ .voltype = "nfs/server",
+ .option = "nfs3.write-size",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
+ { .key = "nfs.readdir-size",
+ .voltype = "nfs/server",
+ .option = "nfs3.readdir-size",
+ .type = GLOBAL_DOC,
+ .op_version = 3
+ },
+
+ /* Other options which don't fit any place above */
+ { .key = "features.read-only",
+ .voltype = "features/read-only",
+ .option = "!read-only",
+ .value = "off",
+ .op_version = 1,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "features.worm",
+ .voltype = "features/worm",
+ .option = "!worm",
+ .value = "off",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_XLATOR_OPT
+ },
+ { .key = "storage.linux-aio",
+ .voltype = "storage/posix",
+ .op_version = 1
+ },
+ { .key = "storage.batch-fsync-mode",
+ .voltype = "storage/posix",
+ .op_version = 3
+ },
+ { .key = "storage.batch-fsync-delay-usec",
+ .voltype = "storage/posix",
+ .op_version = 3
+ },
+ { .key = "storage.xattr-user-namespace-mode",
+ .voltype = "storage/posix",
+ .op_version = 4
+ },
+ { .key = "storage.owner-uid",
+ .voltype = "storage/posix",
+ .option = "brick-uid",
+ .op_version = 1
+ },
+ { .key = "storage.owner-gid",
+ .voltype = "storage/posix",
+ .option = "brick-gid",
+ .op_version = 1
+ },
+ { .key = "storage.node-uuid-pathinfo",
+ .voltype = "storage/posix",
+ .op_version = 3
+ },
+ { .key = "storage.health-check-interval",
+ .voltype = "storage/posix",
+ .op_version = 3
+ },
+ { .option = "update-link-count-parent",
+ .key = "storage.build-pgfid",
+ .voltype = "storage/posix",
+ .op_version = 4
+ },
+ { .key = "storage.bd-aio",
+ .voltype = "storage/bd",
+ .op_version = 3
+ },
+ { .key = "config.memory-accounting",
+ .voltype = "configuration",
+ .option = "!config",
+ .op_version = 2,
+ .flags = OPT_FLAG_CLIENT_OPT
+ },
+ { .key = "config.transport",
+ .voltype = "configuration",
+ .option = "!config",
+ .op_version = 2
+ },
+ { .key = GLUSTERD_QUORUM_TYPE_KEY,
+ .voltype = "mgmt/glusterd",
+ .value = "off",
+ .op_version = 2
+ },
+ { .key = GLUSTERD_QUORUM_RATIO_KEY,
+ .voltype = "mgmt/glusterd",
+ .value = "0",
+ .op_version = 2
+ },
+ /* changelog translator - global tunables */
+ { .key = "changelog.changelog",
+ .voltype = "features/changelog",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+ { .key = "changelog.changelog-dir",
+ .voltype = "features/changelog",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+ { .key = "changelog.encoding",
+ .voltype = "features/changelog",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+ { .key = "changelog.rollover-time",
+ .voltype = "features/changelog",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+ { .key = "changelog.fsync-interval",
+ .voltype = "features/changelog",
+ .type = NO_DOC,
+ .op_version = 3
+ },
+ { .key = "features.barrier",
+ .voltype = "features/barrier",
+ .value = "disable",
+ .op_version = 4
+ },
+ { .key = "features.barrier-timeout",
+ .voltype = "features/barrier",
+ .value = "120",
+ .op_version = 4
+ },
+ { .key = NULL
+ }
+};
diff --git a/xlators/mgmt/glusterd/src/glusterd.c b/xlators/mgmt/glusterd/src/glusterd.c
index 164009bfe..4d09d7fd9 100644
--- a/xlators/mgmt/glusterd/src/glusterd.c
+++ b/xlators/mgmt/glusterd/src/glusterd.c
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2006-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -30,12 +30,14 @@
#include "dict.h"
#include "compat.h"
#include "compat-errno.h"
+#include "syscall.h"
#include "statedump.h"
#include "glusterd-sm.h"
#include "glusterd-op-sm.h"
#include "glusterd-store.h"
#include "glusterd-hooks.h"
#include "glusterd-utils.h"
+#include "glusterd-locks.h"
#include "common-utils.h"
#include "run.h"
@@ -43,14 +45,19 @@
#include "glusterd-mountbroker.h"
-static uuid_t glusterd_uuid;
extern struct rpcsvc_program gluster_handshake_prog;
+extern struct rpcsvc_program gluster_cli_getspec_prog;
extern struct rpcsvc_program gluster_pmap_prog;
extern glusterd_op_info_t opinfo;
extern struct rpcsvc_program gd_svc_mgmt_prog;
+extern struct rpcsvc_program gd_svc_mgmt_v3_prog;
extern struct rpcsvc_program gd_svc_peer_prog;
extern struct rpcsvc_program gd_svc_cli_prog;
+extern struct rpcsvc_program gd_svc_cli_prog_ro;
extern struct rpc_clnt_program gd_brick_prog;
+extern struct rpcsvc_program glusterd_mgmt_hndsk_prog;
+
+extern char snap_mount_folder[PATH_MAX];
rpcsvc_cbk_program_t glusterd_cbk_prog = {
.progname = "Gluster Callback",
@@ -58,6 +65,57 @@ rpcsvc_cbk_program_t glusterd_cbk_prog = {
.progver = GLUSTER_CBK_VERSION,
};
+struct rpcsvc_program *gd_inet_programs[] = {
+ &gd_svc_peer_prog,
+ &gd_svc_cli_prog_ro,
+ &gd_svc_mgmt_prog,
+ &gd_svc_mgmt_v3_prog,
+ &gluster_pmap_prog,
+ &gluster_handshake_prog,
+ &glusterd_mgmt_hndsk_prog,
+};
+int gd_inet_programs_count = (sizeof (gd_inet_programs) /
+ sizeof (gd_inet_programs[0]));
+
+struct rpcsvc_program *gd_uds_programs[] = {
+ &gd_svc_cli_prog,
+ &gluster_cli_getspec_prog,
+};
+int gd_uds_programs_count = (sizeof (gd_uds_programs) /
+ sizeof (gd_uds_programs[0]));
+
+const char *gd_op_list[GD_OP_MAX + 1] = {
+ [GD_OP_NONE] = "Invalid op",
+ [GD_OP_CREATE_VOLUME] = "Create",
+ [GD_OP_START_BRICK] = "Start Brick",
+ [GD_OP_STOP_BRICK] = "Stop Brick",
+ [GD_OP_DELETE_VOLUME] = "Delete",
+ [GD_OP_START_VOLUME] = "Start",
+ [GD_OP_STOP_VOLUME] = "Stop",
+ [GD_OP_DEFRAG_VOLUME] = "Rebalance",
+ [GD_OP_ADD_BRICK] = "Add brick",
+ [GD_OP_REMOVE_BRICK] = "Remove brick",
+ [GD_OP_REPLACE_BRICK] = "Replace brick",
+ [GD_OP_SET_VOLUME] = "Set",
+ [GD_OP_RESET_VOLUME] = "Reset",
+ [GD_OP_SYNC_VOLUME] = "Sync",
+ [GD_OP_LOG_ROTATE] = "Log rotate",
+ [GD_OP_GSYNC_SET] = "Geo-replication",
+ [GD_OP_PROFILE_VOLUME] = "Profile",
+ [GD_OP_QUOTA] = "Quota",
+ [GD_OP_STATUS_VOLUME] = "Status",
+ [GD_OP_REBALANCE] = "Rebalance",
+ [GD_OP_HEAL_VOLUME] = "Heal",
+ [GD_OP_STATEDUMP_VOLUME] = "Statedump",
+ [GD_OP_LIST_VOLUME] = "Lists",
+ [GD_OP_CLEARLOCKS_VOLUME] = "Clear locks",
+ [GD_OP_DEFRAG_BRICK_VOLUME] = "Rebalance",
+ [GD_OP_COPY_FILE] = "Copy File",
+ [GD_OP_SYS_EXEC] = "Execute system commands",
+ [GD_OP_GSYNC_CREATE] = "Geo-replication Create",
+ [GD_OP_SNAP] = "Snapshot",
+ [GD_OP_MAX] = "Invalid op"
+};
static int
glusterd_opinfo_init ()
@@ -69,33 +127,30 @@ glusterd_opinfo_init ()
return ret;
}
+
int
glusterd_uuid_init ()
{
int ret = -1;
+ xlator_t *this = NULL;
glusterd_conf_t *priv = NULL;
- priv = THIS->private;
-
- ret = glusterd_retrieve_uuid ();
- if (ret == 0) {
- uuid_copy (glusterd_uuid, priv->uuid);
- gf_log ("glusterd", GF_LOG_INFO,
- "retrieved UUID: %s", uuid_utoa (priv->uuid));
- return 0;
- }
-
- uuid_generate (glusterd_uuid);
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
- gf_log ("glusterd", GF_LOG_INFO,
- "generated UUID: %s", uuid_utoa (glusterd_uuid));
- uuid_copy (priv->uuid, glusterd_uuid);
+ ret = glusterd_retrieve_uuid ();
+ if (ret == 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "retrieved UUID: %s", uuid_utoa (priv->uuid));
+ return 0;
+ }
- ret = glusterd_store_uuid ();
+ ret = glusterd_uuid_generate_save ();
if (ret) {
gf_log ("glusterd", GF_LOG_ERROR,
- "Unable to store generated UUID");
+ "Unable to generate and save new UUID");
return ret;
}
@@ -103,6 +158,63 @@ glusterd_uuid_init ()
}
int
+glusterd_uuid_generate_save ()
+{
+ int ret = -1;
+ glusterd_conf_t *priv = NULL;
+ xlator_t *this = NULL;
+
+ this = THIS;
+ GF_ASSERT (this);
+ priv = this->private;
+ GF_ASSERT (priv);
+
+ uuid_generate (priv->uuid);
+
+ gf_log (this->name, GF_LOG_INFO, "generated UUID: %s",
+ uuid_utoa (priv->uuid));
+
+ ret = glusterd_store_global_info (this);
+
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Unable to store the generated uuid %s",
+ uuid_utoa (priv->uuid));
+
+ return ret;
+}
+
+int
+glusterd_options_init (xlator_t *this)
+{
+ int ret = -1;
+ glusterd_conf_t *priv = NULL;
+ char *initial_version = "0";
+
+ priv = this->private;
+
+ priv->opts = dict_new ();
+ if (!priv->opts)
+ goto out;
+
+ ret = glusterd_store_retrieve_options (this);
+ if (ret == 0)
+ goto out;
+
+ ret = dict_set_str (priv->opts, GLUSTERD_GLOBAL_OPT_VERSION,
+ initial_version);
+ if (ret)
+ goto out;
+ ret = glusterd_store_options (this, priv->opts);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Unable to store version");
+ return ret;
+ }
+out:
+
+ return 0;
+}
+int
glusterd_fetchspec_notify (xlator_t *this)
{
int ret = -1;
@@ -111,10 +223,15 @@ glusterd_fetchspec_notify (xlator_t *this)
priv = this->private;
- list_for_each_entry (trans, &priv->xprt_list, list) {
- rpcsvc_callback_submit (priv->rpc, trans, &glusterd_cbk_prog,
- GF_CBK_FETCHSPEC, NULL, 0);
+ pthread_mutex_lock (&priv->xprt_lock);
+ {
+ list_for_each_entry (trans, &priv->xprt_list, list) {
+ rpcsvc_callback_submit (priv->rpc, trans,
+ &glusterd_cbk_prog,
+ GF_CBK_FETCHSPEC, NULL, 0);
+ }
}
+ pthread_mutex_unlock (&priv->xprt_lock);
ret = 0;
@@ -172,12 +289,16 @@ glusterd_rpcsvc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
{
INIT_LIST_HEAD (&xprt->list);
+ pthread_mutex_lock (&priv->xprt_lock);
list_add_tail (&xprt->list, &priv->xprt_list);
+ pthread_mutex_unlock (&priv->xprt_lock);
break;
}
case RPCSVC_EVENT_DISCONNECT:
{
+ pthread_mutex_lock (&priv->xprt_lock);
list_del (&xprt->list);
+ pthread_mutex_unlock (&priv->xprt_lock);
pmap_registry_remove (this, 0, NULL, GF_PMAP_PORT_NONE, xprt);
break;
}
@@ -191,7 +312,7 @@ out:
}
-inline int32_t
+static inline int32_t
glusterd_program_register (xlator_t *this, rpcsvc_t *svc,
rpcsvc_program_t *prog)
{
@@ -326,8 +447,8 @@ glusterd_crt_georep_folders (char *georepdir, glusterd_conf_t *conf)
if (strlen (conf->workdir)+2 > PATH_MAX-strlen(GEOREP)) {
ret = -1;
gf_log ("glusterd", GF_LOG_CRITICAL,
- "Unable to create "GEOREP" directory %s",
- georepdir);
+ "directory path %s/"GEOREP" is longer than PATH_MAX",
+ conf->workdir);
goto out;
}
@@ -343,8 +464,8 @@ glusterd_crt_georep_folders (char *georepdir, glusterd_conf_t *conf)
if (strlen (DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP) >= PATH_MAX) {
ret = -1;
gf_log ("glusterd", GF_LOG_CRITICAL,
- "Unable to create "GEOREP" directory %s",
- georepdir);
+ "directory path "DEFAULT_LOG_FILE_DIRECTORY"/"
+ GEOREP" is longer than PATH_MAX");
goto out;
}
ret = mkdir_p (DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP, 0777, _gf_true);
@@ -354,11 +475,12 @@ glusterd_crt_georep_folders (char *georepdir, glusterd_conf_t *conf)
goto out;
}
+ /* Slave log file directory */
if (strlen(DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"-slaves") >= PATH_MAX) {
ret = -1;
gf_log ("glusterd", GF_LOG_CRITICAL,
- "Unable to create "GEOREP" directory %s",
- georepdir);
+ "directory path "DEFAULT_LOG_FILE_DIRECTORY"/"
+ GEOREP"-slaves"" is longer than PATH_MAX");
goto out;
}
ret = mkdir_p (DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"-slaves", 0777,
@@ -369,6 +491,22 @@ glusterd_crt_georep_folders (char *georepdir, glusterd_conf_t *conf)
goto out;
}
+ /* MountBroker log file directory */
+ if (strlen(DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"-slaves/mbr") >= PATH_MAX) {
+ ret = -1;
+ gf_log ("glusterd", GF_LOG_CRITICAL,
+ "directory path "DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP
+ "-slaves/mbr"" is longer than PATH_MAX");
+ goto out;
+ }
+ ret = mkdir_p (DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"-slaves/mbr", 0777,
+ _gf_true);
+ if (-1 == ret) {
+ gf_log ("glusterd", GF_LOG_CRITICAL,
+ "Unable to create "GEOREP" mountbroker slave log directory");
+ goto out;
+ }
+
ret = dict_get_str (THIS->options, GEOREP"-log-group", &greplg_s);
if (ret)
ret = 0;
@@ -386,6 +524,9 @@ glusterd_crt_georep_folders (char *georepdir, glusterd_conf_t *conf)
if (ret == 0)
ret = group_write_allow (DEFAULT_LOG_FILE_DIRECTORY"/"
GEOREP"-slaves", gr->gr_gid);
+ if (ret == 0)
+ ret = group_write_allow (DEFAULT_LOG_FILE_DIRECTORY"/"
+ GEOREP"-slaves/mbr", gr->gr_gid);
}
out:
@@ -398,7 +539,7 @@ runinit_gsyncd_setrx (runner_t *runner, glusterd_conf_t *conf)
{
runinit (runner);
runner_add_args (runner, GSYNCD_PREFIX"/gsyncd", "-c", NULL);
- runner_argprintf (runner, "%s/"GSYNC_CONF,conf->workdir);
+ runner_argprintf (runner, "%s/"GSYNC_CONF_TEMPLATE, conf->workdir);
runner_add_arg (runner, "--config-set-rx");
}
@@ -447,8 +588,8 @@ configure_syncdaemon (glusterd_conf_t *conf)
RUN_GSYNCD_CMD;
runinit_gsyncd_setrx (&runner, conf);
- runner_add_args (&runner, "remote-gsyncd",
- "/usr/local/libexec/glusterfs/gsyncd", ".", "^ssh:", NULL);
+ runner_add_args (&runner, "remote-gsyncd", "/nonexistent/gsyncd",
+ ".", "^ssh:", NULL);
RUN_GSYNCD_CMD;
/* gluster-command-dir */
@@ -460,7 +601,7 @@ configure_syncdaemon (glusterd_conf_t *conf)
/* gluster-params */
runinit_gsyncd_setrx (&runner, conf);
runner_add_args (&runner, "gluster-params",
- "xlator-option=*-dht.assert-no-child-down=true",
+ "aux-gfid-mount",
".", ".", NULL);
RUN_GSYNCD_CMD;
@@ -474,17 +615,50 @@ configure_syncdaemon (glusterd_conf_t *conf)
runner_add_args (&runner, ".", ".", NULL);
RUN_GSYNCD_CMD;
+ /* ssh-command tar */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_arg (&runner, "ssh-command-tar");
+ runner_argprintf (&runner,
+ "ssh -oPasswordAuthentication=no "
+ "-oStrictHostKeyChecking=no "
+ "-i %s/tar_ssh.pem", georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
/* pid-file */
runinit_gsyncd_setrx (&runner, conf);
runner_add_arg (&runner, "pid-file");
- runner_argprintf (&runner, "%s/${mastervol}/${eSlave}.pid", georepdir);
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.pid", georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* geo-rep working dir */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_arg (&runner, "georep-session-working-dir");
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/", georepdir);
runner_add_args (&runner, ".", ".", NULL);
RUN_GSYNCD_CMD;
/* state-file */
runinit_gsyncd_setrx (&runner, conf);
runner_add_arg (&runner, "state-file");
- runner_argprintf (&runner, "%s/${mastervol}/${eSlave}.status", georepdir);
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}.status", georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* state-detail-file */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_arg (&runner, "state-detail-file");
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}-detail.status",
+ georepdir);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* state-detail-file */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_arg (&runner, "state-detail-file");
+ runner_argprintf (&runner, "%s/${mastervol}_${remotehost}_${slavevol}/${eSlave}-detail.status",
+ georepdir);
runner_add_args (&runner, ".", ".", NULL);
RUN_GSYNCD_CMD;
@@ -495,6 +669,11 @@ configure_syncdaemon (glusterd_conf_t *conf)
runner_add_args (&runner, ".", ".", NULL);
RUN_GSYNCD_CMD;
+ /* socketdir */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_args (&runner, "socketdir", GLUSTERD_SOCK_DIR, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
/* log-file */
runinit_gsyncd_setrx (&runner, conf);
runner_add_args (&runner,
@@ -507,10 +686,32 @@ configure_syncdaemon (glusterd_conf_t *conf)
runinit_gsyncd_setrx (&runner, conf);
runner_add_args (&runner,
"gluster-log-file",
- DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/${mastervol}/${eSlave}.gluster.log",
+ DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"/${mastervol}/${eSlave}${local_id}.gluster.log",
".", ".", NULL);
RUN_GSYNCD_CMD;
+ /* ignore-deletes */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_args (&runner, "ignore-deletes", "true", ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* special-sync-mode */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_args (&runner, "special-sync-mode", "partial", ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ /* change-detector == changelog */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_args(&runner, "change-detector", "changelog", ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_arg(&runner, "working-dir");
+ runner_argprintf(&runner, "%s/${mastervol}/${eSlave}",
+ DEFAULT_VAR_RUN_DIRECTORY);
+ runner_add_args (&runner, ".", ".", NULL);
+ RUN_GSYNCD_CMD;
+
/************
* slave pre-configuration
************/
@@ -524,7 +725,7 @@ configure_syncdaemon (glusterd_conf_t *conf)
/* gluster-params */
runinit_gsyncd_setrx (&runner, conf);
runner_add_args (&runner, "gluster-params",
- "xlator-option=*-dht.assert-no-child-down=true",
+ "aux-gfid-mount",
".", NULL);
RUN_GSYNCD_CMD;
@@ -536,6 +737,14 @@ configure_syncdaemon (glusterd_conf_t *conf)
".", NULL);
RUN_GSYNCD_CMD;
+ /* MountBroker log-file */
+ runinit_gsyncd_setrx (&runner, conf);
+ runner_add_args (&runner,
+ "log-file-mbr",
+ DEFAULT_LOG_FILE_DIRECTORY"/"GEOREP"-slaves/mbr/${session_owner}:${eSlave}.log",
+ ".", NULL);
+ RUN_GSYNCD_CMD;
+
/* gluster-log-file */
runinit_gsyncd_setrx (&runner, conf);
runner_add_args (&runner,
@@ -596,7 +805,7 @@ check_prepare_mountbroker_root (char *mountbroker_root)
dfd0 = dup (dfd);
for (;;) {
- ret = openat (dfd, "..", O_RDONLY);
+ ret = sys_openat (dfd, "..", O_RDONLY);
if (ret != -1) {
dfd2 = ret;
ret = fstat (dfd2, &st2);
@@ -631,11 +840,11 @@ check_prepare_mountbroker_root (char *mountbroker_root)
st = st2;
}
- ret = mkdirat (dfd0, MB_HIVE, 0711);
+ ret = sys_mkdirat (dfd0, MB_HIVE, 0711);
if (ret == -1 && errno == EEXIST)
ret = 0;
if (ret != -1)
- ret = fstatat (dfd0, MB_HIVE, &st, AT_SYMLINK_NOFOLLOW);
+ ret = sys_fstatat (dfd0, MB_HIVE, &st, AT_SYMLINK_NOFOLLOW);
if (ret == -1 || st.st_mode != (S_IFDIR|0711)) {
gf_log ("", GF_LOG_ERROR,
"failed to set up mountbroker-root directory %s",
@@ -732,32 +941,226 @@ _install_mount_spec (dict_t *opts, char *key, data_t *value, void *data)
"adding %smount spec failed: label: %s desc: %s",
georep ? GEOREP" " : "", label, pdesc);
+ if (mspec) {
+ if (mspec->patterns) {
+ GF_FREE (mspec->patterns->components);
+ GF_FREE (mspec->patterns);
+ }
+ GF_FREE (mspec);
+ }
+
return -1;
}
+
static int
-glusterd_default_synctask_cbk (int ret, call_frame_t *frame, void *opaque)
+gd_default_synctask_cbk (int ret, call_frame_t *frame, void *opaque)
+{
+ glusterd_conf_t *priv = THIS->private;
+ synclock_unlock (&priv->big_lock);
+ return ret;
+}
+
+static void
+glusterd_launch_synctask (synctask_fn_t fn, void *opaque)
+{
+ xlator_t *this = NULL;
+ glusterd_conf_t *priv = NULL;
+ int ret = -1;
+
+ this = THIS;
+ priv = this->private;
+
+ synclock_lock (&priv->big_lock);
+ ret = synctask_new (this->ctx->env, fn, gd_default_synctask_cbk, NULL,
+ opaque);
+ if (ret)
+ gf_log (this->name, GF_LOG_CRITICAL, "Failed to spawn bricks"
+ " and other volume related services");
+}
+
+int
+glusterd_uds_rpcsvc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
+ void *data)
{
- return ret;
+ /* glusterd_rpcsvc_notify() does stuff that calls coming in from the
+ * unix domain socket don't need. This is just an empty function to be
+ * used for the uds listener. This will be used later if required.
+ */
+ return 0;
+}
+
+/* The glusterd unix domain socket listener only listens for cli */
+rpcsvc_t *
+glusterd_init_uds_listener (xlator_t *this)
+{
+ int ret = -1;
+ dict_t *options = NULL;
+ rpcsvc_t *rpc = NULL;
+ data_t *sock_data = NULL;
+ char sockfile[PATH_MAX+1] = {0,};
+ int i = 0;
+
+
+ GF_ASSERT (this);
+
+ sock_data = dict_get (this->options, "glusterd-sockfile");
+ if (!sock_data) {
+ strncpy (sockfile, DEFAULT_GLUSTERD_SOCKFILE, PATH_MAX);
+ } else {
+ strncpy (sockfile, sock_data->data, PATH_MAX);
+ }
+
+ options = dict_new ();
+ if (!options)
+ goto out;
+
+ ret = rpcsvc_transport_unix_options_build (&options, sockfile);
+ if (ret)
+ goto out;
+
+ rpc = rpcsvc_init (this, this->ctx, options, 8);
+ if (rpc == NULL) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = rpcsvc_register_notify (rpc, glusterd_uds_rpcsvc_notify,
+ this);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Failed to register notify function");
+ goto out;
+ }
+
+ ret = rpcsvc_create_listeners (rpc, options, this->name);
+ if (ret != 1) {
+ gf_log (this->name, GF_LOG_DEBUG, "Failed to create listener");
+ goto out;
+ }
+ ret = 0;
+
+ for (i = 0; i < gd_uds_programs_count; i++) {
+ ret = glusterd_program_register (this, rpc, gd_uds_programs[i]);
+ if (ret) {
+ i--;
+ for (; i >= 0; i--)
+ rpcsvc_program_unregister (rpc,
+ gd_uds_programs[i]);
+
+ goto out;
+ }
+ }
+
+out:
+ if (options)
+ dict_unref (options);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to start glusterd "
+ "unix domain socket listener.");
+ if (rpc) {
+ GF_FREE (rpc);
+ rpc = NULL;
+ }
+ }
+ return rpc;
+}
+
+void
+glusterd_stop_uds_listener (xlator_t *this)
+{
+ glusterd_conf_t *conf = NULL;
+ rpcsvc_listener_t *listener = NULL;
+ rpcsvc_listener_t *next = NULL;
+
+ GF_ASSERT (this);
+ conf = this->private;
+
+ (void) rpcsvc_program_unregister (conf->uds_rpc, &gd_svc_cli_prog);
+ (void) rpcsvc_program_unregister (conf->uds_rpc, &gluster_handshake_prog);
+
+ list_for_each_entry_safe (listener, next, &conf->uds_rpc->listeners,
+ list) {
+ rpcsvc_listener_destroy (listener);
+ }
+
+ (void) rpcsvc_unregister_notify (conf->uds_rpc, glusterd_rpcsvc_notify,
+ this);
+
+ unlink (DEFAULT_GLUSTERD_SOCKFILE);
+
+ GF_FREE (conf->uds_rpc);
+ conf->uds_rpc = NULL;
+
+ return;
}
static int
-glusterd_launch_synctask (xlator_t *this, synctask_fn_t fn)
+glusterd_init_snap_folder (xlator_t *this)
{
- glusterd_conf_t *priv = NULL;
- int ret = -1;
+ int ret = -1;
+ struct stat buf = {0,};
+
+ GF_ASSERT (this);
+
+ /* Snapshot volumes are mounted under /var/run/gluster/snaps folder.
+ * But /var/run is normally a symbolic link to /run folder, which
+ * creates problems as the entry point in the mtab for the mount point
+ * and glusterd maintained entry point will be different. Therefore
+ * identify the correct run folder and use it for snap volume mounting.
+ */
+ ret = lstat (GLUSTERD_VAR_RUN_DIR, &buf);
+ if (ret != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "stat fails on %s, exiting. (errno = %d)",
+ GLUSTERD_VAR_RUN_DIR, errno);
+ goto out;
+ }
- priv = this->private;
+ /* If /var/run is symlink then use /run folder */
+ if (S_ISLNK (buf.st_mode)) {
+ strcpy (snap_mount_folder, GLUSTERD_RUN_DIR);
+ } else {
+ strcpy (snap_mount_folder, GLUSTERD_VAR_RUN_DIR);
+ }
+
+ strcat (snap_mount_folder, GLUSTERD_DEFAULT_SNAPS_BRICK_DIR);
- ret = synctask_new (this->ctx->env, fn,
- glusterd_default_synctask_cbk, NULL, priv);
+ ret = stat (snap_mount_folder, &buf);
+ if ((ret != 0) && (ENOENT != errno)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "stat fails on %s, exiting. (errno = %d)",
+ snap_mount_folder, errno);
+ ret = -1;
+ goto out;
+ }
- if (ret)
- gf_log (this->name, GF_LOG_CRITICAL, "Failed to create synctask"
- "for starting process");
- return ret;
+ if ((!ret) && (!S_ISDIR(buf.st_mode))) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Provided snap path %s is not a directory,"
+ "exiting", snap_mount_folder);
+ ret = -1;
+ goto out;
+ }
+
+ if ((-1 == ret) && (ENOENT == errno)) {
+ /* Create missing folders */
+ ret = mkdir_p (snap_mount_folder, 0777, _gf_false);
+
+ if (-1 == ret) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Unable to create directory %s"
+ " ,errno = %d", snap_mount_folder, errno);
+ goto out;
+ }
+ }
+
+out:
+ return ret;
}
+
/*
* init - called during glusterd initialization
*
@@ -769,6 +1172,7 @@ init (xlator_t *this)
{
int32_t ret = -1;
rpcsvc_t *rpc = NULL;
+ rpcsvc_t *uds_rpc = NULL;
glusterd_conf_t *conf = NULL;
data_t *dir_data = NULL;
struct stat buf = {0,};
@@ -778,10 +1182,9 @@ init (xlator_t *this)
char cmd_log_filename [PATH_MAX] = {0,};
int first_time = 0;
char *mountbroker_root = NULL;
-
-#ifdef DEBUG
+ int i = 0;
char *valgrind_str = NULL;
-#endif
+
dir_data = dict_get (this->options, "working-directory");
if (!dir_data) {
@@ -795,7 +1198,7 @@ init (xlator_t *this)
if ((ret != 0) && (ENOENT != errno)) {
gf_log (this->name, GF_LOG_ERROR,
"stat fails on %s, exiting. (errno = %d)",
- workdir, errno);
+ workdir, errno);
exit (1);
}
@@ -820,9 +1223,18 @@ init (xlator_t *this)
first_time = 1;
}
+ setenv ("GLUSTERD_WORKING_DIR", workdir, 1);
gf_log (this->name, GF_LOG_INFO, "Using %s as working directory",
workdir);
+ ret = glusterd_init_snap_folder (this);
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL, "Unable to create "
+ "snap backend folder");
+ exit (1);
+ }
+
snprintf (cmd_log_filename, PATH_MAX,"%s/.cmd_log_history",
DEFAULT_LOG_FILE_DIRECTORY);
ret = gf_cmd_log_init (cmd_log_filename);
@@ -844,6 +1256,17 @@ init (xlator_t *this)
exit (1);
}
+ snprintf (storedir, PATH_MAX, "%s/snaps", workdir);
+
+ ret = mkdir (storedir, 0777);
+
+ if ((-1 == ret) && (errno != EEXIST)) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Unable to create snaps directory %s"
+ " ,errno = %d", storedir, errno);
+ exit (1);
+ }
+
snprintf (storedir, PATH_MAX, "%s/peers", workdir);
ret = mkdir (storedir, 0777);
@@ -882,6 +1305,15 @@ init (xlator_t *this)
exit (1);
}
+ snprintf (storedir, PATH_MAX, "%s/quotad", workdir);
+ ret = mkdir (storedir, 0777);
+ if ((-1 == ret) && (errno != EEXIST)) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Unable to create quotad directory %s"
+ " ,errno = %d", storedir, errno);
+ exit (1);
+ }
+
snprintf (storedir, PATH_MAX, "%s/groups", workdir);
ret = mkdir (storedir, 0777);
if ((-1 == ret) && (errno != EEXIST)) {
@@ -920,58 +1352,54 @@ init (xlator_t *this)
goto out;
}
- ret = glusterd_program_register (this, rpc, &gd_svc_peer_prog);
- if (ret) {
- goto out;
- }
-
- ret = glusterd_program_register (this, rpc, &gd_svc_cli_prog);
- if (ret) {
- rpcsvc_program_unregister (rpc, &gd_svc_peer_prog);
- goto out;
- }
-
- ret = glusterd_program_register (this, rpc, &gd_svc_mgmt_prog);
- if (ret) {
- rpcsvc_program_unregister (rpc, &gd_svc_peer_prog);
- rpcsvc_program_unregister (rpc, &gd_svc_cli_prog);
- goto out;
- }
+ for (i = 0; i < gd_inet_programs_count; i++) {
+ ret = glusterd_program_register (this, rpc,
+ gd_inet_programs[i]);
+ if (ret) {
+ i--;
+ for (; i >= 0; i--)
+ rpcsvc_program_unregister (rpc,
+ gd_inet_programs[i]);
- ret = glusterd_program_register (this, rpc, &gluster_pmap_prog);
- if (ret) {
- rpcsvc_program_unregister (rpc, &gd_svc_peer_prog);
- rpcsvc_program_unregister (rpc, &gd_svc_cli_prog);
- rpcsvc_program_unregister (rpc, &gd_svc_mgmt_prog);
- goto out;
+ goto out;
+ }
}
- ret = glusterd_program_register (this, rpc, &gluster_handshake_prog);
- if (ret) {
- rpcsvc_program_unregister (rpc, &gd_svc_peer_prog);
- rpcsvc_program_unregister (rpc, &gluster_pmap_prog);
- rpcsvc_program_unregister (rpc, &gd_svc_cli_prog);
- rpcsvc_program_unregister (rpc, &gd_svc_mgmt_prog);
+ /* Start a unix domain socket listener just for cli commands
+ * This should prevent ports from being wasted by being in TIMED_WAIT
+ * when cli commands are done continuously
+ */
+ uds_rpc = glusterd_init_uds_listener (this);
+ if (uds_rpc == NULL) {
+ ret = -1;
goto out;
}
conf = GF_CALLOC (1, sizeof (glusterd_conf_t),
gf_gld_mt_glusterd_conf_t);
GF_VALIDATE_OR_GOTO(this->name, conf, out);
- conf->shd = GF_CALLOC (1, sizeof (nodesrv_t),
- gf_gld_mt_nodesrv_t);
+
+ conf->shd = GF_CALLOC (1, sizeof (nodesrv_t), gf_gld_mt_nodesrv_t);
GF_VALIDATE_OR_GOTO(this->name, conf->shd, out);
- conf->nfs = GF_CALLOC (1, sizeof (nodesrv_t),
- gf_gld_mt_nodesrv_t);
+ conf->nfs = GF_CALLOC (1, sizeof (nodesrv_t), gf_gld_mt_nodesrv_t);
GF_VALIDATE_OR_GOTO(this->name, conf->nfs, out);
+ conf->quotad = GF_CALLOC (1, sizeof (nodesrv_t),
+ gf_gld_mt_nodesrv_t);
+ GF_VALIDATE_OR_GOTO(this->name, conf->quotad, out);
INIT_LIST_HEAD (&conf->peers);
INIT_LIST_HEAD (&conf->volumes);
+ INIT_LIST_HEAD (&conf->snapshots);
+ INIT_LIST_HEAD (&conf->missed_snaps_list);
+
pthread_mutex_init (&conf->mutex, NULL);
conf->rpc = rpc;
+ conf->uds_rpc = uds_rpc;
conf->gfs_mgmt = &gd_brick_prog;
strncpy (conf->workdir, workdir, PATH_MAX);
+ synclock_init (&conf->big_lock);
+ pthread_mutex_init (&conf->xprt_lock, NULL);
INIT_LIST_HEAD (&conf->xprt_list);
glusterd_friend_sm_init ();
@@ -984,8 +1412,13 @@ init (xlator_t *this)
if (ret)
goto out;
+ conf->base_port = GF_IANA_PRIV_PORTS_START;
+ if (dict_get_uint32(this->options, "base-port", &conf->base_port) == 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "base-port override: %d", conf->base_port);
+ }
+
/* Set option to run bricks on valgrind if enabled in glusterd.vol */
-#ifdef DEBUG
conf->valgrind = _gf_false;
ret = dict_get_str (this->options, "run-with-valgrind", &valgrind_str);
if (ret < 0) {
@@ -998,10 +1431,11 @@ init (xlator_t *this)
"run-with-valgrind value not a boolean string");
}
}
-#endif
this->private = conf;
- (void) glusterd_nodesvc_set_running ("glustershd", _gf_false);
+ glusterd_mgmt_v3_lock_init ();
+ glusterd_txn_opinfo_dict_init ();
+ (void) glusterd_nodesvc_set_online_status ("glustershd", _gf_false);
GLUSTERD_GET_HOOKS_DIR (hooks_dir, GLUSTERD_HOOK_VER, conf);
if (stat (hooks_dir, &buf)) {
@@ -1035,17 +1469,35 @@ init (xlator_t *this)
if (ret < 0)
goto out;
+ /* If there are no 'friends', this would be the best time to
+ * spawn process/bricks that may need (re)starting since last
+ * time (this) glusterd was up.*/
+
+ if (list_empty (&conf->peers)) {
+ glusterd_launch_synctask (glusterd_spawn_daemons, NULL);
+ gf_log (this->name, GF_LOG_INFO,
+ "no peers, should start FRESH etcd");
+ /*
+ * We might not have any peers now, but if we did once before
+ * then we don't want to start up with a config that still has
+ * references to them.
+ */
+ nuke_etcd_dir();
+ }
+ else {
+ gf_log (this->name, GF_LOG_INFO,
+ "have peers, should start etcd with old config");
+ }
+ conf->etcd_pid = start_etcd(uuid_utoa(MY_UUID),NULL);
+
+ ret = glusterd_options_init (this);
+ if (ret < 0)
+ goto out;
+
ret = glusterd_handle_upgrade_downgrade (this->options, conf);
if (ret)
goto out;
- glusterd_launch_synctask (this,
- (synctask_fn_t) glusterd_restart_bricks);
- glusterd_launch_synctask (this,
- (synctask_fn_t) glusterd_restart_gsyncds);
- glusterd_launch_synctask (this,
- (synctask_fn_t) glusterd_restart_rebalance);
-
ret = glusterd_hooks_spawn_worker (this);
if (ret)
goto out;
@@ -1081,11 +1533,19 @@ fini (xlator_t *this)
goto out;
conf = this->private;
+
+ glusterd_stop_uds_listener (this);
+ stop_etcd(conf->etcd_pid);
+ nuke_etcd_dir();
+
FREE (conf->pmap);
if (conf->handle)
- glusterd_store_handle_destroy (conf->handle);
+ gf_store_handle_destroy (conf->handle);
glusterd_sm_tr_log_delete (&conf->op_sm_log);
+ glusterd_mgmt_v3_lock_fini ();
+ glusterd_txn_opinfo_dict_fini ();
GF_FREE (conf);
+
this->private = NULL;
out:
return;
@@ -1123,11 +1583,9 @@ notify (xlator_t *this, int32_t event, void *data, ...)
}
-struct xlator_fops fops = {
-};
+struct xlator_fops fops;
-struct xlator_cbks cbks = {
-};
+struct xlator_cbks cbks;
struct xlator_dumpops dumpops = {
.priv = glusterd_priv,
@@ -1174,10 +1632,37 @@ struct volume_options options[] = {
{ .key = {GEOREP"-log-group"},
.type = GF_OPTION_TYPE_ANY,
},
-#ifdef DEBUG
{ .key = {"run-with-valgrind"},
.type = GF_OPTION_TYPE_BOOL,
},
-#endif
+ { .key = {"server-quorum-type"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = { "none", "server"},
+ .description = "This feature is on the server-side i.e. in glusterd."
+ " Whenever the glusterd on a machine observes that "
+ "the quorum is not met, it brings down the bricks to "
+ "prevent data split-brains. When the network "
+ "connections are brought back up and the quorum is "
+ "restored the bricks in the volume are brought back "
+ "up."
+ },
+ { .key = {"server-quorum-ratio"},
+ .type = GF_OPTION_TYPE_PERCENT,
+ .description = "Sets the quorum percentage for the trusted "
+ "storage pool."
+ },
+ { .key = {"glusterd-sockfile"},
+ .type = GF_OPTION_TYPE_PATH,
+ .description = "The socket file on which glusterd should listen for "
+ "cli requests. Default is "DEFAULT_GLUSTERD_SOCKFILE "."
+ },
+ { .key = {"base-port"},
+ .type = GF_OPTION_TYPE_INT,
+ .description = "Sets the base port for portmap query"
+ },
+ { .key = {"snap-brick-path"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "directory where the bricks for the snapshots will be created"
+ },
{ .key = {NULL} },
};
diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h
index b3fe1400a..7157bee64 100644
--- a/xlators/mgmt/glusterd/src/glusterd.h
+++ b/xlators/mgmt/glusterd/src/glusterd.h
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ Copyright (c) 2006-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
@@ -37,13 +37,47 @@
#include "glusterd-pmap.h"
#include "cli1-xdr.h"
#include "syncop.h"
+#include "store.h"
+#include "glusterd-etcd.h"
#define GLUSTERD_MAX_VOLUME_NAME 1000
-#define DEFAULT_LOG_FILE_DIRECTORY DATADIR "/log/glusterfs"
#define GLUSTERD_TR_LOG_SIZE 50
#define GLUSTERD_NAME "glusterd"
#define GLUSTERD_SOCKET_LISTEN_BACKLOG 128
+#define GLUSTERD_QUORUM_TYPE_KEY "cluster.server-quorum-type"
+#define GLUSTERD_QUORUM_RATIO_KEY "cluster.server-quorum-ratio"
+#define GLUSTERD_GLOBAL_OPT_VERSION "global-option-version"
+#define GLUSTERD_COMMON_PEM_PUB_FILE "/geo-replication/common_secret.pem.pub"
+#define GEO_CONF_MAX_OPT_VALS 6
+#define GLUSTERD_CREATE_HOOK_SCRIPT "/hooks/1/gsync-create/post/" \
+ "S56glusterd-geo-rep-create-post.sh"
+
+
+#define GLUSTERD_SNAPS_MAX_HARD_LIMIT 256
+#define GLUSTERD_SNAPS_DEF_SOFT_LIMIT_PERCENT 90
+#define GLUSTERD_SNAPS_MAX_SOFT_LIMIT_PERCENT 100
+#define GLUSTERD_SERVER_QUORUM "server"
+
+#define FMTSTR_CHECK_VOL_EXISTS "Volume %s does not exist"
+#define FMTSTR_RESOLVE_BRICK "Could not find peer on which brick %s:%s resides"
+
+#define LOGSTR_FOUND_BRICK "Found brick %s:%s in volume %s"
+#define LOGSTR_BUILD_PAYLOAD "Failed to build payload for operation 'Volume %s'"
+#define LOGSTR_STAGE_FAIL "Staging of operation 'Volume %s' failed on %s %s %s"
+#define LOGSTR_COMMIT_FAIL "Commit of operation 'Volume %s' failed on %s %s %s"
+
+#define OPERRSTR_BUILD_PAYLOAD "Failed to build payload. Please check the log "\
+ "file for more details."
+#define OPERRSTR_STAGE_FAIL "Staging failed on %s. Please check the log file " \
+ "for more details."
+#define OPERRSTR_COMMIT_FAIL "Commit failed on %s. Please check the log file "\
+ "for more details."
+struct glusterd_volinfo_;
+typedef struct glusterd_volinfo_ glusterd_volinfo_t;
+
+struct glusterd_snap_;
+typedef struct glusterd_snap_ glusterd_snap_t;
typedef enum glusterd_op_ {
GD_OP_NONE = 0,
@@ -71,72 +105,117 @@ typedef enum glusterd_op_ {
GD_OP_LIST_VOLUME,
GD_OP_CLEARLOCKS_VOLUME,
GD_OP_DEFRAG_BRICK_VOLUME,
+ GD_OP_COPY_FILE,
+ GD_OP_SYS_EXEC,
+ GD_OP_GSYNC_CREATE,
+ GD_OP_SNAP,
GD_OP_MAX,
} glusterd_op_t;
-
-struct glusterd_store_iter_ {
- int fd;
- FILE *file;
- char filepath[PATH_MAX];
-};
-
-typedef struct glusterd_store_iter_ glusterd_store_iter_t;
+extern const char * gd_op_list[];
struct glusterd_volgen {
dict_t *dict;
};
+
typedef struct {
- struct rpc_clnt *rpc;
- gf_boolean_t running;
+ struct rpc_clnt *rpc;
+ gf_boolean_t online;
} nodesrv_t;
typedef struct {
- struct _volfile_ctx *volfile;
- pthread_mutex_t mutex;
- struct list_head peers;
-// struct list_head pending_peers;
- gf_boolean_t verify_volfile_checksum;
- gf_boolean_t trace;
- uuid_t uuid;
- char workdir[PATH_MAX];
- rpcsvc_t *rpc;
- nodesrv_t *shd;
- nodesrv_t *nfs;
- struct pmap_registry *pmap;
- struct list_head volumes;
- struct list_head xprt_list;
- glusterd_store_handle_t *handle;
- gf_timer_t *timer;
- glusterd_sm_tr_log_t op_sm_log;
+ gf_boolean_t quorum;
+ double quorum_ratio;
+ uint64_t gl_opt_version;
+} gd_global_opts_t;
+
+typedef struct {
+ struct _volfile_ctx *volfile;
+ pthread_mutex_t mutex;
+ struct list_head peers;
+ struct list_head xaction_peers;
+ gf_boolean_t verify_volfile_checksum;
+ gf_boolean_t trace;
+ uuid_t uuid;
+ char workdir[PATH_MAX];
+ rpcsvc_t *rpc;
+ nodesrv_t *shd;
+ nodesrv_t *nfs;
+ nodesrv_t *quotad;
+ struct pmap_registry *pmap;
+ struct list_head volumes;
+ struct list_head snapshots; /*List of snap volumes */
+ pthread_mutex_t xprt_lock;
+ struct list_head xprt_list;
+ gf_store_handle_t *handle;
+ gf_timer_t *timer;
+ glusterd_sm_tr_log_t op_sm_log;
struct rpc_clnt_program *gfs_mgmt;
- struct list_head mount_specs;
-#ifdef DEBUG
- gf_boolean_t valgrind;
-#endif
- pthread_t brick_thread;
- void *hooks_priv;
- xlator_t *xl; /* Should be set to 'THIS' before creating thread */
+ dict_t *mgmt_v3_lock; /* Dict for saving
+ * mgmt_v3 locks */
+ dict_t *glusterd_txn_opinfo; /* Dict for saving
+ * transaction opinfos */
+ uuid_t global_txn_id; /* To be used in
+ * heterogeneous
+ * cluster with no
+ * transaction ids */
+
+ struct list_head mount_specs;
+ gf_boolean_t valgrind;
+ pthread_t brick_thread;
+ void *hooks_priv;
+
+ /* need for proper handshake_t */
+ int op_version; /* Starts with 1 for 3.3.0 */
+ xlator_t *xl; /* Should be set to 'THIS' before creating thread */
+ gf_boolean_t pending_quorum_action;
+ dict_t *opts;
+ synclock_t big_lock;
+ gf_boolean_t restart_done;
+ rpcsvc_t *uds_rpc; /* RPCSVC for the unix domain socket */
+ uint32_t base_port;
+ uint64_t snap_max_hard_limit;
+ uint64_t snap_max_soft_limit;
+ char *snap_bricks_directory;
+ gf_store_handle_t *missed_snaps_list_shandle;
+ struct list_head missed_snaps_list;
+ pid_t etcd_pid;
} glusterd_conf_t;
+
typedef enum gf_brick_status {
GF_BRICK_STOPPED,
GF_BRICK_STARTED,
} gf_brick_status_t;
struct glusterd_brickinfo {
- char hostname[1024];
- char path[PATH_MAX];
- struct list_head brick_list;
- uuid_t uuid;
- int port;
- int rdma_port;
- char *logfile;
- gf_boolean_t signed_in;
- glusterd_store_handle_t *shandle;
- gf_brick_status_t status;
- struct rpc_clnt *rpc;
- int decommissioned;
+ char hostname[1024];
+ char path[PATH_MAX];
+ char device_path[PATH_MAX];
+ char brick_id[1024];/*Client xlator name, AFR changelog name*/
+ struct list_head brick_list;
+ uuid_t uuid;
+ int port;
+ int rdma_port;
+ char *logfile;
+ gf_boolean_t signed_in;
+ gf_store_handle_t *shandle;
+ gf_brick_status_t status;
+ struct rpc_clnt *rpc;
+ int decommissioned;
+ char vg[PATH_MAX]; /* FIXME: Use max size for length of vg */
+ int caps; /* Capability */
+ int32_t snap_status;
+ /*
+ * The group is used to identify which bricks are part of the same
+ * replica set during brick-volfile generation, so that NSR volfiles
+ * can "cross-connect" the bricks to one another. This same approach
+ * could be used to make client-volfile generation much simpler and
+ * more efficient too, though it would require some further adaptation
+ * to support more than one layer of hierarchy.
+ */
+ uint16_t group;
+ uuid_t nsr_uuid;
};
typedef struct glusterd_brickinfo glusterd_brickinfo_t;
@@ -147,9 +226,6 @@ struct gf_defrag_brickinfo_ {
int size;
};
-struct glusterd_volinfo_;
-typedef struct glusterd_volinfo_ glusterd_volinfo_t;
-
typedef int (*defrag_cbk_fn_t) (glusterd_volinfo_t *volinfo,
gf_defrag_status_t status);
@@ -162,7 +238,7 @@ struct glusterd_defrag_info_ {
int cmd;
pthread_t th;
gf_defrag_status_t defrag_status;
- struct rpc_clnt * rpc;
+ struct rpc_clnt *rpc;
uint32_t connected;
char mount[1024];
char databuf[131072];
@@ -195,65 +271,170 @@ struct _auth {
typedef struct _auth auth_t;
+/* Capabilities of xlator */
+#define CAPS_BD 0x00000001
+#define CAPS_THIN 0x00000002
+#define CAPS_OFFLOAD_COPY 0x00000004
+#define CAPS_OFFLOAD_SNAPSHOT 0x00000008
+#define CAPS_OFFLOAD_ZERO 0x00000020
+
+struct glusterd_rebalance_ {
+ gf_defrag_status_t defrag_status;
+ uint64_t rebalance_files;
+ uint64_t rebalance_data;
+ uint64_t lookedup_files;
+ uint64_t skipped_files;
+ glusterd_defrag_info_t *defrag;
+ gf_cli_defrag_type defrag_cmd;
+ uint64_t rebalance_failures;
+ uuid_t rebalance_id;
+ double rebalance_time;
+ glusterd_op_t op;
+ dict_t *dict; /* Dict to store misc information
+ * like list of bricks being removed */
+};
+
+typedef struct glusterd_rebalance_ glusterd_rebalance_t;
+
+struct glusterd_replace_brick_ {
+ gf_rb_status_t rb_status;
+ glusterd_brickinfo_t *src_brick;
+ glusterd_brickinfo_t *dst_brick;
+ uuid_t rb_id;
+};
+
+typedef struct glusterd_replace_brick_ glusterd_replace_brick_t;
+
struct glusterd_volinfo_ {
- char volname[GLUSTERD_MAX_VOLUME_NAME];
- int type;
- int brick_count;
- struct list_head vol_list;
- struct list_head bricks;
- glusterd_volume_status status;
- int sub_count; /* backward compatibility */
- int stripe_count;
- int replica_count;
- int dist_leaf_count; /* Number of bricks in one
+ gf_lock_t lock;
+ char volname[GLUSTERD_MAX_VOLUME_NAME];
+ gf_boolean_t is_snap_volume;
+ glusterd_snap_t *snapshot;
+ uuid_t restored_from_snap;
+ char parent_volname[GLUSTERD_MAX_VOLUME_NAME];
+ /* In case of a snap volume
+ i.e (is_snap_volume == TRUE) this
+ field will contain the name of
+ the volume which is snapped. In
+ case of a non-snap volume, this
+ field will be initialized as N/A */
+ int type;
+ int brick_count;
+ uint64_t snap_count;
+ uint64_t snap_max_hard_limit;
+ struct list_head vol_list;
+ /* In case of a snap volume
+ i.e (is_snap_volume == TRUE) this
+ is linked to glusterd_snap_t->volumes.
+ In case of a non-snap volume, this is
+ linked to glusterd_conf_t->volumes */
+ struct list_head snapvol_list;
+ /* This is a current pointer for
+ glusterd_volinfo_t->snap_volumes */
+ struct list_head bricks;
+ struct list_head snap_volumes;
+ /* TODO : Need to remove this, as this
+ * is already part of snapshot object.
+ */
+ glusterd_volume_status status;
+ int sub_count; /* backward compatibility */
+ int stripe_count;
+ int replica_count;
+ int subvol_count; /* Number of subvolumes in a
+ distribute volume */
+ int dist_leaf_count; /* Number of bricks in one
distribute subvolume */
- int port;
- glusterd_store_handle_t *shandle;
- glusterd_store_handle_t *rb_shandle;
- glusterd_store_handle_t *node_state_shandle;
+ int port;
+ gf_store_handle_t *shandle;
+ gf_store_handle_t *rb_shandle;
+ gf_store_handle_t *node_state_shandle;
+ gf_store_handle_t *quota_conf_shandle;
/* Defrag/rebalance related */
- gf_defrag_status_t defrag_status;
- uint64_t rebalance_files;
- uint64_t rebalance_data;
- uint64_t lookedup_files;
- glusterd_defrag_info_t *defrag;
- gf_cli_defrag_type defrag_cmd;
- uint64_t rebalance_failures;
- double rebalance_time;
+ glusterd_rebalance_t rebal;
/* Replace brick status */
- gf_rb_status_t rb_status;
- glusterd_brickinfo_t *src_brick;
- glusterd_brickinfo_t *dst_brick;
+ glusterd_replace_brick_t rep_brick;
+
+ int version;
+ uint32_t quota_conf_version;
+ uint32_t cksum;
+ uint32_t quota_conf_cksum;
+ gf_transport_type transport_type;
+ gf_transport_type nfs_transport_type;
+
+ dict_t *dict;
- int version;
- uint32_t cksum;
- gf_transport_type transport_type;
- gf_transport_type nfs_transport_type;
+ uuid_t volume_id;
+ auth_t auth;
+ char *logdir;
- dict_t *dict;
+ dict_t *gsync_slaves;
- uuid_t volume_id;
- auth_t auth;
- char *logdir;
+ int decommission_in_progress;
+ xlator_t *xl;
- dict_t *gsync_slaves;
+ gf_boolean_t memory_accounting;
+ int caps; /* Capability */
- int decommission_in_progress;
- xlator_t *xl;
+ int op_version;
+ int client_op_version;
+ pthread_mutex_t reflock;
+ int refcnt;
+};
- gf_boolean_t memory_accounting;
+typedef enum gd_snap_status_ {
+ GD_SNAP_STATUS_NONE,
+ GD_SNAP_STATUS_INIT,
+ GD_SNAP_STATUS_IN_USE,
+ GD_SNAP_STATUS_DECOMMISSION,
+ GD_SNAP_STATUS_RESTORED,
+} gd_snap_status_t;
+
+struct glusterd_snap_ {
+ gf_lock_t lock;
+ struct list_head volumes;
+ struct list_head snap_list;
+ char snapname[GLUSTERD_MAX_SNAP_NAME];
+ uuid_t snap_id;
+ char *description;
+ time_t time_stamp;
+ gf_boolean_t snap_restored;
+ gd_snap_status_t snap_status;
+ gf_store_handle_t *shandle;
};
+typedef struct glusterd_snap_op_ {
+ char *snap_vol_id;
+ int32_t brick_num;
+ char *brick_path;
+ int32_t op;
+ int32_t status;
+ struct list_head snap_ops_list;
+} glusterd_snap_op_t;
+
+typedef struct glusterd_missed_snap_ {
+ char *node_uuid;
+ char *snap_uuid;
+ struct list_head missed_snaps;
+ struct list_head snap_ops;
+} glusterd_missed_snap_info;
+
typedef enum gd_node_type_ {
GD_NODE_NONE,
GD_NODE_BRICK,
GD_NODE_SHD,
GD_NODE_REBALANCE,
GD_NODE_NFS,
+ GD_NODE_QUOTAD,
} gd_node_type;
+typedef enum missed_snap_stat {
+ GD_MISSED_SNAP_NONE,
+ GD_MISSED_SNAP_PENDING,
+ GD_MISSED_SNAP_DONE,
+} missed_snap_stat;
+
typedef struct glusterd_pending_node_ {
struct list_head list;
void *node;
@@ -261,6 +442,13 @@ typedef struct glusterd_pending_node_ {
int32_t index;
} glusterd_pending_node_t;
+struct gsync_config_opt_vals_ {
+ char *op_name;
+ int no_of_pos_vals;
+ gf_boolean_t case_sensitive;
+ char *values[GEO_CONF_MAX_OPT_VALS];
+};
+
enum glusterd_op_ret {
GLUSTERD_CONNECTION_AWAITED = 100,
};
@@ -275,13 +463,23 @@ enum glusterd_vol_comp_status_ {
#define GLUSTERD_DEFAULT_WORKDIR "/var/lib/glusterd"
#define GLUSTERD_DEFAULT_PORT GF_DEFAULT_BASE_PORT
#define GLUSTERD_INFO_FILE "glusterd.info"
+#define GLUSTERD_VOLUME_QUOTA_CONFIG "quota.conf"
#define GLUSTERD_VOLUME_DIR_PREFIX "vols"
#define GLUSTERD_PEER_DIR_PREFIX "peers"
#define GLUSTERD_VOLUME_INFO_FILE "info"
+#define GLUSTERD_SNAP_INFO_FILE "info"
#define GLUSTERD_VOLUME_RBSTATE_FILE "rbstate"
#define GLUSTERD_BRICK_INFO_DIR "bricks"
#define GLUSTERD_CKSUM_FILE "cksum"
+#define GLUSTERD_VOL_QUOTA_CKSUM_FILE "quota.cksum"
+#define GLUSTERD_TRASH "trash"
#define GLUSTERD_NODE_STATE_FILE "node_state.info"
+#define GLUSTERD_MISSED_SNAPS_LIST_FILE "missed_snaps_list"
+#define GLUSTERD_VOL_SNAP_DIR_PREFIX "snaps"
+
+#define GLUSTERD_DEFAULT_SNAPS_BRICK_DIR "/gluster/snaps"
+#define GLUSTERD_VAR_RUN_DIR "/var/run"
+#define GLUSTERD_RUN_DIR "/run"
/* definitions related to replace brick */
#define RB_CLIENT_MOUNTPOINT "rb_mount"
@@ -294,18 +492,40 @@ enum glusterd_vol_comp_status_ {
typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args);
-#define GLUSTERD_GET_VOLUME_DIR(path, volinfo, priv) \
- snprintf (path, PATH_MAX, "%s/vols/%s", priv->workdir,\
- volinfo->volname);
+#define GLUSTERD_GET_VOLUME_DIR(path, volinfo, priv) \
+ if (volinfo->is_snap_volume) { \
+ snprintf (path, PATH_MAX, "%s/snaps/%s/%s", priv->workdir, \
+ volinfo->snapshot->snapname, volinfo->volname); \
+ } else { \
+ snprintf (path, PATH_MAX, "%s/vols/%s", priv->workdir, \
+ volinfo->volname); \
+ }
-#define GLUSTERD_GET_BRICK_DIR(path, volinfo, priv) \
- snprintf (path, PATH_MAX, "%s/%s/%s/%s", priv->workdir,\
- GLUSTERD_VOLUME_DIR_PREFIX, volinfo->volname, \
- GLUSTERD_BRICK_INFO_DIR);
+#define GLUSTERD_GET_SNAP_DIR(path, snap, priv) \
+ snprintf (path, PATH_MAX, "%s/snaps/%s", priv->workdir, \
+ snap->snapname);
+
+#define GLUSTERD_GET_BRICK_DIR(path, volinfo, priv) \
+ if (volinfo->is_snap_volume) { \
+ snprintf (path, PATH_MAX, "%s/snaps/%s/%s/%s", priv->workdir, \
+ volinfo->snapshot->snapname, volinfo->volname, \
+ GLUSTERD_BRICK_INFO_DIR); \
+ } else { \
+ snprintf (path, PATH_MAX, "%s/%s/%s/%s", priv->workdir, \
+ GLUSTERD_VOLUME_DIR_PREFIX, volinfo->volname, \
+ GLUSTERD_BRICK_INFO_DIR); \
+ }
#define GLUSTERD_GET_NFS_DIR(path, priv) \
snprintf (path, PATH_MAX, "%s/nfs", priv->workdir);
+#define GLUSTERD_GET_QUOTAD_DIR(path, priv) \
+ snprintf (path, PATH_MAX, "%s/quotad", priv->workdir);
+
+#define GLUSTERD_GET_QUOTA_AUX_MOUNT_PATH(abspath, volname, path) \
+ snprintf (abspath, sizeof (abspath)-1, \
+ DEFAULT_VAR_RUN_DIRECTORY"/%s%s", volname, path);
+
#define GLUSTERD_REMOVE_SLASH_FROM_PATH(path,string) do { \
int i = 0; \
for (i = 1; i < strlen (path); i++) { \
@@ -315,22 +535,38 @@ typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args);
} \
} while (0)
-#define GLUSTERD_GET_BRICK_PIDFILE(pidfile,volpath,hostname,brickpath) { \
- char exp_path[PATH_MAX] = {0,}; \
- GLUSTERD_REMOVE_SLASH_FROM_PATH (brickpath, exp_path); \
- snprintf (pidfile, PATH_MAX, "%s/run/%s-%s.pid", \
- volpath, hostname, exp_path); \
- }
+#define GLUSTERD_GET_BRICK_PIDFILE(pidfile,volinfo,brickinfo, priv) do { \
+ char exp_path[PATH_MAX] = {0,}; \
+ char volpath[PATH_MAX] = {0,}; \
+ GLUSTERD_GET_VOLUME_DIR (volpath, volinfo, priv); \
+ GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, exp_path); \
+ snprintf (pidfile, PATH_MAX, "%s/run/%s-%s.pid", \
+ volpath, brickinfo->hostname, exp_path); \
+ } while (0)
+
+#define GLUSTERD_GET_BRICK_RECON_PIDFILE(pidfile,volinfo,brickinfo, priv) do { \
+ char exp_path[PATH_MAX] = {0,}; \
+ char volpath[PATH_MAX] = {0,}; \
+ GLUSTERD_GET_VOLUME_DIR (volpath, volinfo, priv); \
+ GLUSTERD_REMOVE_SLASH_FROM_PATH (brickinfo->path, exp_path); \
+ snprintf (pidfile, PATH_MAX, "%s/run/%s:-%s-recon.pid", \
+ volpath, brickinfo->hostname, exp_path); \
+ } while (0)
#define GLUSTERD_GET_NFS_PIDFILE(pidfile,nfspath) { \
snprintf (pidfile, PATH_MAX, "%s/run/nfs.pid", \
nfspath); \
}
+#define GLUSTERD_GET_QUOTAD_PIDFILE(pidfile,quotadpath) { \
+ snprintf (pidfile, PATH_MAX, "%s/run/quotad.pid", \
+ quotadpath); \
+ }
+
#define GLUSTERD_STACK_DESTROY(frame) do {\
- frame->local = NULL; \
- STACK_DESTROY (frame->root);\
- } while (0)
+ frame->local = NULL; \
+ STACK_DESTROY (frame->root); \
+ } while (0)
#define GLUSTERD_GET_DEFRAG_DIR(path, volinfo, priv) do { \
char vol_path[PATH_MAX]; \
@@ -338,13 +574,19 @@ typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args);
snprintf (path, PATH_MAX, "%s/rebalance",vol_path); \
} while (0)
-#define GLUSTERD_GET_DEFRAG_SOCK_FILE(path, volinfo, priv) do { \
+#define GLUSTERD_GET_DEFRAG_SOCK_FILE_OLD(path, volinfo, priv) do { \
char defrag_path[PATH_MAX]; \
GLUSTERD_GET_DEFRAG_DIR(defrag_path, volinfo, priv); \
snprintf (path, PATH_MAX, "%s/%s.sock", defrag_path, \
uuid_utoa(MY_UUID)); \
} while (0)
+#define GLUSTERD_GET_DEFRAG_SOCK_FILE(path, volinfo) do { \
+ snprintf (path, UNIX_PATH_MAX, DEFAULT_VAR_RUN_DIRECTORY \
+ "/gluster-rebalance-%s.sock", \
+ uuid_utoa(volinfo->volume_id)); \
+ } while (0)
+
#define GLUSTERD_GET_DEFRAG_PID_FILE(path, volinfo, priv) do { \
char defrag_path[PATH_MAX]; \
GLUSTERD_GET_DEFRAG_DIR(defrag_path, volinfo, priv); \
@@ -352,30 +594,62 @@ typedef ssize_t (*gd_serialize_t) (struct iovec outmsg, void *args);
uuid_utoa(MY_UUID)); \
} while (0)
+#define GLUSTERFS_GET_AUX_MOUNT_PIDFILE(pidfile, volname) { \
+ snprintf (pidfile, PATH_MAX-1, \
+ DEFAULT_VAR_RUN_DIRECTORY"/%s.pid", volname); \
+ }
+
+#define GLUSTERD_GET_UUID_NOHYPHEN(ret_string, uuid) do { \
+ char *snap_volname_ptr = ret_string; \
+ char *snap_volid_ptr = uuid_utoa(uuid); \
+ while (*snap_volid_ptr) { \
+ if (*snap_volid_ptr == '-') { \
+ snap_volid_ptr++; \
+ } else { \
+ (*snap_volname_ptr++) = \
+ (*snap_volid_ptr++); \
+ } \
+ } \
+ *snap_volname_ptr = '\0'; \
+ } while (0)
int glusterd_uuid_init();
+int glusterd_uuid_generate_save ();
+
#define MY_UUID (__glusterd_uuid())
static inline unsigned char *
__glusterd_uuid()
{
- glusterd_conf_t *priv = THIS->private;
+ glusterd_conf_t *priv = THIS->private;
- if (uuid_is_null (priv->uuid))
- glusterd_uuid_init();
- return &priv->uuid[0];
+ if (uuid_is_null (priv->uuid))
+ glusterd_uuid_init();
+ return &priv->uuid[0];
}
+int glusterd_big_locked_notify (struct rpc_clnt *rpc, void *mydata,
+ rpc_clnt_event_t event,
+ void *data, rpc_clnt_notify_t notify_fn);
+
+int
+glusterd_big_locked_cbk (struct rpc_req *req, struct iovec *iov,
+ int count, void *myframe, fop_cbk_fn_t fn);
+
+int glusterd_big_locked_handler (rpcsvc_request_t *req, rpcsvc_actor actor_fn);
+
int32_t
glusterd_brick_from_brickinfo (glusterd_brickinfo_t *brickinfo,
char **new_brick);
int
-glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port);
+glusterd_probe_begin (rpcsvc_request_t *req, const char *hoststr, int port,
+ dict_t *dict);
int
-glusterd_xfer_friend_add_resp (rpcsvc_request_t *req, char *hostname,
- int port, int32_t op_ret, int32_t op_errno);
+glusterd_xfer_friend_add_resp (rpcsvc_request_t *req, char *myhostname,
+ char *remote_hostname, int port, int32_t op_ret,
+ int32_t op_errno);
int
glusterd_friend_find (uuid_t uuid, char *hostname,
@@ -388,6 +662,9 @@ glusterd_friend_add (const char *hoststr, int port,
gf_boolean_t restore, glusterd_peerctx_args_t *args);
int
+glusterd_friend_rpc_create (xlator_t *this, glusterd_peerinfo_t *peerinfo,
+ glusterd_peerctx_args_t *args);
+int
glusterd_friend_remove (uuid_t uuid, char *hostname);
int
@@ -397,6 +674,14 @@ int
glusterd_op_unlock_send_resp (rpcsvc_request_t *req, int32_t status);
int
+glusterd_op_mgmt_v3_lock_send_resp (rpcsvc_request_t *req,
+ uuid_t *txn_id, int32_t status);
+
+int
+glusterd_op_mgmt_v3_unlock_send_resp (rpcsvc_request_t *req,
+ uuid_t *txn_id, int32_t status);
+
+int
glusterd_op_stage_send_resp (rpcsvc_request_t *req,
int32_t op, int32_t status,
char *op_errstr, dict_t *rsp_dict);
@@ -441,7 +726,7 @@ glusterd_handle_defrag_volume_v2 (rpcsvc_request_t *req);
int
glusterd_xfer_cli_probe_resp (rpcsvc_request_t *req, int32_t op_ret,
int32_t op_errno, char *op_errstr, char *hostname,
- int port);
+ int port, dict_t *dict);
int
glusterd_op_commit_send_resp (rpcsvc_request_t *req,
@@ -453,7 +738,7 @@ glusterd_xfer_friend_remove_resp (rpcsvc_request_t *req, char *hostname, int por
int
glusterd_deprobe_begin (rpcsvc_request_t *req, const char *hoststr, int port,
- uuid_t uuid);
+ uuid_t uuid, dict_t *dict);
int
glusterd_handle_cli_deprobe (rpcsvc_request_t *req);
@@ -528,6 +813,12 @@ int
glusterd_handle_reset_volume (rpcsvc_request_t *req);
int
+glusterd_handle_copy_file (rpcsvc_request_t *req);
+
+int
+glusterd_handle_sys_exec (rpcsvc_request_t *req);
+
+int
glusterd_handle_gsync_set (rpcsvc_request_t *req);
int
@@ -539,7 +830,7 @@ glusterd_handle_fsm_log (rpcsvc_request_t *req);
int
glusterd_xfer_cli_deprobe_resp (rpcsvc_request_t *req, int32_t op_ret,
int32_t op_errno, char *op_errstr,
- char *hostname);
+ char *hostname, dict_t *dict);
int
glusterd_fetchspec_notify (xlator_t *this);
@@ -556,8 +847,8 @@ glusterd_volume_txn (rpcsvc_request_t *req, char *volname, int flags,
glusterd_op_t op);
int
-glusterd_peer_handshake (xlator_t *this, struct rpc_clnt *rpc,
- glusterd_peerctx_t *peerctx);
+glusterd_peer_dump_version (xlator_t *this, struct rpc_clnt *rpc,
+ glusterd_peerctx_t *peerctx);
int
glusterd_validate_reconfopts (glusterd_volinfo_t *volinfo, dict_t *val_dict, char **op_errstr);
@@ -587,7 +878,8 @@ glusterd_rpc_create (struct rpc_clnt **rpc, dict_t *options,
/* handler functions */
-int32_t glusterd_op_begin (rpcsvc_request_t *req, glusterd_op_t op, void *ctx);
+int32_t glusterd_op_begin (rpcsvc_request_t *req, glusterd_op_t op, void *ctx,
+ char *err_str, size_t size);
/* removed other definitions as they have been defined elsewhere in this file*/
@@ -595,22 +887,32 @@ int glusterd_handle_cli_statedump_volume (rpcsvc_request_t *req);
int glusterd_handle_cli_clearlocks_volume (rpcsvc_request_t *req);
int glusterd_handle_defrag_start (glusterd_volinfo_t *volinfo, char *op_errstr,
- size_t len, int cmd, defrag_cbk_fn_t cbk);
+ size_t len, int cmd, defrag_cbk_fn_t cbk,
+ glusterd_op_t op);
int
glusterd_rebalance_rpc_create (glusterd_volinfo_t *volinfo,
- glusterd_conf_t *priv, int cmd);
+ gf_boolean_t reconnect);
int glusterd_handle_cli_heal_volume (rpcsvc_request_t *req);
int glusterd_handle_cli_list_volume (rpcsvc_request_t *req);
+int
+glusterd_handle_snapshot (rpcsvc_request_t *req);
+
/* op-sm functions */
int glusterd_op_stage_heal_volume (dict_t *dict, char **op_errstr);
int glusterd_op_heal_volume (dict_t *dict, char **op_errstr);
int glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr);
int glusterd_op_gsync_set (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
-int glusterd_op_quota (dict_t *dict, char **op_errstr);
-int glusterd_op_stage_quota (dict_t *dict, char **op_errstr);
+int glusterd_op_stage_copy_file (dict_t *dict, char **op_errstr);
+int glusterd_op_copy_file (dict_t *dict, char **op_errstr);
+int glusterd_op_stage_sys_exec (dict_t *dict, char **op_errstr);
+int glusterd_op_sys_exec (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+int glusterd_op_stage_gsync_create (dict_t *dict, char **op_errstr);
+int glusterd_op_gsync_create (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+int glusterd_op_quota (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+int glusterd_op_stage_quota (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
int glusterd_op_stage_replace_brick (dict_t *dict, char **op_errstr,
dict_t *rsp_dict);
int glusterd_op_replace_brick (dict_t *dict, dict_t *rsp_dict);
@@ -637,7 +939,8 @@ int glusterd_op_stage_statedump_volume (dict_t *dict, char **op_errstr);
int glusterd_op_statedump_volume (dict_t *dict, char **op_errstr);
int glusterd_op_stage_clearlocks_volume (dict_t *dict, char **op_errstr);
-int glusterd_op_clearlocks_volume (dict_t *dict, char **op_errstr);
+int glusterd_op_clearlocks_volume (dict_t *dict, char **op_errstr,
+ dict_t *rsp_dict);
/* misc */
void glusterd_do_replace_brick (void *data);
@@ -648,10 +951,69 @@ int glusterd_op_statedump_volume_args_get (dict_t *dict, char **volname,
char **options, int *option_cnt);
int glusterd_op_gsync_args_get (dict_t *dict, char **op_errstr,
- char **master, char **slave);
+ char **master, char **slave, char **host_uuid);
+
+int glusterd_stop_volume (glusterd_volinfo_t *volinfo);
+
/* Synctask part */
int32_t glusterd_op_begin_synctask (rpcsvc_request_t *req, glusterd_op_t op,
void *dict);
int32_t
glusterd_defrag_event_notify_handle (dict_t *dict);
+
+int32_t
+glusterd_txn_opinfo_dict_init ();
+
+void
+glusterd_txn_opinfo_dict_fini ();
+
+void
+glusterd_txn_opinfo_init ();
+
+/* snapshot */
+glusterd_snap_t*
+glusterd_new_snap_object();
+
+int32_t
+glusterd_list_add_snapvol (glusterd_volinfo_t *origin_vol,
+ glusterd_volinfo_t *snap_vol);
+
+glusterd_snap_t*
+glusterd_remove_snap_by_id (uuid_t snap_id);
+
+glusterd_snap_t*
+glusterd_remove_snap_by_name (char *snap_name);
+
+glusterd_snap_t*
+glusterd_find_snap_by_name (char *snap_name);
+
+glusterd_snap_t*
+glusterd_find_snap_by_id (uuid_t snap_id);
+
+int
+glusterd_snapshot_prevalidate (dict_t *dict, char **op_errstr,
+ dict_t *rsp_dict);
+int
+glusterd_snapshot_brickop (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+int
+glusterd_snapshot (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+int
+glusterd_snapshot_postvalidate (dict_t *dict, int32_t op_ret, char **op_errstr,
+ dict_t *rsp_dict);
+char *
+glusterd_build_snap_device_path (char *device, char *snapname);
+int32_t
+glusterd_snap_remove (dict_t *rsp_dict, glusterd_snap_t *snap,
+ gf_boolean_t remove_lvm, gf_boolean_t force);
+int32_t
+glusterd_snapshot_cleanup (dict_t *dict, char **op_errstr, dict_t *rsp_dict);
+
+int32_t
+glusterd_add_missed_snaps_to_list (dict_t *dict, int32_t missed_snap_count);
+
+int32_t
+glusterd_add_new_entry_to_list (char *missed_info, char *snap_vol_id,
+ int32_t brick_num, char *brick_path,
+ int32_t snap_op, int32_t snap_status);
+
#endif
diff --git a/xlators/mount/fuse/src/Makefile.am b/xlators/mount/fuse/src/Makefile.am
index 4e9287067..7d1f93447 100644
--- a/xlators/mount/fuse/src/Makefile.am
+++ b/xlators/mount/fuse/src/Makefile.am
@@ -1,7 +1,9 @@
noinst_HEADERS_linux = $(CONTRIBDIR)/fuse-include/fuse_kernel.h\
$(CONTRIBDIR)/fuse-include/mount_util.h\
$(CONTRIBDIR)/fuse-lib/mount-gluster-compat.h
-noinst_HEADERS_darwin = $(CONTRIBDIR)/fuse-include/fuse_kernel_macfuse.h
+noinst_HEADERS_darwin = $(CONTRIBDIR)/fuse-include/fuse_kernel_macfuse.h\
+ $(CONTRIBDIR)/macfuse/fuse_param.h\
+ $(CONTRIBDIR)/macfuse/fuse_ioctl.h
noinst_HEADERS_common = $(CONTRIBDIR)/fuse-include/fuse-mount.h\
$(CONTRIBDIR)/fuse-include/fuse-misc.h fuse-mem-types.h \
fuse-bridge.h
@@ -24,7 +26,7 @@ endif
fuse_la_SOURCES = fuse-helpers.c fuse-resolve.c fuse-bridge.c \
$(CONTRIBDIR)/fuse-lib/misc.c $(mount_source)
-fuse_la_LDFLAGS = -module -avoidversion
+fuse_la_LDFLAGS = -module -avoid-version
fuse_la_LIBADD = @GF_FUSE_LDADD@
AM_CPPFLAGS = $(GF_CPPFLAGS) \
diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c
index e46283793..d5ca4d146 100644
--- a/xlators/mount/fuse/src/fuse-bridge.c
+++ b/xlators/mount/fuse/src/fuse-bridge.c
@@ -10,6 +10,13 @@
#include <sys/wait.h>
#include "fuse-bridge.h"
+#include "mount-gluster-compat.h"
+#include "glusterfs.h"
+#include "glusterfs-acl.h"
+
+#ifdef __NetBSD__
+#undef open /* in perfuse.h, pulled from mount-gluster-compat.h */
+#endif
static int gf_fuse_conn_err_log;
static int gf_fuse_xattr_enotsup_log;
@@ -22,28 +29,71 @@ static void fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino);
* Send an invalidate notification up to fuse to purge the file from local
* page cache.
*/
+
static int32_t
fuse_invalidate(xlator_t *this, inode_t *inode)
{
- fuse_private_t *priv = this->private;
- uint64_t nodeid;
+ fuse_private_t *priv = this->private;
+ uint64_t nodeid;
- /*
- * NOTE: We only invalidate at the moment if fopen_keep_cache is
- * enabled because otherwise this is a departure from default
- * behavior. Specifically, the performance/write-behind xlator
- * causes unconditional invalidations on write requests.
- */
- if (!priv->fopen_keep_cache)
- return 0;
+ /*
+ * NOTE: We only invalidate at the moment if fopen_keep_cache is
+ * enabled because otherwise this is a departure from default
+ * behavior. Specifically, the performance/write-behind xlator
+ * causes unconditional invalidations on write requests.
+ */
+ if (!priv->fopen_keep_cache)
+ return 0;
- nodeid = inode_to_fuse_nodeid(inode);
- gf_log(this->name, GF_LOG_DEBUG, "Invalidate inode id %lu.", nodeid);
- fuse_invalidate_inode(this, nodeid);
+ nodeid = inode_to_fuse_nodeid(inode);
+ gf_log(this->name, GF_LOG_DEBUG, "Invalidate inode id %"GF_PRI_INODE"." , nodeid);
+ fuse_log_eh (this, "Sending invalidate inode id: %"GF_PRI_INODE" gfid: %s", nodeid,
+ uuid_utoa (inode->gfid));
+ fuse_invalidate_inode(this, nodeid);
- return 0;
+ return 0;
}
+static int32_t
+fuse_forget_cbk (xlator_t *this, inode_t *inode)
+{
+ //Nothing to free in inode ctx, hence return.
+ return 0;
+}
+
+void
+fuse_inode_set_need_lookup (inode_t *inode, xlator_t *this)
+{
+ uint64_t need_lookup = 1;
+
+ if (!inode || !this)
+ return;
+
+ inode_ctx_set (inode, this, &need_lookup);
+
+ return;
+}
+
+
+gf_boolean_t
+fuse_inode_needs_lookup (inode_t *inode, xlator_t *this)
+{
+ uint64_t need_lookup = 0;
+ gf_boolean_t ret = _gf_false;
+
+ if (!inode || !this)
+ return ret;
+
+ inode_ctx_get (inode, this, &need_lookup);
+ if (need_lookup)
+ ret = _gf_true;
+ need_lookup = 0;
+ inode_ctx_set (inode, this, &need_lookup);
+
+ return ret;
+}
+
+
fuse_fd_ctx_t *
__fuse_fd_ctx_check_n_create (xlator_t *this, fd_t *fd)
{
@@ -93,7 +143,6 @@ out:
return fd_ctx;
}
-
fuse_fd_ctx_t *
fuse_fd_ctx_get (xlator_t *this, fd_t *fd)
{
@@ -112,7 +161,6 @@ out:
return fdctx;
}
-
/*
* iov_out should contain a fuse_out_header at zeroth position.
* The error value of this header is sent to kernel.
@@ -139,6 +187,8 @@ send_fuse_iov (xlator_t *this, fuse_in_header_t *finh, struct iovec *iov_out,
fouh->unique = finh->unique;
res = writev (priv->fd, iov_out, count);
+ gf_log ("glusterfs-fuse", GF_LOG_TRACE, "writev() result %d/%d %s",
+ res, fouh->len, res == -1 ? strerror (errno) : "");
if (res == -1)
return errno;
@@ -168,19 +218,26 @@ send_fuse_data (xlator_t *this, fuse_in_header_t *finh, void *data, size_t size)
{
struct fuse_out_header fouh = {0, };
struct iovec iov_out[2];
+ int ret = 0;
fouh.error = 0;
iov_out[0].iov_base = &fouh;
iov_out[1].iov_base = data;
iov_out[1].iov_len = size;
- return send_fuse_iov (this, finh, iov_out, 2);
+ ret = send_fuse_iov (this, finh, iov_out, 2);
+ if (ret != 0)
+ gf_log ("glusterfs-fuse", GF_LOG_ERROR, "send_fuse_iov() "
+ "failed: %s", strerror (ret));
+
+ return ret;
}
#define send_fuse_obj(this, finh, obj) \
send_fuse_data (this, finh, obj, sizeof (*(obj)))
+#if FUSE_KERNEL_MINOR_VERSION >= 11
static void
fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
{
@@ -191,8 +248,7 @@ fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
inode_t *inode = NULL;
size_t nlen = 0;
int rv = 0;
-
- char inval_buf[INVAL_BUF_SIZE] = {0,};
+ char inval_buf[INVAL_BUF_SIZE] = {0,};
fouh = (struct fuse_out_header *)inval_buf;
fnieo = (struct fuse_notify_inval_entry_out *)(fouh + 1);
@@ -225,8 +281,21 @@ fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
gf_log ("glusterfs-fuse", GF_LOG_TRACE, "INVALIDATE entry: "
"%"PRIu64"/%s", fnieo->parent, dentry->name);
+
+ if (dentry->parent) {
+ fuse_log_eh (this, "Invalidated entry %s (parent: %s)",
+ dentry->name,
+ uuid_utoa (dentry->parent->gfid));
+ } else {
+ fuse_log_eh (this, "Invalidated entry %s(nodeid: %ld)",
+ dentry->name, fnieo->parent);
+ }
}
+
+ if (inode)
+ inode_unref (inode);
}
+#endif
/*
* Send an inval inode notification to fuse. This causes an invalidation of the
@@ -235,53 +304,90 @@ fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
static void
fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino)
{
- struct fuse_out_header *fouh = NULL;
- struct fuse_notify_inval_inode_out *fniio = NULL;
- fuse_private_t *priv = NULL;
- int rv = 0;
- char inval_buf[INVAL_BUF_SIZE] = {0};
+#if FUSE_KERNEL_MINOR_VERSION >= 11
+ struct fuse_out_header *fouh = NULL;
+ struct fuse_notify_inval_inode_out *fniio = NULL;
+ fuse_private_t *priv = NULL;
+ int rv = 0;
+ char inval_buf[INVAL_BUF_SIZE] = {0};
+ inode_t *inode = NULL;
- fouh = (struct fuse_out_header *) inval_buf;
- fniio = (struct fuse_notify_inval_inode_out *) (fouh + 1);
+ fouh = (struct fuse_out_header *) inval_buf;
+ fniio = (struct fuse_notify_inval_inode_out *) (fouh + 1);
- priv = this->private;
+ priv = this->private;
- if (priv->revchan_out < 0)
- return;
+ if (priv->revchan_out < 0)
+ return;
- fouh->unique = 0;
- fouh->error = FUSE_NOTIFY_INVAL_INODE;
- fouh->len = sizeof(struct fuse_out_header) +
- sizeof(struct fuse_notify_inval_inode_out);
-
- /* inval the entire mapping until we learn how to be more granular */
- fniio->ino = fuse_ino;
- fniio->off = 0;
- fniio->len = -1;
-
- rv = write(priv->revchan_out, inval_buf, fouh->len);
- if (rv != fouh->len) {
- gf_log("glusterfs-fuse", GF_LOG_ERROR, "kernel notification "
- "daemon defunct");
- close(priv->fd);
- }
+ fouh->unique = 0;
+ fouh->error = FUSE_NOTIFY_INVAL_INODE;
+ fouh->len = sizeof(struct fuse_out_header) +
+ sizeof(struct fuse_notify_inval_inode_out);
+
+ /* inval the entire mapping until we learn how to be more granular */
+ fniio->ino = fuse_ino;
+ fniio->off = 0;
+ fniio->len = -1;
+
+ inode = fuse_ino_to_inode (fuse_ino, this);
- gf_log("glusterfs-fuse", GF_LOG_TRACE, "INVALIDATE inode: %lu", fuse_ino);
+ rv = write(priv->revchan_out, inval_buf, fouh->len);
+ if (rv != fouh->len) {
+ gf_log("glusterfs-fuse", GF_LOG_ERROR, "kernel notification "
+ "daemon defunct");
+ close(priv->fd);
+ }
+
+ gf_log("glusterfs-fuse", GF_LOG_TRACE, "INVALIDATE inode: %lu", fuse_ino);
+
+ if (inode) {
+ fuse_log_eh (this, "Invalidated inode %lu (gfid: %s)",
+ fuse_ino, uuid_utoa (inode->gfid));
+ } else {
+ fuse_log_eh (this, "Invalidated inode %lu ", fuse_ino);
+ }
+
+ if (inode)
+ inode_unref (inode);
+#else
+ gf_log ("glusterfs-fuse", GF_LOG_WARNING,
+ "fuse_invalidate_inode not implemented on OS X due to missing FUSE notification");
+#endif
}
+
int
send_fuse_err (xlator_t *this, fuse_in_header_t *finh, int error)
{
struct fuse_out_header fouh = {0, };
struct iovec iov_out;
+ inode_t *inode = NULL;
fouh.error = -error;
iov_out.iov_base = &fouh;
+ inode = fuse_ino_to_inode (finh->nodeid, this);
+
+ // filter out ENOENT
+ if (error != ENOENT) {
+ if (inode) {
+ fuse_log_eh (this,"Sending %s for operation %d on "
+ "inode %s", strerror (error), finh->opcode,
+ uuid_utoa (inode->gfid));
+ } else {
+ fuse_log_eh (this, "Sending %s for operation %d on "
+ "inode %" GF_PRI_INODE, strerror (error),
+ finh->opcode, finh->nodeid);
+ }
+ }
+
+ if (inode)
+ inode_unref (inode);
+
return send_fuse_iov (this, finh, &iov_out, 1);
}
-
static int
fuse_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
@@ -298,24 +404,33 @@ fuse_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
finh = state->finh;
if (op_ret == 0) {
- if (__is_root_gfid (state->loc.inode->gfid))
- buf->ia_ino = 1;
- if (uuid_is_null (buf->ia_gfid)) {
- /* With a NULL gfid inode linking is
- not possible. Let's not pretend this
- call was a "success".
- */
- gf_log ("glusterfs-fuse", GF_LOG_WARNING,
- "Received NULL gfid for %s. Forcing EIO",
- state->loc.path);
- op_ret = -1;
- op_errno = EIO;
- }
+ if (__is_root_gfid (state->loc.inode->gfid))
+ buf->ia_ino = 1;
+ if (uuid_is_null (buf->ia_gfid)) {
+ /* With a NULL gfid inode linking is
+ not possible. Let's not pretend this
+ call was a "success".
+ */
+ gf_log ("glusterfs-fuse", GF_LOG_WARNING,
+ "Received NULL gfid for %s. Forcing EIO",
+ state->loc.path);
+ op_ret = -1;
+ op_errno = EIO;
+ }
}
+ /* log into the event-history after the null uuid check is done, since
+ * the op_ret and op_errno are being changed if the gfid is NULL.
+ */
+ fuse_log_eh (this, "op_ret: %d op_errno: %d "
+ "%"PRIu64": %s() %s => %s", op_ret, op_errno,
+ frame->root->unique, gf_fop_list[frame->root->op],
+ state->loc.path, (op_ret == 0)?
+ uuid_utoa(buf->ia_gfid):uuid_utoa(state->loc.gfid));
+
if (op_ret == 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": %s() %s => %"PRId64,
+ "%"PRIu64": %s() %s => %"PRIu64,
frame->root->unique, gf_fop_list[frame->root->op],
state->loc.path, buf->ia_ino);
@@ -364,7 +479,8 @@ fuse_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"%"PRIu64": %s() %s => -1 (%s)", frame->root->unique,
gf_fop_list[frame->root->op], state->loc.path,
strerror (op_errno));
- if (op_errno == ENOENT) {
+
+ if ((op_errno == ENOENT) && (priv->negative_timeout != 0)) {
feo.entry_valid =
calc_timeout_sec (priv->negative_timeout);
feo.entry_valid_nsec =
@@ -372,7 +488,7 @@ fuse_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
send_fuse_obj (this, finh, &feo);
} else {
send_fuse_err (this, state->finh, op_errno);
- }
+ }
}
free_fuse_state (state);
@@ -380,7 +496,6 @@ fuse_entry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-
static int
fuse_newentry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
@@ -392,7 +507,6 @@ fuse_newentry_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-
static int
fuse_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
@@ -408,6 +522,13 @@ fuse_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (op_ret == -1 && state->is_revalidate == 1) {
itable = state->itable;
+ /*
+ * A stale mapping might exist for a dentry/inode that has been
+ * removed from another client.
+ */
+ if (op_errno == ENOENT)
+ inode_unlink(state->loc.inode, state->loc.parent,
+ state->loc.name);
inode_unref (state->loc.inode);
state->loc.inode = inode_new (itable);
state->is_revalidate = 2;
@@ -467,8 +588,8 @@ fuse_lookup_resume (fuse_state_t *state)
static void
fuse_lookup (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
- char *name = msg;
- fuse_state_t *state = NULL;
+ char *name = msg;
+ fuse_state_t *state = NULL;
GET_STATE (this, finh, state);
@@ -476,16 +597,27 @@ fuse_lookup (xlator_t *this, fuse_in_header_t *finh, void *msg)
finh->nodeid, name);
fuse_resolve_and_resume (state, fuse_lookup_resume);
+
+ return;
}
+static inline void
+do_forget(xlator_t *this, uint64_t unique, uint64_t nodeid, uint64_t nlookup)
+{
+ inode_t *fuse_inode = fuse_ino_to_inode(nodeid, this);
+
+ fuse_log_eh(this, "%"PRIu64": FORGET %"PRIu64"/%"PRIu64" gfid: (%s)",
+ unique, nodeid, nlookup, uuid_utoa(fuse_inode->gfid));
+
+ inode_forget(fuse_inode, nlookup);
+ inode_unref(fuse_inode);
+}
static void
fuse_forget (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
- struct fuse_forget_in *ffi = msg;
-
- inode_t *fuse_inode;
+ struct fuse_forget_in *ffi = msg;
if (finh->nodeid == 1) {
GF_FREE (finh);
@@ -496,14 +628,31 @@ fuse_forget (xlator_t *this, fuse_in_header_t *finh, void *msg)
"%"PRIu64": FORGET %"PRIu64"/%"PRIu64,
finh->unique, finh->nodeid, ffi->nlookup);
- fuse_inode = fuse_ino_to_inode (finh->nodeid, this);
-
- inode_forget (fuse_inode, ffi->nlookup);
- inode_unref (fuse_inode);
+ do_forget(this, finh->unique, finh->nodeid, ffi->nlookup);
GF_FREE (finh);
}
+#if FUSE_KERNEL_MINOR_VERSION >= 16
+static void
+fuse_batch_forget(xlator_t *this, fuse_in_header_t *finh, void *msg)
+{
+ struct fuse_batch_forget_in *fbfi = msg;
+ struct fuse_forget_one *ffo = (struct fuse_forget_one *) (fbfi + 1);
+ int i;
+
+ gf_log("glusterfs-fuse", GF_LOG_TRACE,
+ "%"PRIu64": BATCH_FORGET %"PRIu64"/%"PRIu32,
+ finh->unique, finh->nodeid, fbfi->count);
+
+ for (i = 0; i < fbfi->count; i++) {
+ if (ffo[i].nodeid == 1)
+ continue;
+ do_forget(this, finh->unique, ffo[i].nodeid, ffo[i].nlookup);
+ }
+ GF_FREE(finh);
+}
+#endif
static int
fuse_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -519,9 +668,11 @@ fuse_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
state = frame->root->state;
finh = state->finh;
+ fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
if (op_ret == 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": %s() %s => %"PRId64, frame->root->unique,
+ "%"PRIu64": %s() %s => %"PRIu64, frame->root->unique,
gf_fop_list[frame->root->op],
state->loc.path ? state->loc.path : "ERR",
prebuf->ia_ino);
@@ -557,7 +708,6 @@ fuse_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-
static int
fuse_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
@@ -571,9 +721,13 @@ fuse_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
state = frame->root->state;
finh = state->finh;
+ fuse_log_eh (this, "op_ret: %d, op_errno: %d, %"PRIu64": %s() %s => "
+ "gfid: %s", op_ret, op_errno, frame->root->unique,
+ gf_fop_list[frame->root->op], state->loc.path,
+ state->loc.inode ? uuid_utoa (state->loc.inode->gfid) : "");
if (op_ret == 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": %s() %s => %"PRId64, frame->root->unique,
+ "%"PRIu64": %s() %s => %"PRIu64, frame->root->unique,
gf_fop_list[frame->root->op],
state->loc.path ? state->loc.path : "ERR",
buf->ia_ino);
@@ -594,9 +748,9 @@ fuse_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
send_fuse_obj (this, finh, &fao);
#endif
} else {
- GF_LOG_OCCASIONALLY ( gf_fuse_conn_err_log, "glusterfs-fuse",
- GF_LOG_WARNING,
- "%"PRIu64": %s() %s => -1 (%s)",
+ GF_LOG_OCCASIONALLY ( gf_fuse_conn_err_log, "glusterfs-fuse",
+ GF_LOG_WARNING,
+ "%"PRIu64": %s() %s => -1 (%s)",
frame->root->unique,
gf_fop_list[frame->root->op],
state->loc.path ? state->loc.path : "ERR",
@@ -611,7 +765,6 @@ fuse_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-
static int
fuse_root_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
@@ -630,15 +783,15 @@ fuse_getattr_resume (fuse_state_t *state)
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
"%"PRIu64": GETATTR %"PRIu64" (%s) resolution failed",
state->finh->unique, state->finh->nodeid,
- uuid_utoa (state->resolve.gfid));
+ uuid_utoa (state->resolve.gfid));
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
return;
}
- if (!IA_ISDIR (state->loc.inode->ia_type)) {
- state->fd = fd_lookup (state->loc.inode, state->finh->pid);
- }
+ if (!IA_ISDIR (state->loc.inode->ia_type)) {
+ state->fd = fd_lookup (state->loc.inode, state->finh->pid);
+ }
if (!state->fd) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
@@ -693,7 +846,6 @@ fuse_getattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
fuse_resolve_and_resume (state, fuse_getattr_resume);
}
-
static int32_t
fuse_fd_inherit_directio (xlator_t *this, fd_t *fd, struct fuse_open_out *foo)
{
@@ -736,6 +888,15 @@ out:
}
+gf_boolean_t
+direct_io_mode (dict_t *xdata)
+{
+ if (xdata && dict_get (xdata, "direct-io-mode"))
+ return _gf_true;
+ return _gf_false;
+}
+
+
static int
fuse_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
@@ -750,6 +911,8 @@ fuse_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
state = frame->root->state;
finh = state->finh;
+ fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
if (op_ret >= 0) {
foo.fh = (uintptr_t) fd;
foo.open_flags = 0;
@@ -757,7 +920,8 @@ fuse_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (!IA_ISDIR (fd->inode->ia_type)) {
if (((priv->direct_io_mode == 2)
&& ((state->flags & O_ACCMODE) != O_RDONLY))
- || (priv->direct_io_mode == 1))
+ || (priv->direct_io_mode == 1)
+ || (direct_io_mode (xdata)))
foo.open_flags |= FOPEN_DIRECT_IO;
#ifdef GF_DARWIN_HOST_OS
/* In Linux: by default, buffer cache
@@ -770,17 +934,17 @@ fuse_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
*
* [[Interesting...]]
*/
- if (!priv->fopen_keep_cache)
- foo.open_flags |= FOPEN_PURGE_UBC;
+ if (!priv->fopen_keep_cache)
+ foo.open_flags |= FOPEN_PURGE_UBC;
#else
- /*
- * If fopen-keep-cache is enabled, we set the associated
- * flag here such that files are not invalidated on open.
- * File invalidations occur either in fuse or explicitly
- * when the cache is set invalid on the inode.
- */
- if (priv->fopen_keep_cache)
- foo.open_flags |= FOPEN_KEEP_CACHE;
+ /*
+ * If fopen-keep-cache is enabled, we set the associated
+ * flag here such that files are not invalidated on open.
+ * File invalidations occur either in fuse or explicitly
+ * when the cache is set invalid on the inode.
+ */
+ if (priv->fopen_keep_cache)
+ foo.open_flags |= FOPEN_KEEP_CACHE;
#endif
}
@@ -822,7 +986,6 @@ out:
return 0;
}
-
static void
fuse_do_truncate (fuse_state_t *state, size_t size)
{
@@ -837,7 +1000,6 @@ fuse_do_truncate (fuse_state_t *state, size_t size)
return;
}
-
static int
fuse_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
@@ -854,9 +1016,14 @@ fuse_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
state = frame->root->state;
finh = state->finh;
+ fuse_log_eh(this, "op_ret: %d, op_errno: %d, %"PRIu64", %s() %s => "
+ "gfid: %s", op_ret, op_errno, frame->root->unique,
+ gf_fop_list[frame->root->op], state->loc.path,
+ state->loc.inode ? uuid_utoa (state->loc.inode->gfid) : "");
+
if (op_ret == 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": %s() %s => %"PRId64, frame->root->unique,
+ "%"PRIu64": %s() %s => %"PRIu64, frame->root->unique,
gf_fop_list[frame->root->op],
state->loc.path ? state->loc.path : "ERR",
statpost->ia_ino);
@@ -901,7 +1068,6 @@ fuse_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-
static int32_t
fattr_to_gf_set_attr (int32_t valid)
{
@@ -928,7 +1094,6 @@ fattr_to_gf_set_attr (int32_t valid)
return gf_valid;
}
-
#define FATTR_MASK (FATTR_SIZE \
| FATTR_UID | FATTR_GID \
| FATTR_ATIME | FATTR_MTIME \
@@ -941,7 +1106,7 @@ fuse_setattr_resume (fuse_state_t *state)
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
"%"PRIu64": SETATTR %"PRIu64" (%s) resolution failed",
state->finh->unique, state->finh->nodeid,
- uuid_utoa (state->resolve.gfid));
+ uuid_utoa (state->resolve.gfid));
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
return;
@@ -987,7 +1152,9 @@ fuse_setattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
struct fuse_setattr_in *fsi = msg;
+#if FUSE_KERNEL_MINOR_VERSION >= 9
fuse_private_t *priv = NULL;
+#endif
fuse_state_t *state = NULL;
GET_STATE (this, finh, state);
@@ -997,10 +1164,10 @@ fuse_setattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
/* We need no loc if kernel sent us an fd and
* we are not fiddling with times */
state->fd = FH_TO_FD (fsi->fh);
- fuse_resolve_fd_init (state, &state->resolve, state->fd);
- } else {
- fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
- }
+ fuse_resolve_fd_init (state, &state->resolve, state->fd);
+ } else {
+ fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
+ }
/*
* This is just stub code demonstrating how to retrieve
@@ -1015,8 +1182,8 @@ fuse_setattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
* http://git.kernel.org/?p=linux/kernel/git/torvalds/
* linux-2.6.git;a=commit;h=v2.6.23-5896-gf333211
*/
- priv = this->private;
#if FUSE_KERNEL_MINOR_VERSION >= 9
+ priv = this->private;
if (priv->proto_minor >= 9 && fsi->valid & FATTR_LOCKOWNER)
state->lk_owner = fsi->lock_owner;
#endif
@@ -1045,7 +1212,6 @@ fuse_setattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
fuse_resolve_and_resume (state, fuse_setattr_resume);
}
-
static int
fuse_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
@@ -1053,6 +1219,8 @@ fuse_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
fuse_state_t *state = frame->root->state;
fuse_in_header_t *finh = state->finh;
+ fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
if (op_ret == 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
"%"PRIu64": %s() %s => 0", frame->root->unique,
@@ -1077,7 +1245,6 @@ fuse_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-
static int
fuse_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
@@ -1086,7 +1253,6 @@ fuse_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return fuse_err_cbk (frame, cookie, this, op_ret, op_errno, xdata);
}
-
static int
fuse_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
@@ -1100,7 +1266,6 @@ fuse_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return fuse_err_cbk (frame, cookie, this, op_ret, op_errno, xdata);
}
-
static int
fuse_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *preparent,
@@ -1112,11 +1277,14 @@ fuse_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
state = frame->root->state;
finh = state->finh;
- if (op_ret == 0)
- inode_unlink (state->loc.inode, state->loc.parent,
- state->loc.name);
+ fuse_log_eh (this, "op_ret: %d, op_errno: %d, %"PRIu64": %s() %s => "
+ "gfid: %s", op_ret, op_errno, frame->root->unique,
+ gf_fop_list[frame->root->op], state->loc.path,
+ state->loc.inode ? uuid_utoa (state->loc.inode->gfid) : "");
if (op_ret == 0) {
+ inode_unlink (state->loc.inode, state->loc.parent,
+ state->loc.name);
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
"%"PRIu64": %s() %s => 0", frame->root->unique,
gf_fop_list[frame->root->op], state->loc.path);
@@ -1138,7 +1306,6 @@ fuse_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-
void
fuse_access_resume (fuse_state_t *state)
{
@@ -1146,7 +1313,7 @@ fuse_access_resume (fuse_state_t *state)
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
"%"PRIu64": ACCESS %"PRIu64" (%s) resolution failed",
state->finh->unique, state->finh->nodeid,
- uuid_utoa (state->resolve.gfid));
+ uuid_utoa (state->resolve.gfid));
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
return;
@@ -1161,7 +1328,6 @@ fuse_access_resume (fuse_state_t *state)
&state->loc, state->mask, state->xdata);
}
-
static void
fuse_access (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -1179,7 +1345,6 @@ fuse_access (xlator_t *this, fuse_in_header_t *finh, void *msg)
return;
}
-
static int
fuse_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, const char *linkname,
@@ -1191,6 +1356,12 @@ fuse_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
state = frame->root->state;
finh = state->finh;
+ fuse_log_eh (this, "op_ret: %d, op_errno: %d %"PRIu64": %s() => %s"
+ " linkname: %s, gfid: %s", op_ret, op_errno,
+ frame->root->unique, gf_fop_list[frame->root->op],
+ state->loc.gfid, linkname,
+ uuid_utoa (state->loc.gfid));
+
if (op_ret > 0) {
((char *)linkname)[op_ret] = '\0';
@@ -1213,7 +1384,6 @@ fuse_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-
void
fuse_readlink_resume (fuse_state_t *state)
{
@@ -1234,7 +1404,6 @@ fuse_readlink_resume (fuse_state_t *state)
readlink, &state->loc, 4096, state->xdata);
}
-
static void
fuse_readlink (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -1242,22 +1411,21 @@ fuse_readlink (xlator_t *this, fuse_in_header_t *finh, void *msg)
GET_STATE (this, finh, state);
- fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
+ fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
fuse_resolve_and_resume (state, fuse_readlink_resume);
return;
}
-
void
fuse_mknod_resume (fuse_state_t *state)
{
if (!state->loc.parent) {
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
- "MKNOD %"PRId64"/%s (%s/%s) resolution failed",
+ "MKNOD %"PRIu64"/%s (%s/%s) resolution failed",
state->finh->nodeid, state->resolve.bname,
- uuid_utoa (state->resolve.gfid), state->resolve.bname);
+ uuid_utoa (state->resolve.gfid), state->resolve.bname);
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
return;
@@ -1271,7 +1439,7 @@ fuse_mknod_resume (fuse_state_t *state)
if (state->loc.inode) {
gf_log (state->this->name, GF_LOG_DEBUG, "inode already present");
inode_unref (state->loc.inode);
- state->loc.inode = NULL;
+ state->loc.inode = NULL;
}
state->loc.inode = inode_new (state->loc.parent->table);
@@ -1285,7 +1453,6 @@ fuse_mknod_resume (fuse_state_t *state)
state->xdata);
}
-
static void
fuse_mknod (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -1293,11 +1460,11 @@ fuse_mknod (xlator_t *this, fuse_in_header_t *finh, void *msg)
char *name = (char *)(fmi + 1);
fuse_state_t *state = NULL;
+#if FUSE_KERNEL_MINOR_VERSION >= 12
fuse_private_t *priv = NULL;
- int32_t ret = -1;
+ int32_t ret = -1;
priv = this->private;
-#if FUSE_KERNEL_MINOR_VERSION >= 12
if (priv->proto_minor < 12)
name = (char *)msg + FUSE_COMPAT_MKNOD_IN_SIZE;
#endif
@@ -1306,48 +1473,14 @@ fuse_mknod (xlator_t *this, fuse_in_header_t *finh, void *msg)
uuid_generate (state->gfid);
- fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, name);
+ fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, name);
state->mode = fmi->mode;
state->rdev = fmi->rdev;
- priv = this->private;
#if FUSE_KERNEL_MINOR_VERSION >=12
- if (priv->proto_minor >= 12)
- state->mode &= ~fmi->umask;
- if (priv->proto_minor >= 12 && priv->acl) {
- state->xdata = dict_new ();
- if (!state->xdata) {
- gf_log ("glusterfs-fuse", GF_LOG_WARNING,
- "MKNOD Failed to allocate a param dictionary");
- send_fuse_err (this, finh, ENOMEM);
- free_fuse_state (state);
- return;
- }
- state->umask = fmi->umask;
-
- /* TODO: remove this after 3.4.0 release. keeping it for the
- sake of backward compatibility with old (3.3.[01])
- releases till then. */
- ret = dict_set_int16 (state->xdata, "umask", fmi->umask);
- if (ret < 0) {
- gf_log ("glusterfs-fuse", GF_LOG_WARNING,
- "MKNOD Failed adding umask to request");
- dict_destroy (state->xdata);
- send_fuse_err (this, finh, ENOMEM);
- free_fuse_state (state);
- return;
- }
- ret = dict_set_int16 (state->xdata, "mode", fmi->mode);
- if (ret < 0) {
- gf_log ("glusterfs-fuse", GF_LOG_WARNING,
- "MKNOD Failed adding mode to request");
- dict_destroy (state->xdata);
- send_fuse_err (this, finh, ENOMEM);
- free_fuse_state (state);
- return;
- }
- }
+ priv = this->private;
+ FUSE_ENTRY_CREATE(this, priv, finh, state, fmi, "MKNOD");
#endif
fuse_resolve_and_resume (state, fuse_mknod_resume);
@@ -1355,15 +1488,14 @@ fuse_mknod (xlator_t *this, fuse_in_header_t *finh, void *msg)
return;
}
-
void
fuse_mkdir_resume (fuse_state_t *state)
{
if (!state->loc.parent) {
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
- "MKDIR %"PRId64" (%s/%s) resolution failed",
+ "MKDIR %"PRIu64" (%s/%s) resolution failed",
state->finh->nodeid, uuid_utoa (state->resolve.gfid),
- state->resolve.bname);
+ state->resolve.bname);
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
return;
@@ -1377,7 +1509,7 @@ fuse_mkdir_resume (fuse_state_t *state)
if (state->loc.inode) {
gf_log (state->this->name, GF_LOG_DEBUG, "inode already present");
inode_unref (state->loc.inode);
- state->loc.inode = NULL;
+ state->loc.inode = NULL;
}
state->loc.inode = inode_new (state->loc.parent->table);
@@ -1390,62 +1522,29 @@ fuse_mkdir_resume (fuse_state_t *state)
mkdir, &state->loc, state->mode, state->umask, state->xdata);
}
-
static void
fuse_mkdir (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
struct fuse_mkdir_in *fmi = msg;
char *name = (char *)(fmi + 1);
+#if FUSE_KERNEL_MINOR_VERSION >=12
fuse_private_t *priv = NULL;
+ int32_t ret = -1;
+#endif
fuse_state_t *state;
- int32_t ret = -1;
GET_STATE (this, finh, state);
uuid_generate (state->gfid);
- fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, name);
+ fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, name);
state->mode = fmi->mode;
- priv = this->private;
#if FUSE_KERNEL_MINOR_VERSION >=12
- if (priv->proto_minor >= 12)
- state->mode &= ~fmi->umask;
- if (priv->proto_minor >= 12 && priv->acl) {
- state->xdata = dict_new ();
- if (!state->xdata) {
- gf_log ("glusterfs-fuse", GF_LOG_WARNING,
- "MKDIR Failed to allocate a param dictionary");
- send_fuse_err (this, finh, ENOMEM);
- free_fuse_state (state);
- return;
- }
- state->umask = fmi->umask;
-
- /* TODO: remove this after 3.4.0 release. keeping it for the
- sake of backward compatibility with old (3.3.[01])
- releases till then. */
- ret = dict_set_int16 (state->xdata, "umask", fmi->umask);
- if (ret < 0) {
- gf_log ("glusterfs-fuse", GF_LOG_WARNING,
- "MKDIR Failed adding umask to request");
- dict_destroy (state->xdata);
- send_fuse_err (this, finh, ENOMEM);
- free_fuse_state (state);
- return;
- }
- ret = dict_set_int16 (state->xdata, "mode", fmi->mode);
- if (ret < 0) {
- gf_log ("glusterfs-fuse", GF_LOG_WARNING,
- "MKDIR Failed adding mode to request");
- dict_destroy (state->xdata);
- send_fuse_err (this, finh, ENOMEM);
- free_fuse_state (state);
- return;
- }
- }
+ priv = this->private;
+ FUSE_ENTRY_CREATE(this, priv, finh, state, fmi, "MKDIR");
#endif
fuse_resolve_and_resume (state, fuse_mkdir_resume);
@@ -1453,15 +1552,14 @@ fuse_mkdir (xlator_t *this, fuse_in_header_t *finh, void *msg)
return;
}
-
void
fuse_unlink_resume (fuse_state_t *state)
{
if (!state->loc.parent || !state->loc.inode) {
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
- "UNLINK %"PRId64" (%s/%s) resolution failed",
- state->finh->nodeid, uuid_utoa (state->resolve.gfid),
- state->resolve.bname);
+ "UNLINK %"PRIu64" (%s/%s) resolution failed",
+ state->finh->nodeid, uuid_utoa (state->resolve.gfid),
+ state->resolve.bname);
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
return;
@@ -1475,7 +1573,6 @@ fuse_unlink_resume (fuse_state_t *state)
unlink, &state->loc, 0, state->xdata);
}
-
static void
fuse_unlink (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -1484,7 +1581,7 @@ fuse_unlink (xlator_t *this, fuse_in_header_t *finh, void *msg)
GET_STATE (this, finh, state);
- fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, name);
+ fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, name);
fuse_resolve_and_resume (state, fuse_unlink_resume);
@@ -1496,9 +1593,9 @@ fuse_rmdir_resume (fuse_state_t *state)
{
if (!state->loc.parent || !state->loc.inode) {
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
- "RMDIR %"PRId64" (%s/%s) resolution failed",
- state->finh->nodeid, uuid_utoa (state->resolve.gfid),
- state->resolve.bname);
+ "RMDIR %"PRIu64" (%s/%s) resolution failed",
+ state->finh->nodeid, uuid_utoa (state->resolve.gfid),
+ state->resolve.bname);
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
return;
@@ -1512,7 +1609,6 @@ fuse_rmdir_resume (fuse_state_t *state)
rmdir, &state->loc, 0, state->xdata);
}
-
static void
fuse_rmdir (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -1521,22 +1617,21 @@ fuse_rmdir (xlator_t *this, fuse_in_header_t *finh, void *msg)
GET_STATE (this, finh, state);
- fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, name);
+ fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, name);
fuse_resolve_and_resume (state, fuse_rmdir_resume);
return;
}
-
void
fuse_symlink_resume (fuse_state_t *state)
{
if (!state->loc.parent) {
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
- "SYMLINK %"PRId64" (%s/%s) -> %s resolution failed",
+ "SYMLINK %"PRIu64" (%s/%s) -> %s resolution failed",
state->finh->nodeid, uuid_utoa (state->resolve.gfid),
- state->resolve.bname, state->name);
+ state->resolve.bname, state->name);
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
return;
@@ -1550,7 +1645,7 @@ fuse_symlink_resume (fuse_state_t *state)
if (state->loc.inode) {
gf_log (state->this->name, GF_LOG_DEBUG, "inode already present");
inode_unref (state->loc.inode);
- state->loc.inode = NULL;
+ state->loc.inode = NULL;
}
state->loc.inode = inode_new (state->loc.parent->table);
@@ -1563,7 +1658,6 @@ fuse_symlink_resume (fuse_state_t *state)
symlink, state->name, &state->loc, state->umask, state->xdata);
}
-
static void
fuse_symlink (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -1575,7 +1669,7 @@ fuse_symlink (xlator_t *this, fuse_in_header_t *finh, void *msg)
uuid_generate (state->gfid);
- fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, name);
+ fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, name);
state->name = gf_strdup (linkname);
@@ -1584,7 +1678,6 @@ fuse_symlink (xlator_t *this, fuse_in_header_t *finh, void *msg)
return;
}
-
int
fuse_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *buf,
@@ -1598,9 +1691,18 @@ fuse_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
state = frame->root->state;
finh = state->finh;
+ fuse_log_eh (this, "op_ret: %d, op_errno: %d, %"PRIu64": %s() "
+ "path: %s parent: %s ==> path: %s parent: %s"
+ "gfid: %s", op_ret, op_errno, frame->root->unique,
+ gf_fop_list[frame->root->op], state->loc.path,
+ state->loc.parent?uuid_utoa (state->loc.parent->gfid):"",
+ state->loc2.path,
+ state->loc2.parent?uuid_utoa (state->loc2.parent->gfid):"",
+ state->loc.inode?uuid_utoa (state->loc.inode->gfid):"");
+
if (op_ret == 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": %s -> %s => 0 (buf->ia_ino=%"PRId64")",
+ "%"PRIu64": %s -> %s => 0 (buf->ia_ino=%"PRIu64")",
frame->root->unique, state->loc.path, state->loc2.path,
buf->ia_ino);
@@ -1640,11 +1742,11 @@ fuse_rename_resume (fuse_state_t *state)
if (!state->loc.parent || !state->loc.inode) {
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
"RENAME %"PRIu64" %s/%s -> %s/%s src resolution failed",
- state->finh->unique,
- uuid_utoa_r (state->resolve.gfid, loc_uuid),
- state->resolve.bname,
- uuid_utoa_r (state->resolve2.gfid, loc2_uuid),
- state->resolve2.bname);
+ state->finh->unique,
+ uuid_utoa_r (state->resolve.gfid, loc_uuid),
+ state->resolve.bname,
+ uuid_utoa_r (state->resolve2.gfid, loc2_uuid),
+ state->resolve2.bname);
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
@@ -1654,11 +1756,11 @@ fuse_rename_resume (fuse_state_t *state)
if (!state->loc2.parent) {
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
"RENAME %"PRIu64" %s/%s -> %s/%s dst resolution failed",
- state->finh->unique,
- uuid_utoa_r (state->resolve.gfid, loc_uuid),
- state->resolve.bname,
- uuid_utoa_r (state->resolve2.gfid, loc2_uuid),
- state->resolve2.bname);
+ state->finh->unique,
+ uuid_utoa_r (state->resolve.gfid, loc_uuid),
+ state->resolve.bname,
+ uuid_utoa_r (state->resolve2.gfid, loc2_uuid),
+ state->resolve2.bname);
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
@@ -1677,7 +1779,6 @@ fuse_rename_resume (fuse_state_t *state)
rename, &state->loc, &state->loc2, state->xdata);
}
-
static void
fuse_rename (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -1688,16 +1789,15 @@ fuse_rename (xlator_t *this, fuse_in_header_t *finh, void *msg)
GET_STATE (this, finh, state);
- fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, oldname);
+ fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, oldname);
- fuse_resolve_entry_init (state, &state->resolve2, fri->newdir, newname);
+ fuse_resolve_entry_init (state, &state->resolve2, fri->newdir, newname);
fuse_resolve_and_resume (state, fuse_rename_resume);
return;
}
-
void
fuse_link_resume (fuse_state_t *state)
{
@@ -1713,11 +1813,11 @@ fuse_link_resume (fuse_state_t *state)
state->resolve.op_ret = 0;
state->resolve2.op_ret = 0;
- if (state->loc.inode) {
- inode_unref (state->loc.inode);
- state->loc.inode = NULL;
- }
- state->loc.inode = inode_ref (state->loc2.inode);
+ if (state->loc.inode) {
+ inode_unref (state->loc.inode);
+ state->loc.inode = NULL;
+ }
+ state->loc.inode = inode_ref (state->loc2.inode);
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
"%"PRIu64": LINK() %s -> %s",
@@ -1728,7 +1828,6 @@ fuse_link_resume (fuse_state_t *state)
link, &state->loc2, &state->loc, state->xdata);
}
-
static void
fuse_link (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -1738,16 +1837,15 @@ fuse_link (xlator_t *this, fuse_in_header_t *finh, void *msg)
GET_STATE (this, finh, state);
- fuse_resolve_inode_init (state, &state->resolve2, fli->oldnodeid);
+ fuse_resolve_inode_init (state, &state->resolve2, fli->oldnodeid);
- fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, name);
+ fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, name);
fuse_resolve_and_resume (state, fuse_link_resume);
return;
}
-
static int
fuse_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
@@ -1768,16 +1866,19 @@ fuse_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
finh = state->finh;
foo.open_flags = 0;
+ fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
if (op_ret >= 0) {
foo.fh = (uintptr_t) fd;
if (((priv->direct_io_mode == 2)
&& ((state->flags & O_ACCMODE) != O_RDONLY))
- || (priv->direct_io_mode == 1))
+ || (priv->direct_io_mode == 1)
+ || direct_io_mode (xdata))
foo.open_flags |= FOPEN_DIRECT_IO;
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": %s() %s => %p (ino=%"PRId64")",
+ "%"PRIu64": %s() %s => %p (ino=%"PRIu64")",
frame->root->unique, gf_fop_list[frame->root->op],
state->loc.path, fd, buf->ia_ino);
@@ -1844,7 +1945,6 @@ out:
return 0;
}
-
void
fuse_create_resume (fuse_state_t *state)
{
@@ -1856,7 +1956,7 @@ fuse_create_resume (fuse_state_t *state)
gf_log ("glusterfs-fuse", GF_LOG_WARNING,
"%"PRIu64" CREATE %s/%s resolution failed",
state->finh->unique, uuid_utoa (state->resolve.gfid),
- state->resolve.bname);
+ state->resolve.bname);
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
return;
@@ -1913,23 +2013,22 @@ fuse_create_resume (fuse_state_t *state)
}
-
static void
fuse_create (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
#if FUSE_KERNEL_MINOR_VERSION >= 12
struct fuse_create_in *fci = msg;
+ fuse_private_t *priv = NULL;
+ int32_t ret = -1;
#else
struct fuse_open_in *fci = msg;
#endif
char *name = (char *)(fci + 1);
- fuse_private_t *priv = NULL;
fuse_state_t *state = NULL;
- int32_t ret = -1;
- priv = this->private;
#if FUSE_KERNEL_MINOR_VERSION >= 12
+ priv = this->private;
if (priv->proto_minor < 12)
name = (char *)((struct fuse_open_in *)msg + 1);
#endif
@@ -1938,56 +2037,20 @@ fuse_create (xlator_t *this, fuse_in_header_t *finh, void *msg)
uuid_generate (state->gfid);
- fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, name);
+ fuse_resolve_entry_init (state, &state->resolve, finh->nodeid, name);
state->mode = fci->mode;
state->flags = fci->flags;
- priv = this->private;
#if FUSE_KERNEL_MINOR_VERSION >=12
- if (priv->proto_minor >= 12)
- state->mode &= ~fci->umask;
- if (priv->proto_minor >= 12 && priv->acl) {
- state->xdata = dict_new ();
- if (!state->xdata) {
- gf_log ("glusterfs-fuse", GF_LOG_WARNING,
- "CREATE Failed to allocate a param dictionary");
- send_fuse_err (this, finh, ENOMEM);
- free_fuse_state (state);
- return;
- }
- state->umask = fci->umask;
-
- /* TODO: remove this after 3.4.0 release. keeping it for the
- sake of backward compatibility with old (3.3.[01])
- releases till then. */
- ret = dict_set_int16 (state->xdata, "umask", fci->umask);
- if (ret < 0) {
- gf_log ("glusterfs-fuse", GF_LOG_WARNING,
- "CREATE Failed adding umask to request");
- dict_destroy (state->xdata);
- send_fuse_err (this, finh, ENOMEM);
- free_fuse_state (state);
- return;
- }
- ret = dict_set_int16 (state->xdata, "mode", fci->mode);
- if (ret < 0) {
- gf_log ("glusterfs-fuse", GF_LOG_WARNING,
- "CREATE Failed adding mode to request");
- dict_destroy (state->xdata);
- send_fuse_err (this, finh, ENOMEM);
- free_fuse_state (state);
- return;
- }
- }
+ priv = this->private;
+ FUSE_ENTRY_CREATE(this, priv, finh, state, fci, "CREATE");
#endif
-
fuse_resolve_and_resume (state, fuse_create_resume);
return;
}
-
void
fuse_open_resume (fuse_state_t *state)
{
@@ -2039,7 +2102,6 @@ fuse_open_resume (fuse_state_t *state)
open, &state->loc, state->flags, fd, state->xdata);
}
-
static void
fuse_open (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -2048,7 +2110,7 @@ fuse_open (xlator_t *this, fuse_in_header_t *finh, void *msg)
GET_STATE (this, finh, state);
- fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
+ fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
state->flags = foi->flags;
@@ -2057,7 +2119,6 @@ fuse_open (xlator_t *this, fuse_in_header_t *finh, void *msg)
return;
}
-
static int
fuse_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
@@ -2072,9 +2133,11 @@ fuse_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
state = frame->root->state;
finh = state->finh;
+ fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
if (op_ret >= 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": READ => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRId64,
+ "%"PRIu64": READ => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRIu64,
frame->root->unique,
op_ret, state->size, state->off, stbuf->ia_size);
@@ -2118,7 +2181,9 @@ fuse_readv (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
struct fuse_read_in *fri = msg;
+#if FUSE_KERNEL_MINOR_VERSION >= 9
fuse_private_t *priv = NULL;
+#endif
fuse_state_t *state = NULL;
fd_t *fd = NULL;
@@ -2127,11 +2192,11 @@ fuse_readv (xlator_t *this, fuse_in_header_t *finh, void *msg)
fd = FH_TO_FD (fri->fh);
state->fd = fd;
- fuse_resolve_fd_init (state, &state->resolve, fd);
+ fuse_resolve_fd_init (state, &state->resolve, fd);
/* See comment by similar code in fuse_settatr */
- priv = this->private;
#if FUSE_KERNEL_MINOR_VERSION >= 9
+ priv = this->private;
if (priv->proto_minor >= 9 && fri->read_flags & FUSE_READ_LOCKOWNER)
state->lk_owner = fri->lock_owner;
#endif
@@ -2139,12 +2204,12 @@ fuse_readv (xlator_t *this, fuse_in_header_t *finh, void *msg)
state->size = fri->size;
state->off = fri->offset;
/* lets ignore 'fri->read_flags', but just consider 'fri->flags' */
+#if FUSE_KERNEL_MINOR_VERSION >= 9
state->io_flags = fri->flags;
-
+#endif
fuse_resolve_and_resume (state, fuse_readv_resume);
}
-
static int
fuse_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
@@ -2157,9 +2222,11 @@ fuse_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
state = frame->root->state;
finh = state->finh;
+ fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
if (op_ret >= 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": WRITE => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRId64,
+ "%"PRIu64": WRITE => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRIu64,
frame->root->unique,
op_ret, state->size, state->off, stbuf->ia_size);
@@ -2201,7 +2268,7 @@ fuse_write_resume (fuse_state_t *state)
iobref_add (iobref, iobuf);
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": WRITE (%p, size=%"PRId64", offset=%"PRId64")",
+ "%"PRIu64": WRITE (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")",
state->finh->unique, state->fd, state->size, state->off);
FUSE_FOP (state, fuse_writev_cbk, GF_FOP_WRITE, writev, state->fd,
@@ -2220,11 +2287,12 @@ fuse_write (xlator_t *this, fuse_in_header_t *finh, void *msg)
struct fuse_write_in *fwi = (struct fuse_write_in *)
(finh + 1);
- fuse_private_t *priv = NULL;
fuse_state_t *state = NULL;
fd_t *fd = NULL;
-
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+ fuse_private_t *priv = NULL;
priv = this->private;
+#endif
GET_STATE (this, finh, state);
fd = FH_TO_FD (fwi->fh);
@@ -2233,17 +2301,21 @@ fuse_write (xlator_t *this, fuse_in_header_t *finh, void *msg)
state->off = fwi->offset;
/* lets ignore 'fwi->write_flags', but just consider 'fwi->flags' */
+#if FUSE_KERNEL_MINOR_VERSION >= 9
state->io_flags = fwi->flags;
+#else
+ state->io_flags = fwi->write_flags;
+#endif
/* TODO: may need to handle below flag
(fwi->write_flags & FUSE_WRITE_CACHE);
*/
- fuse_resolve_fd_init (state, &state->resolve, fd);
+ fuse_resolve_fd_init (state, &state->resolve, fd);
/* See comment by similar code in fuse_settatr */
- priv = this->private;
#if FUSE_KERNEL_MINOR_VERSION >= 9
+ priv = this->private;
if (priv->proto_minor >= 9 && fwi->write_flags & FUSE_WRITE_LOCKOWNER)
state->lk_owner = fwi->lock_owner;
#endif
@@ -2256,7 +2328,6 @@ fuse_write (xlator_t *this, fuse_in_header_t *finh, void *msg)
return;
}
-
void
fuse_flush_resume (fuse_state_t *state)
{
@@ -2264,7 +2335,6 @@ fuse_flush_resume (fuse_state_t *state)
flush, state->fd, state->xdata);
}
-
static void
fuse_flush (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -2277,7 +2347,7 @@ fuse_flush (xlator_t *this, fuse_in_header_t *finh, void *msg)
fd = FH_TO_FD (ffi->fh);
state->fd = fd;
- fuse_resolve_fd_init (state, &state->resolve, fd);
+ fuse_resolve_fd_init (state, &state->resolve, fd);
state->lk_owner = ffi->lock_owner;
@@ -2289,6 +2359,15 @@ fuse_flush (xlator_t *this, fuse_in_header_t *finh, void *msg)
return;
}
+int
+fuse_internal_release (xlator_t *this, fd_t *fd)
+{
+ //This is a place holder function to prevent "xlator does not implement
+ //release_cbk" Warning log.
+ //Actual release happens as part of fuse_release which gets executed
+ //when kernel fuse sends it.
+ return 0;
+}
static void
fuse_release (xlator_t *this, fuse_in_header_t *finh, void *msg)
@@ -2308,6 +2387,9 @@ fuse_release (xlator_t *this, fuse_in_header_t *finh, void *msg)
priv = this->private;
+ fuse_log_eh (this, "RELEASE(): %"PRIu64":, fd: %p, gfid: %s",
+ finh->unique, fd, uuid_utoa (fd->inode->gfid));
+
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
"%"PRIu64": RELEASE %p", finh->unique, state->fd);
@@ -2335,20 +2417,18 @@ fuse_release (xlator_t *this, fuse_in_header_t *finh, void *msg)
return;
}
-
void
fuse_fsync_resume (fuse_state_t *state)
{
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
"%"PRIu64": FSYNC %p", state->finh->unique,
- state->fd);
+ state->fd);
/* fsync_flags: 1 means "datasync" (no defines for this) */
FUSE_FOP (state, fuse_fsync_cbk, GF_FOP_FSYNC,
fsync, state->fd, (state->flags & 1), state->xdata);
}
-
static void
fuse_fsync (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -2361,14 +2441,13 @@ fuse_fsync (xlator_t *this, fuse_in_header_t *finh, void *msg)
fd = FH_TO_FD (fsi->fh);
state->fd = fd;
- fuse_resolve_fd_init (state, &state->resolve, fd);
+ fuse_resolve_fd_init (state, &state->resolve, fd);
state->flags = fsi->fsync_flags;
fuse_resolve_and_resume (state, fuse_fsync_resume);
return;
}
-
void
fuse_opendir_resume (fuse_state_t *state)
{
@@ -2418,7 +2497,6 @@ fuse_opendir_resume (fuse_state_t *state)
opendir, &state->loc, fd, state->xdata);
}
-
static void
fuse_opendir (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -2435,7 +2513,6 @@ fuse_opendir (xlator_t *this, fuse_in_header_t *finh, void *msg)
fuse_resolve_and_resume (state, fuse_opendir_resume);
}
-
unsigned char
d_type_from_stat (struct iatt *buf)
{
@@ -2469,7 +2546,6 @@ d_type_from_stat (struct iatt *buf)
return d_type;
}
-
static int
fuse_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
@@ -2477,7 +2553,8 @@ fuse_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
{
fuse_state_t *state = NULL;
fuse_in_header_t *finh = NULL;
- int size = 0;
+ size_t size = 0;
+ size_t max_size = 0;
char *buf = NULL;
gf_dirent_t *entry = NULL;
struct fuse_dirent *fde = NULL;
@@ -2487,6 +2564,8 @@ fuse_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
finh = state->finh;
priv = state->this->private;
+ fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
if (op_ret < 0) {
gf_log ("glusterfs-fuse", GF_LOG_WARNING,
"%"PRIu64": READDIR => -1 (%s)", frame->root->unique,
@@ -2501,11 +2580,23 @@ fuse_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
frame->root->unique, op_ret, state->size, state->off);
list_for_each_entry (entry, &entries->list, list) {
- size += FUSE_DIRENT_ALIGN (FUSE_NAME_OFFSET +
- strlen (entry->d_name));
+ size_t fde_size = FUSE_DIRENT_ALIGN (FUSE_NAME_OFFSET +
+ strlen (entry->d_name));
+ max_size += fde_size;
+
+ if (max_size > state->size) {
+ /* we received too many entries to fit in the reply */
+ max_size -= fde_size;
+ break;
+ }
+ }
+
+ if (max_size == 0) {
+ send_fuse_data (this, finh, 0, 0);
+ goto out;
}
- buf = GF_CALLOC (1, size, gf_fuse_mt_char);
+ buf = GF_CALLOC (1, max_size, gf_fuse_mt_char);
if (!buf) {
gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
"%"PRIu64": READDIR => -1 (%s)", frame->root->unique,
@@ -2519,6 +2610,9 @@ fuse_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
fde = (struct fuse_dirent *)(buf + size);
gf_fuse_fill_dirent (entry, fde, priv->enable_ino32);
size += FUSE_DIRENT_SIZE (fde);
+
+ if (size == max_size)
+ break;
}
send_fuse_data (this, finh, buf, size);
@@ -2534,19 +2628,17 @@ out:
}
-
void
fuse_readdir_resume (fuse_state_t *state)
{
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
- "%"PRIu64": READDIR (%p, size=%zu, offset=%"PRId64")",
+ "%"PRIu64": READDIR (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")",
state->finh->unique, state->fd, state->size, state->off);
FUSE_FOP (state, fuse_readdir_cbk, GF_FOP_READDIR,
readdir, state->fd, state->size, state->off, state->xdata);
}
-
static void
fuse_readdir (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -2561,11 +2653,201 @@ fuse_readdir (xlator_t *this, fuse_in_header_t *finh, void *msg)
fd = FH_TO_FD (fri->fh);
state->fd = fd;
- fuse_resolve_fd_init (state, &state->resolve, fd);
+ fuse_resolve_fd_init (state, &state->resolve, fd);
fuse_resolve_and_resume (state, fuse_readdir_resume);
}
+#if FUSE_KERNEL_MINOR_VERSION >= 20
+static int
+fuse_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ fuse_state_t *state = NULL;
+ fuse_in_header_t *finh = NULL;
+ size_t max_size = 0;
+ size_t size = 0;
+ char *buf = NULL;
+ gf_dirent_t *entry = NULL;
+ struct fuse_direntplus *fde = NULL;
+ struct fuse_entry_out *feo = NULL;
+ fuse_private_t *priv = NULL;
+
+ state = frame->root->state;
+ finh = state->finh;
+ priv = this->private;
+
+ if (op_ret < 0) {
+ gf_log ("glusterfs-fuse", GF_LOG_WARNING,
+ "%"PRIu64": READDIRP => -1 (%s)", frame->root->unique,
+ strerror (op_errno));
+
+ send_fuse_err (this, finh, op_errno);
+ goto out;
+ }
+
+ gf_log ("glusterfs-fuse", GF_LOG_TRACE,
+ "%"PRIu64": READDIRP => %d/%"GF_PRI_SIZET",%"PRId64,
+ frame->root->unique, op_ret, state->size, state->off);
+
+ list_for_each_entry (entry, &entries->list, list) {
+ size_t fdes = FUSE_DIRENT_ALIGN (FUSE_NAME_OFFSET_DIRENTPLUS +
+ strlen (entry->d_name));
+ max_size += fdes;
+
+ if (max_size > state->size) {
+ /* we received too many entries to fit in the reply */
+ max_size -= fdes;
+ break;
+ }
+ }
+
+ if (max_size == 0) {
+ send_fuse_data (this, finh, 0, 0);
+ goto out;
+ }
+
+ buf = GF_CALLOC (1, max_size, gf_fuse_mt_char);
+ if (!buf) {
+ gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+ "%"PRIu64": READDIRP => -1 (%s)", frame->root->unique,
+ strerror (ENOMEM));
+ send_fuse_err (this, finh, ENOMEM);
+ goto out;
+ }
+
+ size = 0;
+ list_for_each_entry (entry, &entries->list, list) {
+ inode_t *linked_inode;
+
+ fde = (struct fuse_direntplus *)(buf + size);
+ feo = &fde->entry_out;
+ fde->dirent.ino = entry->d_ino;
+ fde->dirent.off = entry->d_off;
+ fde->dirent.type = entry->d_type;
+ fde->dirent.namelen = strlen (entry->d_name);
+ strncpy (fde->dirent.name, entry->d_name, fde->dirent.namelen);
+ size += FUSE_DIRENTPLUS_SIZE (fde);
+
+ if (!entry->inode)
+ goto next_entry;
+
+ entry->d_stat.ia_blksize = this->ctx->page_size;
+ gf_fuse_stat2attr (&entry->d_stat, &feo->attr, priv->enable_ino32);
+
+ linked_inode = inode_link (entry->inode, state->fd->inode,
+ entry->d_name, &entry->d_stat);
+ if (!linked_inode)
+ goto next_entry;
+
+ inode_lookup (linked_inode);
+
+ feo->nodeid = inode_to_fuse_nodeid (linked_inode);
+
+ fuse_inode_set_need_lookup (linked_inode, this);
+
+ inode_unref (linked_inode);
+
+ feo->entry_valid =
+ calc_timeout_sec (priv->entry_timeout);
+ feo->entry_valid_nsec =
+ calc_timeout_nsec (priv->entry_timeout);
+ feo->attr_valid =
+ calc_timeout_sec (priv->attribute_timeout);
+ feo->attr_valid_nsec =
+ calc_timeout_nsec (priv->attribute_timeout);
+
+next_entry:
+ if (size == max_size)
+ break;
+ }
+
+ send_fuse_data (this, finh, buf, size);
+out:
+ free_fuse_state (state);
+ STACK_DESTROY (frame->root);
+ GF_FREE (buf);
+ return 0;
+
+}
+
+void
+fuse_readdirp_resume (fuse_state_t *state)
+{
+ gf_log ("glusterfs-fuse", GF_LOG_TRACE,
+ "%"PRIu64": READDIRP (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")",
+ state->finh->unique, state->fd, state->size, state->off);
+
+ FUSE_FOP (state, fuse_readdirp_cbk, GF_FOP_READDIRP,
+ readdirp, state->fd, state->size, state->off, state->xdata);
+}
+
+
+static void
+fuse_readdirp (xlator_t *this, fuse_in_header_t *finh, void *msg)
+{
+ struct fuse_read_in *fri = msg;
+
+ fuse_state_t *state = NULL;
+ fd_t *fd = NULL;
+
+ GET_STATE (this, finh, state);
+ state->size = fri->size;
+ state->off = fri->offset;
+ fd = FH_TO_FD (fri->fh);
+ state->fd = fd;
+
+ fuse_resolve_fd_init (state, &state->resolve, fd);
+
+ fuse_resolve_and_resume (state, fuse_readdirp_resume);
+}
+#endif
+
+#if FUSE_KERNEL_MINOR_VERSION >= 19
+#ifdef FALLOC_FL_KEEP_SIZE
+static int
+fuse_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ return fuse_err_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+}
+
+static void
+fuse_fallocate_resume(fuse_state_t *state)
+{
+ gf_log("glusterfs-fuse", GF_LOG_TRACE,
+ "%"PRIu64": FALLOCATE (%p, flags=%d, size=%zu, offset=%"PRId64")",
+ state->finh->unique, state->fd, state->flags, state->size,
+ state->off);
+
+ if (state->flags & FALLOC_FL_PUNCH_HOLE)
+ FUSE_FOP(state, fuse_fallocate_cbk, GF_FOP_DISCARD, discard,
+ state->fd, state->off, state->size, state->xdata);
+ else
+ FUSE_FOP(state, fuse_fallocate_cbk, GF_FOP_FALLOCATE, fallocate,
+ state->fd, (state->flags & FALLOC_FL_KEEP_SIZE),
+ state->off, state->size, state->xdata);
+}
+
+static void
+fuse_fallocate(xlator_t *this, fuse_in_header_t *finh, void *msg)
+{
+ struct fuse_fallocate_in *ffi = msg;
+ fuse_state_t *state = NULL;
+
+ GET_STATE(this, finh, state);
+ state->off = ffi->offset;
+ state->size = ffi->length;
+ state->flags = ffi->mode;
+ state->fd = FH_TO_FD(ffi->fh);
+
+ fuse_resolve_fd_init(state, &state->resolve, state->fd);
+ fuse_resolve_and_resume(state, fuse_fallocate_resume);
+}
+#endif /* FALLOC_FL_KEEP_SIZE */
+#endif /* FUSE minor version >= 19 */
static void
fuse_releasedir (xlator_t *this, fuse_in_header_t *finh, void *msg)
@@ -2583,6 +2865,10 @@ fuse_releasedir (xlator_t *this, fuse_in_header_t *finh, void *msg)
priv = this->private;
+ fuse_log_eh (this, "RELEASEDIR (): %"PRIu64": fd: %p, gfid: %s",
+ finh->unique, state->fd,
+ uuid_utoa (state->fd->inode->gfid));
+
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
"%"PRIu64": RELEASEDIR %p", finh->unique, state->fd);
@@ -2634,7 +2920,7 @@ fuse_fsyncdir (xlator_t *this, fuse_in_header_t *finh, void *msg)
GET_STATE (this, finh, state);
state->fd = fd;
- fuse_resolve_fd_init (state, &state->resolve, fd);
+ fuse_resolve_fd_init (state, &state->resolve, fd);
state->flags = fsi->fsync_flags;
fuse_resolve_and_resume (state, fuse_fsyncdir_resume);
@@ -2642,7 +2928,6 @@ fuse_fsyncdir (xlator_t *this, fuse_in_header_t *finh, void *msg)
return;
}
-
static int
fuse_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct statvfs *buf,
@@ -2657,6 +2942,10 @@ fuse_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
priv = this->private;
finh = state->finh;
+ fuse_log_eh (this, "op_ret: %d, op_errno: %d, %"PRIu64": %s()",
+ op_ret, op_errno, frame->root->unique,
+ gf_fop_list[frame->root->op]);
+
if (op_ret == 0) {
#ifndef GF_DARWIN_HOST_OS
/* MacFUSE doesn't respect anyof these tweaks */
@@ -2696,7 +2985,6 @@ fuse_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
return 0;
}
-
void
fuse_statfs_resume (fuse_state_t *state)
{
@@ -2725,9 +3013,9 @@ fuse_statfs (xlator_t *this, fuse_in_header_t *finh, void *msg)
GET_STATE (this, finh, state);
- fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
+ fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
- fuse_resolve_and_resume (state, fuse_statfs_resume);
+ fuse_resolve_and_resume (state, fuse_statfs_resume);
}
@@ -2739,7 +3027,7 @@ fuse_setxattr_resume (fuse_state_t *state)
"%"PRIu64": SETXATTR %s/%"PRIu64" (%s) "
"resolution failed",
state->finh->unique, uuid_utoa (state->resolve.gfid),
- state->finh->nodeid, state->name);
+ state->finh->nodeid, state->name);
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
return;
@@ -2802,21 +3090,21 @@ fuse_setxattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
}
if (!priv->acl) {
- if ((strcmp (name, "system.posix_acl_access") == 0) ||
- (strcmp (name, "system.posix_acl_default") == 0)) {
+ if ((strcmp (name, POSIX_ACL_ACCESS_XATTR) == 0) ||
+ (strcmp (name, POSIX_ACL_DEFAULT_XATTR) == 0)) {
send_fuse_err (this, finh, EOPNOTSUPP);
GF_FREE (finh);
return;
}
}
- if (!priv->selinux) {
- if (strncmp (name, "security.", 9) == 0) {
- send_fuse_err (this, finh, EOPNOTSUPP);
- GF_FREE (finh);
- return;
- }
- }
+ if (!priv->selinux) {
+ if (strncmp (name, "security.", 9) == 0) {
+ send_fuse_err (this, finh, EOPNOTSUPP);
+ GF_FREE (finh);
+ return;
+ }
+ }
/* Check if the command is for changing the log
level of process or specific xlator */
@@ -2831,12 +3119,14 @@ fuse_setxattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
gf_log ("fuse", GF_LOG_TRACE,
"got request to invalidate %"PRIu64, finh->nodeid);
send_fuse_err (this, finh, 0);
+#if FUSE_KERNEL_MINOR_VERSION >= 11
fuse_invalidate_entry (this, finh->nodeid);
+#endif
GF_FREE (finh);
return;
}
- if (!strcmp (GFID_XATTR_KEY, name)) {
+ if (!strcmp (GFID_XATTR_KEY, name) || !strcmp (GF_XATTR_VOL_ID_KEY, name)) {
send_fuse_err (this, finh, EPERM);
GF_FREE (finh);
return;
@@ -2845,7 +3135,7 @@ fuse_setxattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
GET_STATE (this, finh, state);
state->size = fsi->size;
- fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
+ fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
state->xattr = get_new_dict ();
if (!state->xattr) {
@@ -2913,10 +3203,10 @@ send_fuse_xattr (xlator_t *this, fuse_in_header_t *finh, const char *value,
* when it tries to setxattr() for selinux xattrs
*/
static int
-fuse_filter_xattr(xlator_t *this, char *key)
+fuse_filter_xattr(char *key)
{
int need_filter = 0;
- struct fuse_private *priv = this->private;
+ struct fuse_private *priv = THIS->private;
if ((priv->client_pid == GF_CLIENT_PID_GSYNCD)
&& fnmatch ("*.selinux*", key, FNM_PERIOD) == 0)
@@ -2937,10 +3227,13 @@ fuse_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
data_t *value_data = NULL;
int ret = -1;
int32_t len = 0;
+ int32_t len_next = 0;
state = frame->root->state;
finh = state->finh;
+ fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
if (op_ret >= 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
"%"PRIu64": %s() %s => %d", frame->root->unique,
@@ -2965,32 +3258,20 @@ fuse_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
/* we need to invoke fuse_filter_xattr() twice. Once
* while counting size and then while filling buffer
*/
- int _get_total_len (dict_t *d, char *k, data_t *v,
- void *tmp)
- {
- if (!fuse_filter_xattr (this, k))
- len += strlen (k) + 1;
- return 0;
- }
- dict_foreach (dict, _get_total_len, NULL);
+ len = dict_keys_join (NULL, 0, dict, fuse_filter_xattr);
+ if (len < 0)
+ goto out;
value = alloca (len + 1);
if (!value)
goto out;
- len = 0;
-
- int _set_listxattr_keys (dict_t *d, char *k, data_t *v,
- void *tmp)
- {
- if (!fuse_filter_xattr (this, k)) {
- strcpy (value + len, k);
- value[len + strlen (k)] = '\0';
- len += strlen (k) + 1;
- }
- return 0;
- }
- dict_foreach (dict, _set_listxattr_keys, NULL);
+ len_next = dict_keys_join (value, len, dict,
+ fuse_filter_xattr);
+ if (len_next != len)
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "sizes not equal %d != %d",
+ len, len_next);
send_fuse_xattr (this, finh, value, len, state->size);
} /* if(state->name)...else */
@@ -3036,13 +3317,15 @@ out:
void
fuse_getxattr_resume (fuse_state_t *state)
{
+ char *value = NULL;
+
if (!state->loc.inode) {
gf_log ("glusterfs-fuse", GF_LOG_WARNING,
"%"PRIu64": GETXATTR %s/%"PRIu64" (%s) "
- "resolution failed",
+ "resolution failed",
state->finh->unique,
- uuid_utoa (state->resolve.gfid),
- state->finh->nodeid, state->name);
+ uuid_utoa (state->resolve.gfid),
+ state->finh->nodeid, state->name);
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
@@ -3053,6 +3336,46 @@ fuse_getxattr_resume (fuse_state_t *state)
state->fd = fd_lookup (state->loc.inode, state->finh->pid);
#endif /* GF_TEST_FFOP */
+ if (state->name &&
+ (strcmp (state->name, VIRTUAL_GFID_XATTR_KEY) == 0)) {
+ /* send glusterfs gfid in binary form */
+
+ value = GF_CALLOC (16 + 1, sizeof(char),
+ gf_common_mt_char);
+ if (!value) {
+ send_fuse_err (state->this, state->finh, ENOMEM);
+ goto internal_out;
+ }
+ memcpy (value, state->loc.inode->gfid, 16);
+
+ send_fuse_xattr (THIS, state->finh, value, 16, state->size);
+ GF_FREE (value);
+ internal_out:
+ free_fuse_state (state);
+ return;
+ }
+
+ if (state->name &&
+ (strcmp (state->name, VIRTUAL_GFID_XATTR_KEY_STR) == 0)) {
+ /* transform binary gfid to canonical form */
+
+ value = GF_CALLOC (UUID_CANONICAL_FORM_LEN + 1, sizeof(char),
+ gf_common_mt_char);
+ if (!value) {
+ send_fuse_err (state->this, state->finh, ENOMEM);
+ goto internal_out1;
+ }
+ uuid_utoa_r (state->loc.inode->gfid, value);
+
+ send_fuse_xattr (THIS, state->finh, value,
+ UUID_CANONICAL_FORM_LEN, state->size);
+ GF_FREE (value);
+ internal_out1:
+ free_fuse_state (state);
+ return;
+ }
+
+
if (state->fd) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
"%"PRIu64": GETXATTR %p/%"PRIu64" (%s)", state->finh->unique,
@@ -3074,15 +3397,16 @@ fuse_getxattr_resume (fuse_state_t *state)
static void
fuse_getxattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
- struct fuse_getxattr_in *fgxi = msg;
- char *name = (char *)(fgxi + 1);
-
- fuse_state_t *state = NULL;
- struct fuse_private *priv = NULL;
- int rv = 0;
- char *newkey = NULL;
+ struct fuse_getxattr_in *fgxi = msg;
+ char *name = (char *)(fgxi + 1);
+ fuse_state_t *state = NULL;
+ struct fuse_private *priv = NULL;
+ int rv = 0;
+ int op_errno = EINVAL;
+ char *newkey = NULL;
priv = this->private;
+ GET_STATE (this, finh, state);
#ifdef GF_DARWIN_HOST_OS
if (fgxi->position) {
@@ -3098,45 +3422,43 @@ fuse_getxattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
"%"PRIu64": GETXATTR %s/%"PRIu64" (%s):"
"refusing positioned getxattr",
finh->unique, state->loc.path, finh->nodeid, name);
- send_fuse_err (this, finh, EINVAL);
- FREE (finh);
- return;
+ op_errno = EINVAL;
+ goto err;
}
#endif
if (!priv->acl) {
- if ((strcmp (name, "system.posix_acl_access") == 0) ||
- (strcmp (name, "system.posix_acl_default") == 0)) {
- send_fuse_err (this, finh, ENOTSUP);
- GF_FREE (finh);
- return;
+ if ((strcmp (name, POSIX_ACL_ACCESS_XATTR) == 0) ||
+ (strcmp (name, POSIX_ACL_DEFAULT_XATTR) == 0)) {
+ op_errno = ENOTSUP;
+ goto err;
}
}
- if (!priv->selinux) {
- if (strncmp (name, "security.", 9) == 0) {
- send_fuse_err (this, finh, ENODATA);
- GF_FREE (finh);
- return;
- }
- }
-
- GET_STATE (this, finh, state);
+ if (!priv->selinux) {
+ if (strncmp (name, "security.", 9) == 0) {
+ op_errno = ENODATA;
+ goto err;
+ }
+ }
- fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
+ fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
rv = fuse_flip_xattr_ns (priv, name, &newkey);
if (rv) {
- send_fuse_err (this, finh, ENOMEM);
- free_fuse_state (state);
- goto out;
+ op_errno = ENOMEM;
+ goto err;
}
state->size = fgxi->size;
state->name = newkey;
fuse_resolve_and_resume (state, fuse_getxattr_resume);
- out:
+
+ return;
+ err:
+ send_fuse_err (this, finh, op_errno);
+ free_fuse_state (state);
return;
}
@@ -3148,7 +3470,7 @@ fuse_listxattr_resume (fuse_state_t *state)
gf_log ("glusterfs-fuse", GF_LOG_WARNING,
"%"PRIu64": LISTXATTR %s/%"PRIu64
"resolution failed", state->finh->unique,
- uuid_utoa (state->resolve.gfid), state->finh->nodeid);
+ uuid_utoa (state->resolve.gfid), state->finh->nodeid);
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
@@ -3185,7 +3507,7 @@ fuse_listxattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
GET_STATE (this, finh, state);
- fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
+ fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
state->size = fgxi->size;
@@ -3201,9 +3523,9 @@ fuse_removexattr_resume (fuse_state_t *state)
if (!state->loc.inode) {
gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
"%"PRIu64": REMOVEXATTR %s/%"PRIu64" (%s) "
- "resolution failed",
+ "resolution failed",
state->finh->unique, uuid_utoa (state->resolve.gfid),
- state->finh->nodeid, state->name);
+ state->finh->nodeid, state->name);
send_fuse_err (state->this, state->finh, ENOENT);
free_fuse_state (state);
@@ -3242,7 +3564,7 @@ fuse_removexattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
int32_t ret = -1;
char *newkey = NULL;
- if (!strcmp (GFID_XATTR_KEY, name)) {
+ if (!strcmp (GFID_XATTR_KEY, name) || !strcmp (GF_XATTR_VOL_ID_KEY, name)) {
send_fuse_err (this, finh, EPERM);
GF_FREE (finh);
return;
@@ -3252,7 +3574,7 @@ fuse_removexattr (xlator_t *this, fuse_in_header_t *finh, void *msg)
GET_STATE (this, finh, state);
- fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
+ fuse_resolve_inode_init (state, &state->resolve, finh->nodeid);
ret = fuse_flip_xattr_ns (priv, name, &newkey);
if (ret) {
@@ -3280,6 +3602,8 @@ fuse_getlk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
state = frame->root->state;
struct fuse_lk_out flo = {{0, }, };
+ fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
if (op_ret == 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
"%"PRIu64": ERR => 0", frame->root->unique);
@@ -3341,7 +3665,7 @@ fuse_getlk (xlator_t *this, fuse_in_header_t *finh, void *msg)
GET_STATE (this, finh, state);
state->fd = fd;
- fuse_resolve_fd_init (state, &state->resolve, fd);
+ fuse_resolve_fd_init (state, &state->resolve, fd);
convert_fuse_file_lock (&fli->lk, &state->lk_lock,
fli->owner);
@@ -3365,6 +3689,8 @@ fuse_setlk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
state = frame->root->state;
op = state->finh->opcode;
+ fuse_log_eh_fop(this, state, frame, op_ret, op_errno);
+
if (op_ret == 0) {
gf_log ("glusterfs-fuse", GF_LOG_TRACE,
"%"PRIu64": ERR => 0", frame->root->unique);
@@ -3432,7 +3758,7 @@ fuse_setlk (xlator_t *this, fuse_in_header_t *finh, void *msg)
state->finh = finh;
state->fd = fd;
- fuse_resolve_fd_init (state, &state->resolve, fd);
+ fuse_resolve_fd_init (state, &state->resolve, fd);
convert_fuse_file_lock (&fli->lk, &state->lk_lock,
fli->owner);
@@ -3444,7 +3770,7 @@ fuse_setlk (xlator_t *this, fuse_in_header_t *finh, void *msg)
return;
}
-
+#if FUSE_KERNEL_MINOR_VERSION >= 11
static void *
notify_kernel_loop (void *data)
{
@@ -3480,7 +3806,7 @@ notify_kernel_loop (void *data)
return NULL;
}
-
+#endif
static void
fuse_init (xlator_t *this, fuse_in_header_t *finh, void *msg)
@@ -3489,8 +3815,10 @@ fuse_init (xlator_t *this, fuse_in_header_t *finh, void *msg)
struct fuse_init_out fino = {0,};
fuse_private_t *priv = NULL;
int ret = 0;
+#if FUSE_KERNEL_MINOR_VERSION >= 9
int pfd[2] = {0,};
pthread_t messenger;
+#endif
priv = this->private;
@@ -3519,6 +3847,10 @@ fuse_init (xlator_t *this, fuse_in_header_t *finh, void *msg)
fino.max_readahead = 1 << 17;
fino.max_write = 1 << 17;
fino.flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS;
+#if FUSE_KERNEL_MINOR_VERSION >= 17
+ if (fini->minor >= 17)
+ fino.flags |= FUSE_FLOCK_LOCKS;
+#endif
#if FUSE_KERNEL_MINOR_VERSION >= 12
if (fini->minor >= 12) {
/* let fuse leave the umask processing to us, so that it does not
@@ -3547,8 +3879,8 @@ fuse_init (xlator_t *this, fuse_in_header_t *finh, void *msg)
}
priv->revchan_in = pfd[0];
priv->revchan_out = pfd[1];
- ret = pthread_create (&messenger, NULL, notify_kernel_loop,
- this);
+ ret = gf_thread_create (&messenger, NULL, notify_kernel_loop,
+ this);
if (ret != 0) {
gf_log ("glusterfs-fuse", GF_LOG_ERROR,
"failed to start messenger daemon (%s)",
@@ -3559,19 +3891,19 @@ fuse_init (xlator_t *this, fuse_in_header_t *finh, void *msg)
}
priv->reverse_fuse_thread_started = _gf_true;
} else {
- /*
- * FUSE minor < 12 does not implement invalidate notifications.
- * This mechanism is required for fopen-keep-cache to operate
- * correctly. Disable and warn the user.
- */
- if (priv->fopen_keep_cache) {
- gf_log("glusterfs-fuse", GF_LOG_WARNING, "FUSE version "
- "%d.%d does not support inval notifications. "
- "fopen-keep-cache disabled.", fini->major,
- fini->minor);
- priv->fopen_keep_cache = 0;
- }
- }
+ /*
+ * FUSE minor < 12 does not implement invalidate notifications.
+ * This mechanism is required for fopen-keep-cache to operate
+ * correctly. Disable and warn the user.
+ */
+ if (priv->fopen_keep_cache) {
+ gf_log("glusterfs-fuse", GF_LOG_WARNING, "FUSE version "
+ "%d.%d does not support inval notifications. "
+ "fopen-keep-cache disabled.", fini->major,
+ fini->minor);
+ priv->fopen_keep_cache = 0;
+ }
+ }
if (fini->minor >= 13) {
fino.max_background = priv->background_qlen;
@@ -3579,6 +3911,56 @@ fuse_init (xlator_t *this, fuse_in_header_t *finh, void *msg)
}
if (fini->minor < 9)
*priv->msg0_len_p = sizeof(*finh) + FUSE_COMPAT_WRITE_IN_SIZE;
+
+ if (priv->use_readdirp) {
+ if (fini->flags & FUSE_DO_READDIRPLUS)
+ fino.flags |= FUSE_DO_READDIRPLUS;
+ }
+#endif
+ if (priv->fopen_keep_cache == 2) {
+ /* If user did not explicitly set --fopen-keep-cache[=off],
+ then check if kernel support FUSE_AUTO_INVAL_DATA and ...
+ */
+#if FUSE_KERNEL_MINOR_VERSION >= 20
+ if (fini->flags & FUSE_AUTO_INVAL_DATA) {
+ /* ... enable fopen_keep_cache mode if supported.
+ */
+ gf_log ("glusterfs-fuse", GF_LOG_DEBUG, "Detected "
+ "support for FUSE_AUTO_INVAL_DATA. Enabling "
+ "fopen_keep_cache automatically.");
+ fino.flags |= FUSE_AUTO_INVAL_DATA;
+ priv->fopen_keep_cache = 1;
+ } else
+#endif
+ {
+
+ gf_log ("glusterfs-fuse", GF_LOG_DEBUG, "No support "
+ "for FUSE_AUTO_INVAL_DATA. Disabling "
+ "fopen_keep_cache.");
+ /* ... else disable. */
+ priv->fopen_keep_cache = 0;
+ }
+ } else if (priv->fopen_keep_cache == 1) {
+ /* If user explicitly set --fopen-keep-cache[=on],
+ then enable FUSE_AUTO_INVAL_DATA if possible.
+ */
+#if FUSE_KERNEL_MINOR_VERSION >= 20
+ if (fini->flags & FUSE_AUTO_INVAL_DATA) {
+ gf_log ("glusterfs-fuse", GF_LOG_DEBUG, "fopen_keep_cache "
+ "is explicitly set. Enabling FUSE_AUTO_INVAL_DATA");
+ fino.flags |= FUSE_AUTO_INVAL_DATA;
+ } else
+#endif
+ {
+ gf_log ("glusterfs-fuse", GF_LOG_WARNING, "fopen_keep_cache "
+ "is explicitly set. Support for "
+ "FUSE_AUTO_INVAL_DATA is missing");
+ }
+ }
+
+#if FUSE_KERNEL_MINOR_VERSION >= 22
+ if (fini->flags & FUSE_ASYNC_DIO)
+ fino.flags |= FUSE_ASYNC_DIO;
#endif
ret = send_fuse_obj (this, finh, &fino);
if (ret == 0)
@@ -3722,12 +4104,14 @@ fuse_nameless_lookup (xlator_t *xl, uuid_t gfid, loc_t *loc)
inode_t *linked_inode = NULL;
if ((loc == NULL) || (xl == NULL)) {
+ ret = -EINVAL;
goto out;
}
if (loc->inode == NULL) {
loc->inode = inode_new (xl->itable);
if (loc->inode == NULL) {
+ ret = -ENOMEM;
goto out;
}
}
@@ -3736,13 +4120,13 @@ fuse_nameless_lookup (xlator_t *xl, uuid_t gfid, loc_t *loc)
xattr_req = dict_new ();
if (xattr_req == NULL) {
+ ret = -ENOMEM;
goto out;
}
ret = syncop_lookup (xl, loc, xattr_req, &iatt, NULL, NULL);
- if (ret < 0) {
+ if (ret < 0)
goto out;
- }
linked_inode = inode_link (loc->inode, NULL, NULL, &iatt);
inode_unref (loc->inode);
@@ -3791,9 +4175,10 @@ fuse_migrate_fd_open (xlator_t *this, fd_t *basefd, fd_t *oldfd,
"name-less lookup of gfid (%s) failed (%s)"
"(old-subvolume:%s-%d new-subvolume:%s-%d)",
uuid_utoa (basefd->inode->gfid),
- strerror (errno),
+ strerror (-ret),
old_subvol->name, old_subvol->graph->id,
new_subvol->name, new_subvol->graph->id);
+ ret = -1;
goto out;
}
@@ -3811,9 +4196,14 @@ fuse_migrate_fd_open (xlator_t *this, fd_t *basefd, fd_t *oldfd,
uuid_utoa (loc.inode->gfid),
old_subvol->name, old_subvol->graph->id,
new_subvol->name, new_subvol->graph->id);
+ ret = -1;
goto out;
}
+ newfd->flags = basefd->flags;
+ if (newfd->lk_ctx)
+ fd_lk_ctx_unref (newfd->lk_ctx);
+
newfd->lk_ctx = fd_lk_ctx_ref (oldfd->lk_ctx);
newfd_ctx = fuse_fd_ctx_check_n_create (this, newfd);
@@ -3830,9 +4220,10 @@ fuse_migrate_fd_open (xlator_t *this, fd_t *basefd, fd_t *oldfd,
gf_log ("glusterfs-fuse", GF_LOG_WARNING,
"open on basefd (ptr:%p inode-gfid:%s) failed (%s)"
"(old-subvolume:%s-%d new-subvolume:%s-%d)", basefd,
- uuid_utoa (basefd->inode->gfid), strerror (errno),
+ uuid_utoa (basefd->inode->gfid), strerror (-ret),
old_subvol->name, old_subvol->graph->id,
new_subvol->name, new_subvol->graph->id);
+ ret = -1;
goto out;
}
@@ -3852,20 +4243,95 @@ fuse_migrate_fd_open (xlator_t *this, fd_t *basefd, fd_t *oldfd,
fd_unref (old_activefd);
}
- gf_log ("glusterfs-fuse", GF_LOG_WARNING,
- "migrated basefd (%p) to newfd (%p) "
+ gf_log ("glusterfs-fuse", GF_LOG_INFO,
+ "migrated basefd (%p) to newfd (%p) (inode-gfid:%s)"
"(old-subvolume:%s-%d new-subvolume:%s-%d)", basefd, newfd,
- old_subvol->name, old_subvol->graph->id,
+ uuid_utoa (basefd->inode->gfid),
+ old_subvol->name, old_subvol->graph->id,
new_subvol->name, new_subvol->graph->id);
- newfd = NULL;
ret = 0;
+
out:
loc_wipe (&loc);
return ret;
}
+int
+fuse_migrate_locks (xlator_t *this, fd_t *basefd, fd_t *oldfd,
+ xlator_t *old_subvol, xlator_t *new_subvol)
+{
+ int ret = -1;
+ dict_t *lockinfo = NULL;
+ void *ptr = NULL;
+ fd_t *newfd = NULL;
+ fuse_fd_ctx_t *basefd_ctx = NULL;
+
+
+ if (!oldfd->lk_ctx || fd_lk_ctx_empty (oldfd->lk_ctx))
+ return 0;
+
+ basefd_ctx = fuse_fd_ctx_get (this, basefd);
+ GF_VALIDATE_OR_GOTO ("glusterfs-fuse", basefd_ctx, out);
+
+ LOCK (&basefd->lock);
+ {
+ newfd = fd_ref (basefd_ctx->activefd);
+ }
+ UNLOCK (&basefd->lock);
+
+ ret = syncop_fgetxattr (old_subvol, oldfd, &lockinfo,
+ GF_XATTR_LOCKINFO_KEY);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "getting lockinfo failed while migrating locks"
+ "(oldfd:%p newfd:%p inode-gfid:%s)"
+ "(old-subvol:%s-%d new-subvol:%s-%d)",
+ oldfd, newfd, uuid_utoa (newfd->inode->gfid),
+ old_subvol->name, old_subvol->graph->id,
+ new_subvol->name, new_subvol->graph->id);
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_ptr (lockinfo, GF_XATTR_LOCKINFO_KEY, &ptr);
+ if (ptr == NULL) {
+ ret = 0;
+ gf_log (this->name, GF_LOG_INFO,
+ "No lockinfo present on any of the bricks "
+ "(oldfd: %p newfd:%p inode-gfid:%s) "
+ "(old-subvol:%s-%d new-subvol:%s-%d)",
+ oldfd, newfd, uuid_utoa (newfd->inode->gfid),
+ old_subvol->name, old_subvol->graph->id,
+ new_subvol->name, new_subvol->graph->id);
+
+ goto out;
+ }
+
+ ret = syncop_fsetxattr (new_subvol, newfd, lockinfo, 0);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "migrating locks failed (oldfd:%p newfd:%p "
+ "inode-gfid:%s) (old-subvol:%s-%d new-subvol:%s-%d)",
+ oldfd, newfd, uuid_utoa (newfd->inode->gfid),
+ old_subvol->name, old_subvol->graph->id,
+ new_subvol->name, new_subvol->graph->id);
+ ret = -1;
+ goto out;
+ }
+
+out:
+ if (newfd)
+ fd_unref (newfd);
+
+ if (lockinfo != NULL) {
+ dict_unref (lockinfo);
+ }
+
+ return ret;
+}
+
int
fuse_migrate_fd (xlator_t *this, fd_t *basefd, xlator_t *old_subvol,
@@ -3873,7 +4339,7 @@ fuse_migrate_fd (xlator_t *this, fd_t *basefd, xlator_t *old_subvol,
{
int ret = -1;
char create_in_progress = 0;
- fuse_fd_ctx_t *basefd_ctx = NULL;
+ fuse_fd_ctx_t *basefd_ctx = NULL;
fd_t *oldfd = NULL;
basefd_ctx = fuse_fd_ctx_get (this, basefd);
@@ -3919,10 +4385,11 @@ fuse_migrate_fd (xlator_t *this, fd_t *basefd, xlator_t *old_subvol,
"syncop_fsync failed (%s) on fd (%p)"
"(basefd:%p basefd-inode.gfid:%s) "
"(old-subvolume:%s-%d new-subvolume:%s-%d)",
- strerror (errno), oldfd, basefd,
+ strerror (-ret), oldfd, basefd,
uuid_utoa (basefd->inode->gfid),
old_subvol->name, old_subvol->graph->id,
new_subvol->name, new_subvol->graph->id);
+ ret = -1;
}
} else {
gf_log ("glusterfs-fuse", GF_LOG_WARNING,
@@ -3936,6 +4403,27 @@ fuse_migrate_fd (xlator_t *this, fd_t *basefd, xlator_t *old_subvol,
ret = fuse_migrate_fd_open (this, basefd, oldfd, old_subvol,
new_subvol);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING, "open corresponding to "
+ "basefd (ptr:%p inode-gfid:%s) in new graph failed "
+ "(old-subvolume:%s-%d new-subvolume:%s-%d)", basefd,
+ uuid_utoa (basefd->inode->gfid), old_subvol->name,
+ old_subvol->graph->id, new_subvol->name,
+ new_subvol->graph->id);
+ goto out;
+ }
+
+ ret = fuse_migrate_locks (this, basefd, oldfd, old_subvol,
+ new_subvol);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "migrating locks from old-subvolume (%s-%d) to "
+ "new-subvolume (%s-%d) failed (inode-gfid:%s oldfd:%p "
+ "basefd:%p)", old_subvol->name, old_subvol->graph->id,
+ new_subvol->name, new_subvol->graph->id,
+ uuid_utoa (basefd->inode->gfid), oldfd, basefd);
+
+ }
out:
if (ret < 0) {
gf_log (this->name, GF_LOG_WARNING, "migration of basefd "
@@ -4134,11 +4622,11 @@ fuse_graph_sync (xlator_t *this)
ret = pthread_cond_wait (&priv->sync_cond,
&priv->sync_mutex);
if (ret != 0) {
- gf_log (this->name, GF_LOG_DEBUG,
- "timedwait returned non zero value "
- "ret: %d errno: %d", ret, errno);
- break;
- }
+ gf_log (this->name, GF_LOG_DEBUG,
+ "timedwait returned non zero value "
+ "ret: %d errno: %d", ret, errno);
+ break;
+ }
}
}
unlock:
@@ -4289,13 +4777,20 @@ fuse_thread_proc (void *data)
"terminating upon getting %s when "
"reading /dev/fuse",
errno == ENODEV ? "ENODEV" : "EBADF");
-
+ fuse_log_eh (this, "glusterfs-fuse: terminating"
+ " upon getting %s when "
+ "reading /dev/fuse",
+ errno == ENODEV ? "ENODEV":
+ "EBADF");
break;
}
if (errno != EINTR) {
gf_log ("glusterfs-fuse", GF_LOG_WARNING,
"read from /dev/fuse returned -1 (%s)",
strerror (errno));
+ fuse_log_eh (this, "glusterfs-fuse: read from "
+ "/dev/fuse returned -1 (%s)",
+ strerror (errno));
}
goto cont_err;
@@ -4303,6 +4798,8 @@ fuse_thread_proc (void *data)
if (res < sizeof (finh)) {
gf_log ("glusterfs-fuse", GF_LOG_WARNING,
"short read on /dev/fuse");
+ fuse_log_eh (this, "glusterfs-fuse: short read on "
+ "/dev/fuse");
break;
}
@@ -4321,6 +4818,8 @@ fuse_thread_proc (void *data)
) {
gf_log ("glusterfs-fuse", GF_LOG_WARNING,
"inconsistent read on /dev/fuse");
+ fuse_log_eh (this, "glusterfs-fuse: inconsistent read "
+ "on /dev/fuse");
break;
}
@@ -4399,7 +4898,7 @@ fuse_itable_dump (xlator_t *this)
gf_proc_dump_add_section("xlator.mount.fuse.itable");
inode_table_dump(this->itable, "xlator.mount.fuse.itable");
- return 0;
+ return 0;
}
int32_t
@@ -4442,17 +4941,59 @@ fuse_priv_dump (xlator_t *this)
(int)private->strict_volfile_check);
gf_proc_dump_write("reverse_thread_started", "%d",
(int)private->reverse_fuse_thread_started);
+ gf_proc_dump_write("use_readdirp", "%d", private->use_readdirp);
return 0;
}
+int
+fuse_history_dump (xlator_t *this)
+{
+ int ret = -1;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0,};
+
+ GF_VALIDATE_OR_GOTO ("fuse", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, this->history, out);
+
+ gf_proc_dump_build_key (key_prefix, "xlator.mount.fuse",
+ "history");
+ gf_proc_dump_add_section (key_prefix);
+ eh_dump (this->history, NULL, dump_history_fuse);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int
+dump_history_fuse (circular_buffer_t *cb, void *data)
+{
+ char *string = NULL;
+ struct tm *tm = NULL;
+ char timestr[256] = {0,};
+
+ string = (char *)cb->data;
+ tm = localtime (&cb->tv.tv_sec);
+
+ if (tm) {
+ strftime (timestr, 256, "%Y-%m-%d %H:%M:%S", tm);
+ snprintf (timestr + strlen (timestr), 256 - strlen (timestr),
+ ".%"GF_PRI_SUSECONDS, cb->tv.tv_usec);
+ gf_proc_dump_write ("TIME", "%s", timestr);
+ }
+
+ gf_proc_dump_write ("message", "%s\n", string);
+
+ return 0;
+}
int
fuse_graph_setup (xlator_t *this, glusterfs_graph_t *graph)
{
- inode_table_t *itable = NULL;
- int ret = 0;
- fuse_private_t *priv = NULL;
+ inode_table_t *itable = NULL;
+ int ret = 0, winds = 0;
+ fuse_private_t *priv = NULL;
+ glusterfs_graph_t *prev_graph = NULL;
priv = this->private;
@@ -4473,13 +5014,30 @@ fuse_graph_setup (xlator_t *this, glusterfs_graph_t *graph)
pthread_mutex_lock (&priv->sync_mutex);
{
- priv->next_graph = graph;
- priv->event_recvd = 0;
+ prev_graph = priv->next_graph;
+
+ if ((prev_graph != NULL) && (prev_graph->id > graph->id)) {
+ /* there was a race and an old graph was initialised
+ * before new one.
+ */
+ prev_graph = graph;
+ } else {
+ priv->next_graph = graph;
+ priv->event_recvd = 0;
+
+ pthread_cond_signal (&priv->sync_cond);
+ }
- pthread_cond_signal (&priv->sync_cond);
+ if (prev_graph != NULL)
+ winds = ((xlator_t *)prev_graph->top)->winds;
}
pthread_mutex_unlock (&priv->sync_mutex);
+ if ((prev_graph != NULL) && (winds == 0)) {
+ xlator_notify (prev_graph->top, GF_EVENT_PARENT_DOWN,
+ prev_graph->top, NULL);
+ }
+
gf_log ("fuse", GF_LOG_INFO, "switched to graph %d",
((graph) ? graph->id : 0));
@@ -4530,8 +5088,8 @@ notify (xlator_t *this, int32_t event, void *data, ...)
if (!private->fuse_thread_started) {
private->fuse_thread_started = 1;
- ret = pthread_create (&private->fuse_thread, NULL,
- fuse_thread_proc, this);
+ ret = gf_thread_create (&private->fuse_thread, NULL,
+ fuse_thread_proc, this);
if (ret != 0) {
gf_log (this->name, GF_LOG_DEBUG,
"pthread_create() failed (%s)",
@@ -4618,15 +5176,26 @@ static fuse_handler_t *fuse_std_ops[FUSE_OP_HIGH] = {
/* [FUSE_IOCTL] */
/* [FUSE_POLL] */
/* [FUSE_NOTIFY_REPLY] */
- /* [FUSE_BATCH_FORGET] */
- /* [FUSE_FALLOCATE] */
-};
+#if FUSE_KERNEL_MINOR_VERSION >= 16
+ [FUSE_BATCH_FORGET]= fuse_batch_forget,
+#endif
+
+#if FUSE_KERNEL_MINOR_VERSION >= 19
+#ifdef FALLOC_FL_KEEP_SIZE
+ [FUSE_FALLOCATE] = fuse_fallocate,
+#endif /* FALLOC_FL_KEEP_SIZE */
+#endif
-static fuse_handler_t *fuse_dump_ops[FUSE_OP_HIGH] = {
+#if FUSE_KERNEL_MINOR_VERSION >= 21
+ [FUSE_READDIRPLUS] = fuse_readdirp,
+#endif
};
+static fuse_handler_t *fuse_dump_ops[FUSE_OP_HIGH];
+
+
static void
fuse_dumper (xlator_t *this, fuse_in_header_t *finh, void *msg)
{
@@ -4652,7 +5221,7 @@ fuse_dumper (xlator_t *this, fuse_in_header_t *finh, void *msg)
"failed to dump fuse message (R): %s",
strerror (errno));
- return priv->fuse_ops0[finh->opcode] (this, finh, msg);
+ priv->fuse_ops0[finh->opcode] (this, finh, msg);
}
@@ -4671,7 +5240,10 @@ init (xlator_t *this_xl)
int fsname_allocated = 0;
glusterfs_ctx_t *ctx = NULL;
gf_boolean_t sync_to_mount = _gf_false;
+ gf_boolean_t fopen_keep_cache = _gf_false;
+ unsigned long mntflags = 0;
char *mnt_args = NULL;
+ eh_t *event = NULL;
if (this_xl == NULL)
return -1;
@@ -4750,21 +5322,21 @@ init (xlator_t *this_xl)
goto cleanup_exit;
}
- GF_OPTION_INIT ("attribute-timeout", priv->attribute_timeout, double,
- cleanup_exit);
+ GF_OPTION_INIT (ZR_ATTR_TIMEOUT_OPT, priv->attribute_timeout, double,
+ cleanup_exit);
- GF_OPTION_INIT ("entry-timeout", priv->entry_timeout, double,
- cleanup_exit);
+ GF_OPTION_INIT (ZR_ENTRY_TIMEOUT_OPT, priv->entry_timeout, double,
+ cleanup_exit);
- GF_OPTION_INIT ("negative-timeout", priv->negative_timeout, double,
- cleanup_exit);
+ GF_OPTION_INIT (ZR_NEGATIVE_TIMEOUT_OPT, priv->negative_timeout, double,
+ cleanup_exit);
- GF_OPTION_INIT ("client-pid", priv->client_pid, int32, cleanup_exit);
+ GF_OPTION_INIT ("client-pid", priv->client_pid, int32, cleanup_exit);
/* have to check & register the presence of client-pid manually */
priv->client_pid_set = !!dict_get (this_xl->options, "client-pid");
- GF_OPTION_INIT ("uid-map-root", priv->uid_map_root, uint32,
- cleanup_exit);
+ GF_OPTION_INIT ("uid-map-root", priv->uid_map_root, uint32,
+ cleanup_exit);
priv->direct_io_mode = 2;
ret = dict_get_str (options, ZR_DIRECT_IO_OPT, &value_string);
@@ -4773,20 +5345,22 @@ init (xlator_t *this_xl)
GF_ASSERT (ret == 0);
}
- GF_OPTION_INIT (ZR_STRICT_VOLFILE_CHECK, priv->strict_volfile_check,
- bool, cleanup_exit);
+ GF_OPTION_INIT (ZR_STRICT_VOLFILE_CHECK, priv->strict_volfile_check,
+ bool, cleanup_exit);
- GF_OPTION_INIT ("acl", priv->acl, bool, cleanup_exit);
+ GF_OPTION_INIT ("acl", priv->acl, bool, cleanup_exit);
if (priv->uid_map_root)
priv->acl = 1;
- GF_OPTION_INIT ("selinux", priv->selinux, bool, cleanup_exit);
+ GF_OPTION_INIT ("selinux", priv->selinux, bool, cleanup_exit);
- GF_OPTION_INIT ("read-only", priv->read_only, bool, cleanup_exit);
+ GF_OPTION_INIT ("read-only", priv->read_only, bool, cleanup_exit);
GF_OPTION_INIT ("enable-ino32", priv->enable_ino32, bool, cleanup_exit);
+ GF_OPTION_INIT ("use-readdirp", priv->use_readdirp, bool, cleanup_exit);
+
priv->fuse_dump_fd = -1;
ret = dict_get_str (options, "dump-fuse", &value_string);
if (ret == 0) {
@@ -4812,26 +5386,42 @@ init (xlator_t *this_xl)
GF_ASSERT (ret == 0);
}
- GF_OPTION_INIT("fopen-keep-cache", priv->fopen_keep_cache, bool,
- cleanup_exit);
+ priv->fopen_keep_cache = 2;
+ if (dict_get (options, "fopen-keep-cache")) {
+ GF_OPTION_INIT("fopen-keep-cache", fopen_keep_cache, bool,
+ cleanup_exit);
+ priv->fopen_keep_cache = fopen_keep_cache;
+ }
- GF_OPTION_INIT("gid-timeout", priv->gid_cache_timeout, int32,
- cleanup_exit);
+ GF_OPTION_INIT("gid-timeout", priv->gid_cache_timeout, int32,
+ cleanup_exit);
- GF_OPTION_INIT ("fuse-mountopts", priv->fuse_mountopts, str, cleanup_exit);
+ GF_OPTION_INIT ("fuse-mountopts", priv->fuse_mountopts, str, cleanup_exit);
- if (gid_cache_init(&priv->gid_cache, priv->gid_cache_timeout) < 0) {
- gf_log("glusterfs-fuse", GF_LOG_ERROR, "Failed to initialize "
- "group cache.");
- goto cleanup_exit;
- }
+ if (gid_cache_init(&priv->gid_cache, priv->gid_cache_timeout) < 0) {
+ gf_log("glusterfs-fuse", GF_LOG_ERROR, "Failed to initialize "
+ "group cache.");
+ goto cleanup_exit;
+ }
/* default values seemed to work fine during testing */
- GF_OPTION_INIT ("background-qlen", priv->background_qlen, int32,
+ GF_OPTION_INIT ("background-qlen", priv->background_qlen, int32,
cleanup_exit);
- GF_OPTION_INIT ("congestion-threshold", priv->congestion_threshold,
+ GF_OPTION_INIT ("congestion-threshold", priv->congestion_threshold,
int32, cleanup_exit);
+ GF_OPTION_INIT("no-root-squash", priv->no_root_squash, bool,
+ cleanup_exit);
+ /* change the client_pid to no-root-squash pid only if the
+ client is none of defrag process, hadoop access and gsyncd process.
+ */
+ if (!priv->client_pid_set) {
+ if (priv->no_root_squash == _gf_true) {
+ priv->client_pid_set = _gf_true;
+ priv->client_pid = GF_CLIENT_PID_NO_ROOT_SQUASH;
+ }
+ }
+
/* user has set only background-qlen, not congestion-threshold,
use the fuse kernel driver formula to set congestion. ie, 75% */
if (dict_get (this_xl->options, "background-qlen") &&
@@ -4881,8 +5471,9 @@ init (xlator_t *this_xl)
goto cleanup_exit;
}
- gf_asprintf (&mnt_args, "%s%s%s%sallow_other,max_read=131072",
- priv->read_only ? "ro," : "",
+ if (priv->read_only)
+ mntflags |= MS_RDONLY;
+ gf_asprintf (&mnt_args, "%s%s%sallow_other,max_read=131072",
priv->acl ? "" : "default_permissions,",
priv->fuse_mountopts ? priv->fuse_mountopts : "",
priv->fuse_mountopts ? "," : "");
@@ -4895,12 +5486,21 @@ init (xlator_t *this_xl)
goto cleanup_exit;
}
- priv->fd = gf_fuse_mount (priv->mount_point, fsname, mnt_args,
+ priv->fd = gf_fuse_mount (priv->mount_point, fsname, mntflags, mnt_args,
sync_to_mount ? &ctx->mnt_pid : NULL,
priv->status_pipe[1]);
if (priv->fd == -1)
goto cleanup_exit;
+ event = eh_new (FUSE_EVENT_HISTORY_SIZE, _gf_false, NULL);
+ if (!event) {
+ gf_log (this_xl->name, GF_LOG_ERROR,
+ "could not create a new event history");
+ goto cleanup_exit;
+ }
+
+ this_xl->history = event;
+
pthread_mutex_init (&priv->fuse_dump_mutex, NULL);
pthread_cond_init (&priv->sync_cond, NULL);
pthread_mutex_init (&priv->sync_mutex, NULL);
@@ -4970,17 +5570,19 @@ fini (xlator_t *this_xl)
kill (getpid (), SIGTERM);
}
-struct xlator_fops fops = {
-};
+struct xlator_fops fops;
struct xlator_cbks cbks = {
- .invalidate = fuse_invalidate,
+ .invalidate = fuse_invalidate,
+ .forget = fuse_forget_cbk,
+ .release = fuse_internal_release
};
struct xlator_dumpops dumpops = {
.priv = fuse_priv_dump,
.inode = fuse_itable_dump,
+ .history = fuse_history_dump,
};
struct volume_options options[] = {
@@ -4995,19 +5597,19 @@ struct volume_options options[] = {
},
{ .key = {ZR_ATTR_TIMEOUT_OPT},
.type = GF_OPTION_TYPE_DOUBLE,
- .default_value = "1.0"
+ .default_value = "1.0"
},
{ .key = {ZR_ENTRY_TIMEOUT_OPT},
.type = GF_OPTION_TYPE_DOUBLE,
- .default_value = "1.0"
+ .default_value = "1.0"
},
{ .key = {ZR_NEGATIVE_TIMEOUT_OPT},
.type = GF_OPTION_TYPE_DOUBLE,
- .default_value = "0.0"
+ .default_value = "0.0"
},
{ .key = {ZR_STRICT_VOLFILE_CHECK},
.type = GF_OPTION_TYPE_BOOL,
- .default_value = "false"
+ .default_value = "false"
},
{ .key = {"client-pid"},
.type = GF_OPTION_TYPE_INT
@@ -5021,26 +5623,26 @@ struct volume_options options[] = {
{ .key = {"read-only"},
.type = GF_OPTION_TYPE_BOOL
},
- { .key = {"fopen-keep-cache"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "false"
- },
- { .key = {"gid-timeout"},
- .type = GF_OPTION_TYPE_INT,
- .default_value = "0"
- },
- { .key = {"acl"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "false"
- },
- { .key = {"selinux"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "false"
- },
- { .key = {"enable-ino32"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "false"
- },
+ { .key = {"fopen-keep-cache"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "false"
+ },
+ { .key = {"gid-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "2"
+ },
+ { .key = {"acl"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "false"
+ },
+ { .key = {"selinux"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "false"
+ },
+ { .key = {"enable-ino32"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "false"
+ },
{ .key = {"background-qlen"},
.type = GF_OPTION_TYPE_INT,
.default_value = "64",
@@ -5056,5 +5658,19 @@ struct volume_options options[] = {
{ .key = {"fuse-mountopts"},
.type = GF_OPTION_TYPE_STR
},
+ { .key = {"use-readdirp"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "yes"
+ },
+ { .key = {"no-root-squash"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "false",
+ .description = "This is the mount option for disabling the "
+ "root squash for the client irrespective of whether the root-squash "
+ "option for the volume is set or not. But this option is honoured "
+ "only for the trusted clients. For non trusted clients this value "
+ "does not have any affect and the volume option for root-squash is "
+ "honoured.",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/mount/fuse/src/fuse-bridge.h b/xlators/mount/fuse/src/fuse-bridge.h
index fdfe08b2c..f1c4cb3f0 100644
--- a/xlators/mount/fuse/src/fuse-bridge.h
+++ b/xlators/mount/fuse/src/fuse-bridge.h
@@ -48,7 +48,7 @@
#include "gidcache.h"
#if defined(GF_LINUX_HOST_OS) || defined(__NetBSD__)
-#define FUSE_OP_HIGH (FUSE_POLL + 1)
+#define FUSE_OP_HIGH (FUSE_READDIRPLUS + 1)
#endif
#ifdef GF_DARWIN_HOST_OS
#define FUSE_OP_HIGH (FUSE_DESTROY + 1)
@@ -101,9 +101,17 @@ struct fuse_private {
gf_boolean_t acl;
gf_boolean_t selinux;
gf_boolean_t read_only;
- gf_boolean_t fopen_keep_cache;
+ int32_t fopen_keep_cache;
int32_t gid_cache_timeout;
gf_boolean_t enable_ino32;
+ /* This is the mount option for disabling the root-squash for the
+ mount irrespective of whether the root-squash option for the
+ volume is set or not. But this option is honoured only for
+ thr trusted clients. For non trusted clients this value does
+ not have any affect and the volume option for root-squash is
+ honoured.
+ */
+ gf_boolean_t no_root_squash;
fdtable_t *fdtable;
gid_cache_t gid_cache;
char *fuse_mountopts;
@@ -119,6 +127,9 @@ struct fuse_private {
/* for fuse queue length and congestion threshold */
int background_qlen;
int congestion_threshold;
+
+ /* for using fuse-kernel readdirp*/
+ gf_boolean_t use_readdirp;
};
typedef struct fuse_private fuse_private_t;
@@ -134,6 +145,7 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t;
sizeof (struct fuse_notify_inval_entry_out) + \
NAME_MAX + 1))
+#define FUSE_EVENT_HISTORY_SIZE 1024
#define _FH_TO_FD(fh) ((fd_t *)(uintptr_t)(fh))
@@ -141,9 +153,10 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t;
#define FUSE_FOP(state, ret, op_num, fop, args ...) \
do { \
- call_frame_t *frame = NULL; \
- xlator_t *xl = NULL; \
- int32_t op_ret = 0, op_errno = 0; \
+ call_frame_t *frame = NULL; \
+ xlator_t *xl = NULL; \
+ int32_t op_ret = 0, op_errno = 0; \
+ fuse_resolve_t *resolve = NULL; \
\
frame = get_call_frame_for_req (state); \
if (!frame) { \
@@ -170,14 +183,20 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t;
frame->root->op = op_num; \
frame->op = op_num; \
\
+ if ( state->resolve_now ) { \
+ resolve = state->resolve_now; \
+ } else { \
+ resolve = &(state->resolve); \
+ } \
+ \
xl = state->active_subvol; \
if (!xl) { \
gf_log_callingfn ("glusterfs-fuse", GF_LOG_ERROR, \
"xl is NULL"); \
op_errno = ENOENT; \
op_ret = -1; \
- } else if (state->resolve.op_ret < 0) { \
- op_errno = state->resolve.op_errno; \
+ } else if (resolve->op_ret < 0) { \
+ op_errno = resolve->op_errno; \
op_ret = -1; \
if (op_num == GF_FOP_LOOKUP) { \
gf_log ("glusterfs-fuse", \
@@ -186,7 +205,7 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t;
"%"PRIu64": %s() %s => -1 (%s)", \
frame->root->unique, \
gf_fop_list[frame->root->op], \
- state->resolve.resolve_loc.path, \
+ resolve->resolve_loc.path, \
strerror (op_errno)); \
} else { \
gf_log ("glusterfs-fuse", \
@@ -195,7 +214,7 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t;
"migration of %s failed (%s)", \
frame->root->unique, \
gf_fop_list[frame->root->op], \
- state->resolve.resolve_loc.path, \
+ resolve->resolve_loc.path, \
strerror (op_errno)); \
} \
} else if (state->resolve2.op_ret < 0) { \
@@ -216,6 +235,14 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t;
free_fuse_state (state); \
STACK_DESTROY (frame->root); \
} else { \
+ if (state->this->history) \
+ gf_log_eh ("%"PRIu64", %s, path: (%s), gfid: " \
+ "(%s)", frame->root->unique, \
+ gf_fop_list[frame->root->op], \
+ state->loc.path, \
+ (state->fd == NULL)? \
+ uuid_utoa (state->loc.gfid): \
+ uuid_utoa (state->fd->inode->gfid));\
STACK_WIND (frame, ret, xl, xl->fops->fop, args); \
} \
\
@@ -291,6 +318,12 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t;
free_fuse_state (state); \
STACK_DESTROY (frame->root); \
} else { \
+ if (xl->history) \
+ gf_log_eh ("%"PRIu64", %s, path: (%s), gfid: " \
+ "(%s)", frame->root->unique, \
+ gf_fop_list[frame->root->op], \
+ state->loc.path, \
+ uuid_utoa (state->loc.gfid)); \
STACK_WIND_COOKIE (frame, ret, cky, xl, xl->fops->fop, \
args); \
} \
@@ -317,7 +350,80 @@ typedef struct fuse_graph_switch_args fuse_graph_switch_args_t;
} \
} while (0)
+#define FUSE_ENTRY_CREATE(this, priv, finh, state, fci, op) \
+ do { \
+ if (priv->proto_minor >= 12) \
+ state->mode &= ~fci->umask; \
+ if (priv->proto_minor >= 12 && priv->acl) { \
+ state->xdata = dict_new (); \
+ if (!state->xdata) { \
+ gf_log ("glusterfs-fuse", \
+ GF_LOG_WARNING, \
+ "%s failed to allocate " \
+ "a param dictionary", op); \
+ send_fuse_err (this, finh, ENOMEM); \
+ free_fuse_state (state); \
+ return; \
+ } \
+ state->umask = fci->umask; \
+ \
+/* TODO: remove this after 3.4.0 release. keeping it for the \
+ sake of backward compatibility with old (3.3.[01]) \
+ releases till then. */ \
+ ret = dict_set_int16 (state->xdata, "umask", \
+ fci->umask); \
+ if (ret < 0) { \
+ gf_log ("glusterfs-fuse", \
+ GF_LOG_WARNING, \
+ "%s Failed adding umask"\
+ " to request", op); \
+ dict_destroy (state->xdata); \
+ send_fuse_err (this, finh, ENOMEM); \
+ free_fuse_state (state); \
+ return; \
+ } \
+ ret = dict_set_int16 (state->xdata, "mode", \
+ fci->mode); \
+ if (ret < 0) { \
+ gf_log ("glusterfs-fuse", \
+ GF_LOG_WARNING, \
+ "%s Failed adding mode " \
+ "to request", op); \
+ dict_destroy (state->xdata); \
+ send_fuse_err (this, finh, ENOMEM); \
+ free_fuse_state (state); \
+ return; \
+ } \
+ } \
+ } while (0)
+
+#define fuse_log_eh_fop(this, state, frame, op_ret, op_errno) \
+ do { \
+ if (this->history) { \
+ if (state->fd) \
+ gf_log_eh ("op_ret: %d, op_errno: %d, " \
+ "%"PRIu64", %s () => %p, gfid: %s", \
+ op_ret, op_errno, \
+ frame->root->unique, \
+ gf_fop_list[frame->root->op], \
+ state->fd, \
+ uuid_utoa (state->fd->inode->gfid)); \
+ else \
+ gf_log_eh ("op_ret: %d, op_errno: %d, " \
+ "%"PRIu64", %s () => %s, gfid: %s", \
+ op_ret, op_errno, \
+ frame->root->unique, \
+ gf_fop_list[frame->root->op], \
+ state->loc.path, \
+ uuid_utoa (state->loc.gfid)); \
+ } \
+ } while(0)
+#define fuse_log_eh(this, args...) \
+ do { \
+ if (this->history) \
+ gf_log_eh(args); \
+ } while (0)
static inline xlator_t *
fuse_active_subvol (xlator_t *fuse)
@@ -435,4 +541,5 @@ int fuse_resolve_entry_init (fuse_state_t *state, fuse_resolve_t *resolve,
int fuse_resolve_fd_init (fuse_state_t *state, fuse_resolve_t *resolve,
fd_t *fd);
int fuse_ignore_xattr_set (fuse_private_t *priv, char *key);
+int dump_history_fuse (circular_buffer_t *cb, void *data);
#endif /* _GF_FUSE_BRIDGE_H_ */
diff --git a/xlators/mount/fuse/src/fuse-helpers.c b/xlators/mount/fuse/src/fuse-helpers.c
index e72669a9e..0936d6311 100644
--- a/xlators/mount/fuse/src/fuse-helpers.c
+++ b/xlators/mount/fuse/src/fuse-helpers.c
@@ -7,6 +7,10 @@
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
+#ifdef __NetBSD__
+#define _KMEMUSER
+#endif
+
#include "fuse-bridge.h"
#if defined(GF_SOLARIS_HOST_OS)
#include <sys/procfs.h>
@@ -14,9 +18,6 @@
#include <sys/sysctl.h>
#endif
-#ifndef GF_REQUEST_MAXGROUPS
-#define GF_REQUEST_MAXGROUPS 16
-#endif /* GF_REQUEST_MAXGROUPS */
static void
fuse_resolve_wipe (fuse_resolve_t *resolve)
@@ -138,6 +139,7 @@ get_fuse_state (xlator_t *this, fuse_in_header_t *finh)
}
+#define FUSE_MAX_AUX_GROUPS 32 /* We can get only up to 32 aux groups from /proc */
void
frame_fill_groups (call_frame_t *frame)
{
@@ -160,6 +162,9 @@ frame_fill_groups (call_frame_t *frame)
if (!fp)
goto out;
+ if (call_stack_alloc_groups (frame->root, FUSE_MAX_AUX_GROUPS) != 0)
+ goto out;
+
while ((ptr = fgets (line, sizeof line, fp))) {
if (strncmp (ptr, "Groups:", 7) != 0)
continue;
@@ -176,7 +181,7 @@ frame_fill_groups (call_frame_t *frame)
if (!endptr || *endptr)
break;
frame->root->groups[idx++] = id;
- if (idx == GF_MAX_AUX_GROUPS)
+ if (idx == FUSE_MAX_AUX_GROUPS)
break;
}
@@ -192,6 +197,7 @@ out:
prcred_t *prcred = (prcred_t *) scratch;
FILE *fp = NULL;
int ret = 0;
+ int ngrps;
ret = snprintf (filename, sizeof filename,
"/proc/%d/cred", frame->root->pid);
@@ -200,8 +206,11 @@ out:
fp = fopen (filename, "r");
if (fp != NULL) {
if (fgets (scratch, sizeof scratch, fp) != NULL) {
- frame->root->ngrps = MIN(prcred->pr_ngroups,
- GF_REQUEST_MAXGROUPS);
+ ngrps = MIN(prcred->pr_ngroups,
+ GF_MAX_AUX_GROUPS);
+ if (call_stack_alloc_groups (frame->root,
+ ngrps) != 0)
+ return;
}
fclose (fp);
}
@@ -226,7 +235,9 @@ out:
if (sysctl(name, namelen, &kp, &kplen, NULL, 0) != 0)
return;
- ngroups = MIN(kp.kp_eproc.e_ucred.cr_ngroups, GF_REQUEST_MAXGROUPS);
+ ngroups = MIN(kp.kp_eproc.e_ucred.cr_ngroups, NGROUPS_MAX);
+ if (call_stack_alloc_groups (frame->root, ngroups) != 0)
+ return;
for (i = 0; i < ngroups; i++)
frame->root->groups[i] = kp.kp_eproc.e_ucred.cr_groups[i];
frame->root->ngrps = ngroups;
@@ -255,8 +266,11 @@ static void get_groups(fuse_private_t *priv, call_frame_t *frame)
return;
}
- gl = gid_cache_lookup(&priv->gid_cache, frame->root->pid);
+ gl = gid_cache_lookup(&priv->gid_cache, frame->root->pid,
+ frame->root->uid, frame->root->gid);
if (gl) {
+ if (call_stack_alloc_groups (frame->root, gl->gl_count) != 0)
+ return;
frame->root->ngrps = gl->gl_count;
for (i = 0; i < gl->gl_count; i++)
frame->root->groups[i] = gl->gl_list[i];
@@ -267,6 +281,8 @@ static void get_groups(fuse_private_t *priv, call_frame_t *frame)
frame_fill_groups (frame);
agl.gl_id = frame->root->pid;
+ agl.gl_uid = frame->root->uid;
+ agl.gl_gid = frame->root->gid;
agl.gl_count = frame->root->ngrps;
agl.gl_list = GF_CALLOC(frame->root->ngrps, sizeof(gid_t),
gf_fuse_mt_gids_t);
@@ -339,7 +355,9 @@ fuse_ino_to_inode (uint64_t ino, xlator_t *fuse)
uint64_t
inode_to_fuse_nodeid (inode_t *inode)
{
- if (!inode || __is_root_gfid (inode->gfid))
+ if (!inode)
+ return 0;
+ if (__is_root_gfid (inode->gfid))
return 1;
return (unsigned long) inode;
@@ -578,6 +596,8 @@ fuse_ignore_xattr_set (fuse_private_t *priv, char *key)
|| (fnmatch ("*.glusterfs.volume-mark",
key, FNM_PERIOD) == 0)
|| (fnmatch ("*.glusterfs.volume-mark.*",
+ key, FNM_PERIOD) == 0)
+ || (fnmatch ("glusterfs.gfid.newfile",
key, FNM_PERIOD) == 0)))
ret = -1;
diff --git a/xlators/mount/fuse/src/fuse-resolve.c b/xlators/mount/fuse/src/fuse-resolve.c
index e7e580fe8..76b1d9a72 100644
--- a/xlators/mount/fuse/src/fuse-resolve.c
+++ b/xlators/mount/fuse/src/fuse-resolve.c
@@ -26,6 +26,8 @@ int fuse_migrate_fd (xlator_t *this, fd_t *fd, xlator_t *old_subvol,
fuse_fd_ctx_t *
fuse_fd_ctx_get (xlator_t *this, fd_t *fd);
+gf_boolean_t fuse_inode_needs_lookup (inode_t *inode, xlator_t *this);
+
static int
fuse_resolve_loc_touchup (fuse_state_t *state)
{
@@ -161,10 +163,10 @@ fuse_resolve_gfid_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- loc_wipe (&resolve->resolve_loc);
-
link_inode = inode_link (inode, NULL, NULL, buf);
+ loc_wipe (&resolve->resolve_loc);
+
if (!link_inode)
goto out;
@@ -201,7 +203,11 @@ fuse_resolve_gfid (fuse_state_t *state)
uuid_copy (resolve_loc->gfid, resolve->gfid);
}
- resolve_loc->inode = inode_new (state->itable);
+ /* inode may already exist in case we are looking up an inode which was
+ linked through readdirplus */
+ resolve_loc->inode = inode_find (state->itable, resolve_loc->gfid);
+ if (!resolve_loc->inode)
+ resolve_loc->inode = inode_new (state->itable);
ret = loc_path (resolve_loc, NULL);
if (ret <= 0) {
@@ -239,8 +245,12 @@ fuse_resolve_parent_simple (fuse_state_t *state)
parent = resolve->parhint;
if (parent->table == state->itable) {
+ if (fuse_inode_needs_lookup (parent, THIS))
+ return 1;
+
/* no graph switches since */
loc->parent = inode_ref (parent);
+ uuid_copy (loc->pargfid, parent->gfid);
loc->inode = inode_grep (state->itable, parent, loc->name);
/* nodeid for root is 1 and we blindly take the latest graph's
@@ -264,6 +274,10 @@ fuse_resolve_parent_simple (fuse_state_t *state)
/* non decisive result - parent missing */
return 1;
}
+ if (fuse_inode_needs_lookup (parent, THIS)) {
+ inode_unref (parent);
+ return 1;
+ }
loc->parent = parent;
uuid_copy (loc->pargfid, resolve->pargfid);
@@ -313,15 +327,18 @@ fuse_resolve_inode_simple (fuse_state_t *state)
loc = state->loc_now;
inode = resolve->hint;
- if (inode->table == state->itable) {
+ if (inode->table == state->itable)
inode_ref (inode);
- goto found;
+ else
+ inode = inode_find (state->itable, resolve->gfid);
+
+ if (inode) {
+ if (!fuse_inode_needs_lookup (inode, THIS))
+ goto found;
+ /* inode was linked through readdirplus */
+ inode_unref (inode);
}
- inode = inode_find (state->itable, resolve->gfid);
- if (inode)
- goto found;
-
return 1;
found:
loc->inode = inode;
@@ -364,6 +381,8 @@ fuse_migrate_fd_task (void *data)
basefd = state->fd;
basefd_ctx = fuse_fd_ctx_get (state->this, basefd);
+ if (!basefd_ctx)
+ goto out;
LOCK (&basefd->lock);
{
diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in
index 0d92bfbd5..71ea66c3c 100755
--- a/xlators/mount/fuse/utils/mount.glusterfs.in
+++ b/xlators/mount/fuse/utils/mount.glusterfs.in
@@ -1,20 +1,17 @@
-#!/bin/sh
-# (C) 2006, 2007, 2008 Gluster Inc. <http://www.gluster.com>
+#!/bin/bash
#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License as
-# published by the Free Software Foundation; either version 2 of
-# the License, or (at your option) any later version.
+# Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+# This file is part of GlusterFS.
#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public
-# License along with this program; if not, write to the Free
-# Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
-# Boston, MA 02110-1301 USA
+# This file is licensed to you under your choice of the GNU Lesser
+# General Public License, version 3 or any later version (LGPLv3 or
+# later), or the GNU General Public License, version 2 (GPLv2), in all
+# cases as published by the Free Software Foundation.
+
+warn ()
+{
+ echo "$@" >/dev/stderr
+}
_init ()
{
@@ -27,81 +24,129 @@ _init ()
LOG_DEBUG=DEBUG;
LOG_TRACE=TRACE;
+ HOST_NAME_MAX=64;
+
prefix="@prefix@";
exec_prefix=@exec_prefix@;
cmd_line=$(echo "@sbindir@/glusterfs");
- case `uname -s` in
- NetBSD)
- getinode="stat -f %i"
- getdev="stat -f %d"
- lgetinode="${getinode} -L"
- lgetdev="${getdev} -L"
-
- mounttab=/proc/mounts
- ;;
- Linux)
- getinode="stat -c %i $i"
- getdev="stat -c %d $d"
- lgetinode="${getinode} -L"
- lgetdev="${getdev} -L"
-
- mounttab=/etc/mtab
- ;;
+ # check whether getfattr exists
+ export PATH
+ getfattr=$(which getfattr 2>/dev/null);
+ if [ $? -ne 0 ]; then
+ warn "WARNING: getfattr not found, certain checks will be skipped.."
+ fi
+
+ alias lsL='ls -L'
+ mounttab=/proc/mounts
+ uname_s=`uname -s`
+ case ${uname_s} in
+ NetBSD)
+ getinode="stat -f %i"
+ getdev="stat -f %d"
+ lgetinode="${getinode} -L"
+ lgetdev="${getdev} -L"
+ ;;
+ Linux)
+ getinode="stat -c %i"
+ getdev="stat -c %d"
+ lgetinode="${getinode} -L"
+ lgetdev="${getdev} -L"
+ ;;
esac
UPDATEDBCONF=/etc/updatedb.conf
- LD_LIBRARY_PATH=@libdir@:${LD_LIBRARY_PATH}
- export LD_LIBRARY_PATH
}
-start_glusterfs ()
+is_valid_hostname ()
+{
+ local server=$1
+
+ length=$(echo $server | wc -c)
+ if [ ${length} -gt ${HOST_NAME_MAX} ]; then
+ return 1
+ fi
+}
+
+parse_backup_volfile_servers ()
+{
+ local server_list=$1
+ local servers=""
+ local new_servers=""
+
+ servers=$(echo ${server_list} | sed 's/\:/ /g')
+ for server in ${servers}; do
+ is_valid_hostname ${server}
+ if [ $? -eq 1 ]; then
+ continue
+ fi
+ new_servers=$(echo "${new_servers} ${server}")
+ done
+
+ echo ${new_servers}
+}
+
+parse_volfile_servers ()
{
- # lets the comparsion be case insensitive for all strings
+ local server_list=$1
+ local servers=""
+ local new_servers=""
+
+ servers=$(echo ${server_list} | sed 's/,/ /g')
+ for server in ${servers}; do
+ is_valid_hostname ${server}
+ if [ $? -eq 1 ]; then
+ continue
+ fi
+ new_servers=$(echo "${new_servers} ${server}")
+ done
+ echo ${new_servers}
+}
+
+start_glusterfs ()
+{
if [ -n "$log_level_str" ]; then
- case "$( echo $log_level_str | tr '[a-z]' '[A-Z]')" in
- "ERROR")
- log_level=$LOG_ERROR;
- ;;
+ case "$( echo $log_level_str | awk '{print toupper($0)}')" in
+ "ERROR")
+ log_level=$LOG_ERROR;
+ ;;
"INFO")
- log_level=$LOG_INFO
+ log_level=$LOG_INFO;
;;
- "DEBUG")
- log_level=$LOG_DEBUG;
- ;;
- "CRITICAL")
- log_level=$LOG_CRITICAL;
- ;;
- "WARNING")
- log_level=$LOG_WARNING;
- ;;
- "TRACE")
- log_level=$LOG_TRACE;
- ;;
- "NONE")
- log_level=$LOG_NONE;
- ;;
- *)
- echo "invalid log level $log_level_str, using INFO";
- log_level=$LOG_INFO;
- ;;
- esac
- fi
- if [ -n "$log_level" ]; then
- cmd_line=$(echo "$cmd_line --log-level=$log_level");
+ "DEBUG")
+ log_level=$LOG_DEBUG;
+ ;;
+ "CRITICAL")
+ log_level=$LOG_CRITICAL;
+ ;;
+ "WARNING")
+ log_level=$LOG_WARNING;
+ ;;
+ "TRACE")
+ log_level=$LOG_TRACE;
+ ;;
+ "NONE")
+ log_level=$LOG_NONE;
+ ;;
+ *)
+ warn "invalid log level $log_level_str, using INFO";
+ log_level=$LOG_INFO;
+ ;;
+ esac
fi
+ # options without values start here
if [ -n "$read_only" ]; then
- cmd_line=$(echo "$cmd_line --read-only");
+ cmd_line=$(echo "$cmd_line --read-only");
fi
if [ -n "$acl" ]; then
- cmd_line=$(echo "$cmd_line --acl");
+ cmd_line=$(echo "$cmd_line --acl");
fi
if [ -n "$selinux" ]; then
- cmd_line=$(echo "$cmd_line --selinux");
+ cmd_line=$(echo "$cmd_line --selinux");
fi
if [ -n "$enable_ino32" ]; then
@@ -111,17 +156,45 @@ start_glusterfs ()
if [ -n "$worm" ]; then
cmd_line=$(echo "$cmd_line --worm");
fi
+ if [ -n "$volfile_max_fetch_attempts" ]; then
+ cmd_line=$(echo "$cmd_line --volfile-max-fetch-attempts=$volfile_max_fetch_attempts")
+ fi
- if [ -n "$log_file" ]; then
- cmd_line=$(echo "$cmd_line --log-file=$log_file");
+ if [ -n "$fopen_keep_cache" ]; then
+ cmd_line=$(echo "$cmd_line --fopen-keep-cache");
fi
if [ -n "$volfile_check" ]; then
- cmd_line=$(echo "$cmd_line --volfile-check");
+ cmd_line=$(echo "$cmd_line --volfile-check");
+ fi
+
+ if [ -n "$mem_accounting" ]; then
+ cmd_line=$(echo "$cmd_line --mem-accounting");
+ fi
+
+ if [ -n "$aux_gfid_mount" ]; then
+ cmd_line=$(echo "$cmd_line --aux-gfid-mount");
+ fi
+
+ if [ -n "$no_root_squash" ]; then
+ cmd_line=$(echo "$cmd_line --no-root-squash");
+ fi
+
+#options with values start here
+ if [ -n "$log_level" ]; then
+ cmd_line=$(echo "$cmd_line --log-level=$log_level");
+ fi
+
+ if [ -n "$log_file" ]; then
+ cmd_line=$(echo "$cmd_line --log-file=$log_file");
fi
if [ -n "$direct_io_mode" ]; then
- cmd_line=$(echo "$cmd_line --direct-io-mode=$direct_io_mode");
+ cmd_line=$(echo "$cmd_line --direct-io-mode=$direct_io_mode");
+ fi
+
+ if [ -n "$use_readdirp" ]; then
+ cmd_line=$(echo "$cmd_line --use-readdirp=$use_readdirp");
fi
if [ -n "$volume_name" ]; then
@@ -141,23 +214,23 @@ start_glusterfs ()
fi
if [ -n "$gid_timeout" ]; then
- cmd_line=$(echo "$cmd_line --gid-timeout=$gid_timeout");
- fi
-
- if [ -n "$fopen_keep_cache" ]; then
- cmd_line=$(echo "$cmd_line --fopen-keep-cache");
+ cmd_line=$(echo "$cmd_line --gid-timeout=$gid_timeout");
fi
if [ -n "$bg_qlen" ]; then
- cmd_line=$(echo "$cmd_line --background-qlen=$bg_qlen");
+ cmd_line=$(echo "$cmd_line --background-qlen=$bg_qlen");
fi
if [ -n "$cong_threshold" ]; then
- cmd_line=$(echo "$cmd_line --congestion-threshold=$cong_threshold");
+ cmd_line=$(echo "$cmd_line --congestion-threshold=$cong_threshold");
fi
if [ -n "$fuse_mountopts" ]; then
- cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
+ cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
+ fi
+
+ if [ -n "$xlator_option" ]; then
+ cmd_line=$(echo "$cmd_line --xlator-option=$xlator_option");
fi
# for rdma volume, we have to fetch volfile with '.rdma' added
@@ -166,270 +239,394 @@ start_glusterfs ()
if [ -z "$volfile_loc" ]; then
if [ -n "$server_ip" ]; then
+
+ servers=$(parse_volfile_servers ${server_ip});
+ if [ -n "$servers" ]; then
+ for i in $(echo ${servers}); do
+ cmd_line=$(echo "$cmd_line --volfile-server=$i");
+ done
+ else
+ warn "ERROR: No valid servers found on command line.. exiting"
+ print_usage
+ exit 1
+ fi
+
+ if [ -n "$backupvolfile_server" ]; then
+ if [ -z "$backup_volfile_servers" ]; then
+ is_valid_hostname ${backupvolfile_server};
+ if [ $? -eq 1 ]; then
+ warn "ERROR: Invalid backup server specified.. exiting"
+ exit 1
+ fi
+ cmd_line=$(echo "$cmd_line --volfile-server=$backupvolfile_server");
+ fi
+ fi
+
+ if [ -n "$backup_volfile_servers" ]; then
+ backup_servers=$(parse_backup_volfile_servers ${backup_volfile_servers})
+ for i in $(echo ${backup_servers}); do
+ cmd_line=$(echo "$cmd_line --volfile-server=$i");
+ done
+ fi
+
if [ -n "$server_port" ]; then
cmd_line=$(echo "$cmd_line --volfile-server-port=$server_port");
fi
- if [ -n "$transport" ]; then
+
+ if [ -n "$transport" ]; then
cmd_line=$(echo "$cmd_line --volfile-server-transport=$transport");
if [ "$transport" = "rdma" ]; then
volume_id_rdma=".rdma";
fi
fi
+
if [ -n "$volume_id" ]; then
if [ -n "$volume_id_rdma" ]; then
volume_id="$volume_id$volume_id_rdma";
fi
cmd_line=$(echo "$cmd_line --volfile-id=$volume_id");
fi
-
- if [ -n "$backupvolfile_server" ]; then
- cmd_line1=$(echo "$cmd_line --volfile-server=$backupvolfile_server");
- fi
- if [ -n "$volfile_max_fetch_attempts" ]; then
- cmd_line=$(echo "$cmd_line --volfile-max-fetch-attempts=$volfile_max_fetch_attempts");
- fi
- cmd_line=$(echo "$cmd_line --volfile-server=$server_ip");
fi
else
cmd_line=$(echo "$cmd_line --volfile=$volfile_loc");
fi
if [ -n "$fuse_mountopts" ]; then
- cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
+ cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
fi
cmd_line=$(echo "$cmd_line $mount_point");
- err=0;
- $cmd_line;
+ $cmd_line;
inode=$( ${getinode} $mount_point 2>/dev/null);
-
# this is required if the stat returns error
- if [ -z "$inode" ]; then
- inode="0";
- fi
-
- # retry the failover
- # if [ $? != "0" ]; then # <--- TODO: Once glusterfs returns proper error code, change it.
- if [ $inode -ne 1 ]; then
- err=1;
- if [ -n "$cmd_line1" ]; then
- cmd_line1=$(echo "$cmd_line1 $mount_point");
- $cmd_line1;
- err=0;
-
- inode=$( ${getinode} $mount_point 2>/dev/null);
- # this is required if the stat returns error
- if [ -z "$inode" ]; then
- inode="0";
- fi
- if [ $inode -ne 1 ]; then
- err=1;
- fi
- fi
- fi
-
- if [ $err -eq "1" ]; then
- echo "Mount failed. Please check the log file for more details."
- umount $mount_point > /dev/null 2>&1;
- exit 1;
+ if [ $? -ne 0 ]; then
+ warn "Mount failed. Please check the log file for more details."
+ umount $mount_point > /dev/null 2>&1;
+ exit 1;
fi
}
-usage ()
+print_usage ()
{
-
-echo "Usage: mount.glusterfs <volumeserver>:<volumeid/volumeport> -o <options> <mountpoint>
+cat << EOF
+Usage: $0 <volumeserver>:<volumeid/volumeport> -o<options> <mountpoint>
Options:
-man 8 mount.glusterfs
-
-To display the version number of the mount helper:
-mount.glusterfs --version"
-
+man 8 $0
+To display the version number of the mount helper: $0 -V
+EOF
}
# check for recursive mounts. i.e, mounting over an existing brick
check_recursive_mount ()
{
- if [ $2 = "/" ]; then
- echo Cannot mount over root;
+ if [ $1 = "/" ]; then
+ warn "Cannot mount over root";
exit 2;
fi
+
# GFID check first
# remove trailing / from mount point
- mnt_dir=${2%/};
-
- export PATH;
- # check whether getfattr exists
- which getfattr > /dev/null 2>&1;
- if [ $? -ne 0 ]; then
- return;
- fi
+ mnt_dir=${1%/};
- getfattr -n trusted.gfid $mnt_dir 2>/dev/null | grep -iq "trusted.gfid=";
- if [ $? -eq 0 ]; then
- echo "ERROR: $mnt_dir is in use as a brick of a gluster volume";
- exit 2;
+ if [ -n ${getfattr} ]; then
+ ${getfattr} -n trusted.gfid $mnt_dir 2>/dev/null | grep -iq "trusted.gfid=";
+ if [ $? -eq 0 ]; then
+ warn "ERROR: $mnt_dir is in use as a brick of a gluster volume";
+ exit 2;
+ fi
fi
# check if the mount point is a brick's parent directory
GLUSTERD_WORKDIR="/var/lib/glusterd";
- ls -L "$GLUSTERD_WORKDIR"/vols/*/bricks/* > /dev/null 2>&1;
+ lsL "$GLUSTERD_WORKDIR"/vols/*/bricks/* > /dev/null 2>&1;
if [ $? -ne 0 ]; then
return;
fi
- brick_path=`grep ^path "$GLUSTERD_WORKDIR"/vols/*/bricks/* | cut -d "=" -f 2`;
+ brick_path=`grep ^path "$GLUSTERD_WORKDIR"/vols/*/bricks/* 2>/dev/null | cut -d "=" -f 2`;
root_inode=`${lgetinode} /`;
root_dev=`${lgetdev} /`;
mnt_inode=`${lgetinode} $mnt_dir`;
mnt_dev=`${lgetdev} $mnt_dir`;
- for brick in "$brick_path";
- do
+ for brick in "$brick_path"; do
# evaluate brick path to see if this is local, if non-local, skip iteration
ls $brick > /dev/null 2>&1;
if [ $? -ne 0 ]; then
continue;
fi
- getfattr -n trusted.gfid "$brick" 2>/dev/null | grep -iq "trusted.gfid=";
- if [ $? -ne 0 ]; then
- continue;
+
+ if [ -n ${getfattr} ]; then
+ ${getfattr} -n trusted.gfid "$brick" 2>/dev/null | grep -iq "trusted.gfid=";
+ if [ $? -eq 0 ]; then
+ # brick is local
+ while [ 1 ]; do
+ tmp_brick="$brick";
+ brick="$brick"/..;
+ brick_dev=`${lgetdev} $brick`;
+ brick_inode=`${lgetinode} $brick`;
+ if [ "$mnt_inode" -eq "$brick_inode" \
+ -a "$mnt_dev" -eq "$brick_dev" ]; then
+ warn "ERROR: ${mnt_dir} is a parent of the brick ${tmp_brick}";
+ exit 2;
+ fi
+ [ "$root_inode" -ne "$brick_inode" \
+ -o "$root_dev" -ne "$brick_dev" ] || break;
+ done;
+ else
+ continue;
+ fi
else
- # brick is local
- while [ 1 ];
- do
- tmp_brick="$brick";
- brick="$brick"/..;
- brick_dev=`${lgetdev} $brick`;
- brick_inode=`${lgetinode} $brick`;
- if [ "$mnt_inode" -eq "$brick_inode" -a "$mnt_dev" -eq "$brick_dev" ]; then
- echo ERROR: $mnt_dir is a parent of the brick $tmp_brick;
- exit 2;
- fi
- [ "$root_inode" -ne "$brick_inode" -o "$root_dev" -ne "$brick_dev" ] || break;
- done;
+ continue;
fi
done;
}
-main ()
+with_options()
{
- helper=$(echo "$@" | sed -n 's/.*\--[ ]*\([^ ]*\).*/\1/p');
-
- in_opt="no"
- pos_args=0
- for opt in "$@"; do
- if [ "$in_opt" = "yes" ]; then
- for pair in $(echo "$opt" | tr "," " "); do
- # Handle options without values.
- case "$pair" in
- "ro") read_only=1 ;;
- "acl") acl=1 ;;
- "selinux") selinux=1 ;;
- "worm") worm=1 ;;
- "fopen-keep-cache") fopen_keep_cache=1 ;;
- "enable-ino32") enable_ino32=1 ;;
- # "mount -t glusterfs" sends this, but it's useless.
- "rw") ;;
- # these ones are interpreted during system initialization
- "noauto") ;;
- "_netdev") ;;
- *)
- key=$(echo "$pair" | cut -f1 -d'=');
- value=$(echo "$pair" | cut -f2- -d'=');
-
- # Handle options with values.
- case "$key" in
- "log-level") log_level_str=$value ;;
- "log-file") log_file=$value ;;
- "transport") transport=$value ;;
- "direct-io-mode") direct_io_mode=$value ;;
- "volume-name") volume_name=$value ;;
- "volume-id") volume_id=$value ;;
- "volfile-check") volfile_check=$value ;;
- "server-port") server_port=$value ;;
- "fetch-attempts")
- volfile_max_fetch_attempts=$value ;;
- "backupvolfile-server")
- backupvolfile_server=$value ;;
- "attribute-timeout")
- attribute_timeout=$value ;;
- "entry-timeout") entry_timeout=$value ;;
- "negative-timeout") negative_timeout=$value ;;
- "gid-timeout") gid_timeout=$value ;;
- "background-qlen") bg_qlen=$value ;;
- "congestion-threshold") cong_threshold=$value ;;
- "fuse-mountopts") fuse_mountopts=$value ;;
- *)
- # Passthru
- [ -z "$fuse_mountopts" ] || fuse_mountopts="$fuse_mountopts,"
- fuse_mountopts="$fuse_mountopts$pair"
- ;;
- esac
- esac
- done
- in_opt="no"
- elif [ "$opt" = "-o" ]; then
- in_opt="yes"
+ local key=$1
+ local value=$2
+
+ # Handle options with values.
+ case "$key" in
+ "log-level")
+ log_level_str=$value
+ ;;
+ "log-file")
+ log_file=$value
+ ;;
+ "transport")
+ transport=$value
+ ;;
+ "direct-io-mode")
+ direct_io_mode=$value
+ ;;
+ "volume-name")
+ volume_name=$value
+ ;;
+ "volume-id")
+ volume_id=$value
+ ;;
+ "volfile-check")
+ volfile_check=$value
+ ;;
+ "server-port")
+ server_port=$value
+ ;;
+ "attribute-timeout")
+ attribute_timeout=$value
+ ;;
+ "entry-timeout")
+ entry_timeout=$value
+ ;;
+ "negative-timeout")
+ negative_timeout=$value
+ ;;
+ "gid-timeout")
+ gid_timeout=$value
+ ;;
+ "background-qlen")
+ bg_qlen=$value
+ ;;
+ "backup-volfile-servers")
+ backup_volfile_servers=$value
+ ;;
+ "backupvolfile-server")
+ backupvolfile_server=$value
+ ;;
+ "fetch-attempts")
+ volfile_max_fetch_attempts=$value
+ ;;
+ "congestion-threshold")
+ cong_threshold=$value
+ ;;
+ "xlator-option")
+ xlator_option=$value
+ ;;
+ "fuse-mountopts")
+ fuse_mountopts=$value
+ ;;
+ "use-readdirp")
+ use_readdirp=$value
+ ;;
+ "no-root-squash")
+ if [ $value == "yes" ] ||
+ [ $value == "on" ] ||
+ [ $value == "enable" ] ||
+ [ $value == "true" ] ; then
+ no_root_squash=1;
+ fi ;;
+ "root-squash")
+ if [ $value == "no" ] ||
+ [ $value == "off" ] ||
+ [ $value == "disable" ] ||
+ [ $value == "false" ] ; then
+ no_root_squash=1;
+ fi ;;
+ *)
+ warn "Invalid option: $key"
+ exit 1
+ ;;
+ esac
+}
+
+without_options()
+{
+ local option=$1
+ # Handle options without values.
+ case "$option" in
+ "ro")
+ read_only=1
+ ;;
+ "acl")
+ acl=1
+ ;;
+ "selinux")
+ selinux=1
+ ;;
+ "worm")
+ worm=1
+ ;;
+ "fopen-keep-cache")
+ fopen_keep_cache=1
+ ;;
+ "enable-ino32")
+ enable_ino32=1
+ ;;
+ "mem-accounting")
+ mem_accounting=1
+ ;;
+ "aux-gfid-mount")
+ if [ ${uname_s} = "Linux" ]; then
+ aux_gfid_mount=1
+ fi
+ ;;
+ # "mount -t glusterfs" sends this, but it's useless.
+ "rw")
+ ;;
+ # these ones are interpreted during system initialization
+ "noauto")
+ ;;
+ "_netdev")
+ ;;
+ *)
+ warn "Invalid option $option";
+ exit 1
+ ;;
+ esac
+}
+
+parse_options()
+{
+ local optarg=${1}
+ for pair in $(echo ${optarg//,/ }); do
+ key=$(echo "$pair" | cut -f1 -d'=');
+ value=$(echo "$pair" | cut -f2- -d'=');
+ if [ "$key" = "$value" ]; then
+ without_options $pair;
else
- case $pos_args in
- 0) volfile_loc=$opt ;;
- 1) mount_point=$opt ;;
- *) echo "extra arguments at end (ignored)" ;;
- esac
- pos_args=$((pos_args+1))
+ with_options $key $value;
fi
done
- if [ $in_opt = "yes" -o $pos_args -lt 2 ]; then
- usage
- exit 1
- fi
+}
+
+update_updatedb()
+{
+ # Append fuse.glusterfs to PRUNEFS variable in updatedb.conf(5).
+ # updatedb(8) should not index files under GlusterFS, indexing
+ # GlusterFS is not necessary and should be avoided.
+ # Following code disables updatedb crawl on 'glusterfs'
+ test -f $UPDATEDBCONF && {
+ if ! grep -q 'glusterfs' $UPDATEDBCONF; then
+ sed 's/\(PRUNEFS.*\)"/\1 fuse.glusterfs"/' $UPDATEDBCONF \
+ > ${UPDATEDBCONF}.bak
+ mv -f ${UPDATEDBCONF}.bak $UPDATEDBCONF
+ fi
+ }
+}
+
+main ()
+{
+
+ volfile_loc=$1
+ mount_point=$2
+
+ ## `mount` specifies options as a last argument
+ shift 2;
+ while getopts "Vo:h" opt; do
+ case "${opt}" in
+ o)
+ parse_options ${OPTARG};
+ ;;
+ V)
+ ${cmd_line} -V;
+ exit 0;
+ ;;
+ h)
+ print_usage;
+ exit 0;
+ ;;
+ ?)
+ print_usage;
+ exit 0;
+ ;;
+ esac
+ done
[ -r "$volfile_loc" ] || {
server_ip=$(echo "$volfile_loc" | sed -n 's/\([a-zA-Z0-9:.\-]*\):.*/\1/p');
- test_str=$(echo "$volfile_loc" | sed -n 's/.*:\([^ ]*\).*/\1/p');
- [ -n "$test_str" ] && {
- volume_id="$test_str";
+ volume_str=$(echo "$volfile_loc" | sed -n 's/.*:\([^ ]*\).*/\1/p');
+ [ -n "$volume_str" ] && {
+ volume_id="$volume_str";
}
- volfile_loc="";
+ volfile_loc="";
}
- #
- [ -n "$helper" ] && {
- cmd_line=$(echo "$cmd_line --$helper");
- exec $cmd_line;
- exit 0;
+ [ -z "$volume_id" -o -z "$server_ip" ] && {
+ cat <<EOF >/dev/stderr
+ERROR: Server name/volume name unspecified cannot proceed further..
+Please specify correct format
+Usage:
+man 8 $0
+EOF
+ exit 1;
+ }
+
+ grep_ret=$(echo ${mount_point} | grep '^\-o');
+ [ "x" != "x${grep_ret}" ] && {
+ cat <<EOF >/dev/stderr
+ERROR: -o options cannot be specified in either first two arguments..
+Please specify correct style
+Usage:
+man 8 $0
+EOF
+ exit 1;
}
# No need to do a ! -d test, it is taken care while initializing the
# variable mount_point
[ -z "$mount_point" -o ! -d "$mount_point" ] && {
- echo "ERROR: Mount point does not exist."
- usage;
- exit 0;
+ cat <<EOF >/dev/stderr
+ERROR: Mount point does not exist
+Please specify a mount point
+Usage:
+man 8 $0
+EOF
+ exit 1;
}
# Simple check to avoid multiple identical mounts
- if grep -q " ${mount_point}.*fuse" $mounttab; then
- echo -n "$0: according to mtab, GlusterFS is already mounted on "
- echo "$mount_point"
- exit 0;
+ if grep -q "[[:space:]+]${mount_point}[[:space:]+]fuse" $mounttab; then
+ warn "$0: according to mtab, GlusterFS is already mounted on" \
+ "$mount_point"
+ exit 32;
fi
- check_recursive_mount "$@";
+ check_recursive_mount "$mount_point";
- # Append fuse.glusterfs to PRUNEFS variable in updatedb.conf(5). updatedb(8)
- # should not index files under GlusterFS, indexing will slow down GlusteFS
- # if the filesystem is several TB in size.
- test -f $UPDATEDBCONF && {
- if ! grep -q 'glusterfs' $UPDATEDBCONF; then
- sed 's/\(PRUNEFS.*\)"/\1 fuse.glusterfs"/' $UPDATEDBCONF \
- > ${UPDATEDBCONF}.bak
- mv -f ${UPDATEDBCONF}.bak $UPDATEDBCONF
- fi
- }
+ update_updatedb;
start_glusterfs;
}
diff --git a/xlators/mount/fuse/utils/mount_glusterfs.in b/xlators/mount/fuse/utils/mount_glusterfs.in
index b12b4e04e..539b0f558 100755
--- a/xlators/mount/fuse/utils/mount_glusterfs.in
+++ b/xlators/mount/fuse/utils/mount_glusterfs.in
@@ -1,188 +1,538 @@
#!/bin/sh
-# (C) 2008 Gluster Inc. <http://www.gluster.com>
-#
+# (C) 2014 Red Hat Inc. <http://www.redhat.com>
+#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of
# the License, or (at your option) any later version.
-#
+#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
-#
+#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free
# Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301 USA
+warn ()
+{
+ echo "$@" >/dev/stderr
+}
_init ()
{
+
# log level definitions
LOG_NONE=NONE;
LOG_CRITICAL=CRITICAL;
LOG_ERROR=ERROR;
LOG_WARNING=WARNING;
- LOG_INFO=INFO;
+ LOG_INFO=INFO
LOG_DEBUG=DEBUG;
+ LOG_TRACE=TRACE;
- # set default log level to ERROR
- log_level=$LOG_INFO;
-}
+ HOST_NAME_MAX=64;
-start_glusterfs ()
-{
prefix="@prefix@";
exec_prefix=@exec_prefix@;
cmd_line=$(echo "@sbindir@/glusterfs");
-
+
+ alias lsL='ls -L'
+ uname_s=`uname -s`
+ case ${uname_s} in
+ Darwin)
+ getinode="stat -f %i"
+ getdev="stat -f %d"
+ ;;
+ esac
+}
+
+is_valid_hostname ()
+{
+ local server=$1
+
+ length=$(echo $server | wc -c)
+ if [ ${length} -gt ${HOST_NAME_MAX} ]; then
+ return 1
+ fi
+}
+
+parse_backup_volfile_servers ()
+{
+ local server_list=$1
+ local servers=""
+ local new_servers=""
+
+ servers=$(echo ${server_list} | sed 's/\:/ /g')
+ for server in ${servers}; do
+ is_valid_hostname ${server}
+ if [ $? -eq 1 ]; then
+ continue
+ fi
+ new_servers=$(echo "${new_servers} ${server}")
+ done
+
+ echo ${new_servers}
+}
+
+parse_volfile_servers ()
+{
+ local server_list=$1
+ local servers=""
+ local new_servers=""
+
+ servers=$(echo ${server_list} | sed 's/,/ /g')
+ for server in ${servers}; do
+ is_valid_hostname ${server}
+ if [ $? -eq 1 ]; then
+ continue
+ fi
+ new_servers=$(echo "${new_servers} ${server}")
+ done
+
+ echo ${new_servers}
+}
+
+start_glusterfs ()
+{
if [ -n "$log_level_str" ]; then
- case "$log_level_str" in
- "ERROR")
- log_level=$LOG_ERROR;
- ;;
+ case "$( echo $log_level_str | awk '{print toupper($0)}')" in
+ "ERROR")
+ log_level=$LOG_ERROR;
+ ;;
"INFO")
log_level=$LOG_INFO;
;;
- "DEBUG")
- log_level=$LOG_DEBUG;
- ;;
- "CRITICAL")
- log_level=$LOG_CRITICAL;
- ;;
- "WARNING")
- log_level=$LOG_WARNING;
- ;;
- "NONE")
- log_level=$LOG_NONE;
- ;;
- *)
- echo "invalid log level $log_level_str, using INFO";
- log_level=$LOG_INFO;
- ;;
- esac
- fi
- cmd_line=$(echo "$cmd_line --log-level=$log_level");
-
- if [ -n "$log_file" ]; then
- cmd_line=$(echo "$cmd_line --log-file=$log_file");
+ "DEBUG")
+ log_level=$LOG_DEBUG;
+ ;;
+ "CRITICAL")
+ log_level=$LOG_CRITICAL;
+ ;;
+ "WARNING")
+ log_level=$LOG_WARNING;
+ ;;
+ "TRACE")
+ log_level=$LOG_TRACE;
+ ;;
+ "NONE")
+ log_level=$LOG_NONE;
+ ;;
+ *)
+ warn "invalid log level $log_level_str, using INFO";
+ log_level=$LOG_INFO;
+ ;;
+ esac
+ fi
+
+ # options without values start here
+ if [ -n "$read_only" ]; then
+ cmd_line=$(echo "$cmd_line --read-only");
+ fi
+
+ if [ -n "$acl" ]; then
+ cmd_line=$(echo "$cmd_line --acl");
+ fi
+
+ if [ -n "$selinux" ]; then
+ cmd_line=$(echo "$cmd_line --selinux");
+ fi
+
+ if [ -n "$enable_ino32" ]; then
+ cmd_line=$(echo "$cmd_line --enable-ino32");
+ fi
+
+ if [ -n "$worm" ]; then
+ cmd_line=$(echo "$cmd_line --worm");
+ fi
+ if [ -n "$volfile_max_fetch_attempts" ]; then
+ cmd_line=$(echo "$cmd_line --volfile-max-fetch-attempts=$volfile_max_fetch_attempts")
+ fi
+
+ if [ -n "$fopen_keep_cache" ]; then
+ cmd_line=$(echo "$cmd_line --fopen-keep-cache");
fi
if [ -n "$volfile_check" ]; then
- cmd_line=$(echo "$cmd_line --volfile-check");
+ cmd_line=$(echo "$cmd_line --volfile-check");
+ fi
+
+ if [ -n "$mem_accounting" ]; then
+ cmd_line=$(echo "$cmd_line --mem-accounting");
+ fi
+
+ if [ -n "$aux_gfid_mount" ]; then
+ cmd_line=$(echo "$cmd_line --aux-gfid-mount");
+ fi
+
+ if [ -n "$no_root_squash" ]; then
+ cmd_line=$(echo "$cmd_line --no-root-squash");
+ fi
+
+#options with values start here
+ if [ -n "$log_level" ]; then
+ cmd_line=$(echo "$cmd_line --log-level=$log_level");
+ fi
+
+ if [ -n "$log_file" ]; then
+ cmd_line=$(echo "$cmd_line --log-file=$log_file");
fi
if [ -n "$direct_io_mode" ]; then
- cmd_line=$(echo "$cmd_line --disable-direct-io-mode");
+ cmd_line=$(echo "$cmd_line --direct-io-mode=$direct_io_mode");
fi
-
- if [ -z "$volfile_loc" ]; then
- if [ -n "$transport" ]; then
- cmd_line=$(echo "$cmd_line \
---volfile-server=$server_ip \
---volfile-server-transport=$transport");
- else
- cmd_line=$(echo "$cmd_line \
---volfile-server=$server_ip");
- fi
- else
- cmd_line=$(echo "$cmd_line --volfile=$volfile_loc");
+
+ if [ -n "$mac_compat" ]; then
+ cmd_line=$(echo "$cmd_line --mac-compat=$mac_compat");
+ fi
+
+ if [ -n "$use_readdirp" ]; then
+ cmd_line=$(echo "$cmd_line --use-readdirp=$use_readdirp");
fi
if [ -n "$volume_name" ]; then
cmd_line=$(echo "$cmd_line --volume-name=$volume_name");
fi
-
- if [ -n "$volume_id" ]; then
- cmd_line=$(echo "$cmd_line --volfile-id=$volume_id");
+
+ if [ -n "$attribute_timeout" ]; then
+ cmd_line=$(echo "$cmd_line --attribute-timeout=$attribute_timeout");
+ fi
+
+ if [ -n "$entry_timeout" ]; then
+ cmd_line=$(echo "$cmd_line --entry-timeout=$entry_timeout");
+ fi
+
+ if [ -n "$negative_timeout" ]; then
+ cmd_line=$(echo "$cmd_line --negative-timeout=$negative_timeout");
+ fi
+
+ if [ -n "$gid_timeout" ]; then
+ cmd_line=$(echo "$cmd_line --gid-timeout=$gid_timeout");
+ fi
+
+ if [ -n "$bg_qlen" ]; then
+ cmd_line=$(echo "$cmd_line --background-qlen=$bg_qlen");
+ fi
+
+ if [ -n "$cong_threshold" ]; then
+ cmd_line=$(echo "$cmd_line --congestion-threshold=$cong_threshold");
+ fi
+
+ if [ -n "$fuse_mountopts" ]; then
+ cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
+ fi
+
+ if [ -n "$xlator_option" ]; then
+ cmd_line=$(echo "$cmd_line --xlator-option=$xlator_option");
+ fi
+
+ # for rdma volume, we have to fetch volfile with '.rdma' added
+ # to volume name, so that it fetches the right client vol file
+ volume_id_rdma="";
+
+ if [ -z "$volfile_loc" ]; then
+ if [ -n "$server_ip" ]; then
+
+ servers=$(parse_volfile_servers ${server_ip});
+ if [ -n "$servers" ]; then
+ for i in $(echo ${servers}); do
+ cmd_line=$(echo "$cmd_line --volfile-server=$i");
+ done
+ else
+ warn "ERROR: No valid servers found on command line.. exiting"
+ print_usage
+ exit 1
+ fi
+
+ if [ -n "$backupvolfile_server" ]; then
+ if [ -z "$backup_volfile_servers" ]; then
+ is_valid_hostname ${backupvolfile_server};
+ if [ $? -eq 1 ]; then
+ warn "ERROR: Invalid backup server specified.. exiting"
+ exit 1
+ fi
+ cmd_line=$(echo "$cmd_line --volfile-server=$backupvolfile_server");
+ fi
+ fi
+
+ if [ -n "$backup_volfile_servers" ]; then
+ backup_servers=$(parse_backup_volfile_servers ${backup_volfile_servers})
+ for i in $(echo ${backup_servers}); do
+ cmd_line=$(echo "$cmd_line --volfile-server=$i");
+ done
+ fi
+
+ if [ -n "$server_port" ]; then
+ cmd_line=$(echo "$cmd_line --volfile-server-port=$server_port");
+ fi
+
+ if [ -n "$transport" ]; then
+ cmd_line=$(echo "$cmd_line --volfile-server-transport=$transport");
+ if [ "$transport" = "rdma" ]; then
+ volume_id_rdma=".rdma";
+ fi
+ fi
+
+ if [ -n "$volume_id" ]; then
+ if [ -n "$volume_id_rdma" ]; then
+ volume_id="$volume_id$volume_id_rdma";
+ fi
+ cmd_line=$(echo "$cmd_line --volfile-id=$volume_id");
+ fi
+ fi
+ else
+ cmd_line=$(echo "$cmd_line --volfile=$volfile_loc");
+ fi
+
+ if [ -n "$fuse_mountopts" ]; then
+ cmd_line=$(echo "$cmd_line --fuse-mountopts=$fuse_mountopts");
fi
cmd_line=$(echo "$cmd_line $mount_point");
- exec $cmd_line;
+ $cmd_line;
+
+ if [ $? -ne 0 ]; then
+ exit 1;
+ fi
}
+print_usage ()
+{
+cat << EOF >/dev/stderr
+Usage: $0 <volumeserver>:<volumeid/volumeport> -o<options> <mountpoint>
+Options:
+man 8 $0
+To display the version number of the mount helper: $0 -V
+EOF
+}
-main ()
+with_options()
{
-
- new_log_level=""
- log_file=""
- transport=""
- direct_io_mode=""
- volume_name=""
- new_fs_options=""
- volfile_check=""
-
- while getopts o: opt; do
- case "$opt" in
- o)
- options=$(echo $OPTARG | sed -n 's/.*\-o[ ]*\([^ ]*\).*/\1/p');
- [ -z $new_log_level ] && {
- new_log_level=$(echo "$options" | sed -n 's/.*log-level=\([^,]*\).*/\1/p');
- }
-
- [ -z $log_file ] && {
- log_file=$(echo "$options" | sed -n 's/.*log-file=\([^,]*\).*/\1/p');
- }
-
- [ -z $transport ] && {
- transport=$(echo "$options" | sed -n 's/.*transport=\([^,]*\).*/\1/p');
- }
-
- [ -z $direct_io_mode ] && {
- direct_io_mode=$(echo "$options" | sed -n 's/.*direct-io-mode=\([^,]*\).*/\1/p');
- }
-
- [ -z $volfile_check ] && {
- volfile_check=$(echo "$options" | sed -n 's/.*volfile-check=\([^,]*\).*/\1/p');
- }
-
- [ -z $volume_name ] && {
- volume_name=$(echo "$options" | sed -n 's/.*volume-name=\([^,]*\).*/\1/p');
- }
-
- [ -z $volume_id ] && {
- volume_id=$(echo "$options" | sed -n 's/.*volume-id=\([^,]*\).*/\1/p');
- }
-
- this_option=$(echo "$options" | sed -e 's/[,]*log-file=[^,]*//' \
- -e 's/[,]*log-level=[^,]*//' \
- -e 's/[,]*volume-name=[^,]*//' \
- -e 's/[,]*volfile-check=[^,]*//' \
- -e 's/[,]*direct-io-mode=[^,]*//' \
- -e 's/[,]*transport=[^,]*//' \
- -e 's/[,]*volume-id=[^,]*//');
- new_fs_options="$new_fs_options $this_option";
- ;;
- esac
+ local key=$1
+ local value=$2
+
+ # Handle options with values.
+ case "$key" in
+ "log-level")
+ log_level_str=$value
+ ;;
+ "log-file")
+ log_file=$value
+ ;;
+ "transport")
+ transport=$value
+ ;;
+ "direct-io-mode")
+ direct_io_mode=$value
+ ;;
+ "mac-compat")
+ mac_compat=$value
+ ;;
+ "volume-name")
+ volume_name=$value
+ ;;
+ "volume-id")
+ volume_id=$value
+ ;;
+ "volfile-check")
+ volfile_check=$value
+ ;;
+ "server-port")
+ server_port=$value
+ ;;
+ "attribute-timeout")
+ attribute_timeout=$value
+ ;;
+ "entry-timeout")
+ entry_timeout=$value
+ ;;
+ "negative-timeout")
+ negative_timeout=$value
+ ;;
+ "gid-timeout")
+ gid_timeout=$value
+ ;;
+ "background-qlen")
+ bg_qlen=$value
+ ;;
+ "backup-volfile-servers")
+ backup_volfile_servers=$value
+ ;;
+ "backupvolfile-server")
+ backupvolfile_server=$value
+ ;;
+ "fetch-attempts")
+ volfile_max_fetch_attempts=$value
+ ;;
+ "congestion-threshold")
+ cong_threshold=$value
+ ;;
+ "xlator-option")
+ xlator_option=$value
+ ;;
+ "fuse-mountopts")
+ fuse_mountopts=$value
+ ;;
+ "use-readdirp")
+ use_readdirp=$value
+ ;;
+ "no-root-squash")
+ if [ $value == "yes" ] ||
+ [ $value == "on" ] ||
+ [ $value == "enable" ] ||
+ [ $value == "true" ] ; then
+ no_root_squash=1;
+ fi ;;
+ "root-squash")
+ if [ $value == "no" ] ||
+ [ $value == "off" ] ||
+ [ $value == "disable" ] ||
+ [ $value == "false" ] ; then
+ no_root_squash=1;
+ fi ;;
+ *)
+ warn "Invalid option: $key"
+ exit 1
+ ;;
+ esac
+}
+
+without_options()
+{
+ local option=$1
+ # Handle options without values.
+ case "$option" in
+ "ro")
+ read_only=1
+ ;;
+ "acl")
+ acl=1
+ ;;
+ "selinux")
+ selinux=1
+ ;;
+ "worm")
+ worm=1
+ ;;
+ "fopen-keep-cache")
+ fopen_keep_cache=1
+ ;;
+ "enable-ino32")
+ enable_ino32=1
+ ;;
+ "mem-accounting")
+ mem_accounting=1
+ ;;
+ "aux-gfid-mount")
+ if [ ${uname_s} = "Linux" ]; then
+ aux_gfid_mount=1
+ fi
+ ;;
+ # "mount -t glusterfs" sends this, but it's useless.
+ "rw")
+ ;;
+ # these ones are interpreted during system initialization
+ "noauto")
+ ;;
+ "_netdev")
+ ;;
+ *)
+ warn "Invalid option $option";
+ exit 1
+ ;;
+ esac
+}
+
+parse_options()
+{
+ local optarg=${1}
+ for pair in $(echo ${optarg//,/ }); do
+ key=$(echo "$pair" | cut -f1 -d'=');
+ value=$(echo "$pair" | cut -f2- -d'=');
+ if [ "$key" = "$value" ]; then
+ without_options $pair;
+ else
+ with_options $key $value;
+ fi
done
+}
- [ -n "$new_log_level" ] && {
- log_level_str="$new_log_level";
- }
+main ()
+{
+ ## `mount` on OSX specifies options as first argument
+ if [[ $1 =~ "-o" ]]; then
+ volfile_loc=$3
+ mount_point=$4
+ else
+ volfile_loc=$1
+ mount_point=$2
+ fi
- # TODO: use getopt. This is very much darwin specific
- volfile_loc="$1";
- while [ "$volfile_loc" = "-o" ] ; do
- shift ;
- shift ;
- volfile_loc="$1";
+ while getopts "Vo:h" opt; do
+ case "${opt}" in
+ o)
+ parse_options ${OPTARG};
+ ;;
+ V)
+ ${cmd_line} -V;
+ exit 0;
+ ;;
+ h)
+ print_usage;
+ exit 0;
+ ;;
+ ?)
+ print_usage;
+ exit 0;
+ ;;
+ esac
done
-
+
[ -r "$volfile_loc" ] || {
server_ip=$(echo "$volfile_loc" | sed -n 's/\([a-zA-Z0-9:.\-]*\):.*/\1/p');
- volume_id=$(echo "$volfile_loc" | sed -n 's/[a-zA-Z0-9:.\-]*:\(.*\)/\1/p');
- volfile_loc="";
+ volume_str=$(echo "$volfile_loc" | sed -n 's/.*:\([^ ]*\).*/\1/p');
+ [ -n "$volume_str" ] && {
+ volume_id="$volume_str";
+ }
+ volfile_loc="";
+ }
+
+ [ -z "$volume_id" -o -z "$server_ip" ] && {
+ cat <<EOF >/dev/stderr
+ERROR: Server name/volume name unspecified cannot proceed further..
+Please specify correct format
+Usage:
+man 8 $0
+EOF
+ exit 1;
+ }
+
+ grep_ret=$(echo ${mount_point} | grep '^\-o');
+ [ "x" != "x${grep_ret}" ] && {
+ cat <<EOF >/dev/stderr
+ERROR: -o options cannot be specified in either first two arguments..
+Please specify correct style
+Usage:
+man 8 $0
+EOF
+ exit 1;
+ }
+
+ # No need to do a ! -d test, it is taken care while initializing the
+ # variable mount_point
+ [ -z "$mount_point" -o ! -d "$mount_point" ] && {
+ cat <<EOF >/dev/stderr
+ERROR: Mount point does not exist
+Please specify a mount point
+Usage:
+man 8 $0
+EOF
+ exit 1;
}
- # following line is product of love towards sed
- # $2=$(echo "$@" | sed -n 's/[^ ]* \([^ ]*\).*/\1/p');
-
- mount_point="$2";
- fs_options=$(echo "$fs_options,$new_fs_options");
-
start_glusterfs;
}
diff --git a/xlators/nfs/server/src/Makefile.am b/xlators/nfs/server/src/Makefile.am
index e22489606..62fbf6535 100644
--- a/xlators/nfs/server/src/Makefile.am
+++ b/xlators/nfs/server/src/Makefile.am
@@ -1,19 +1,20 @@
xlator_LTLIBRARIES = server.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/nfs
nfsrpclibdir = $(top_srcdir)/rpc/rpc-lib/src
-server_la_LDFLAGS = -module -avoidversion
+server_la_LDFLAGS = -module -avoid-version
server_la_SOURCES = nfs.c nfs-common.c nfs-fops.c nfs-inodes.c \
nfs-generics.c mount3.c nfs3-fh.c nfs3.c nfs3-helpers.c nlm4.c \
- nlmcbk_svc.c mount3udp_svc.c
+ nlmcbk_svc.c mount3udp_svc.c acl3.c
server_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
noinst_HEADERS = nfs.h nfs-common.h nfs-fops.h nfs-inodes.h nfs-generics.h \
- mount3.h nfs3-fh.h nfs3.h nfs3-helpers.h nfs-mem-types.h nlm4.h
+ mount3.h nfs3-fh.h nfs3.h nfs3-helpers.h nfs-mem-types.h nlm4.h \
+ acl3.h
AM_CPPFLAGS = $(GF_CPPFLAGS) \
-DLIBDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/auth\" \
-I$(top_srcdir)/libglusterfs/src \
- -I$(nfsrpclibdir) -I$(CONTRIBDIR)/rbtree\
+ -I$(nfsrpclibdir) -I$(CONTRIBDIR)/rbtree \
-I$(top_srcdir)/rpc/xdr/src/
AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/nfs/server/src/acl3.c b/xlators/nfs/server/src/acl3.c
new file mode 100644
index 000000000..43156eb44
--- /dev/null
+++ b/xlators/nfs/server/src/acl3.c
@@ -0,0 +1,843 @@
+/*
+ * Copyright (c) 2012-2013 Red Hat, Inc. <http://www.redhat.com>
+ * This file is part of GlusterFS.
+ *
+ * This file is licensed to you under your choice of the GNU Lesser
+ * General Public License, version 3 or any later version (LGPLv3 or
+ * later), or the GNU General Public License, version 2 (GPLv2), in all
+ * cases as published by the Free Software Foundation.
+ */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "defaults.h"
+#include "rpcsvc.h"
+#include "dict.h"
+#include "xlator.h"
+#include "nfs.h"
+#include "mem-pool.h"
+#include "logging.h"
+#include "nfs-fops.h"
+#include "inode.h"
+#include "nfs3.h"
+#include "nfs-mem-types.h"
+#include "nfs3-helpers.h"
+#include "nfs3-fh.h"
+#include "nfs-generics.h"
+#include "acl3.h"
+#include "byte-order.h"
+
+static int
+acl3_nfs_acl_to_xattr (aclentry *ace, void *xattrbuf,
+ int aclcount, int defacl);
+
+static int
+acl3_nfs_acl_from_xattr (aclentry *ace, void *xattrbuf,
+ int bufsize, int defacl);
+
+typedef ssize_t (*acl3_serializer) (struct iovec outmsg, void *args);
+
+extern void nfs3_call_state_wipe (nfs3_call_state_t *cs);
+
+extern nfs3_call_state_t *
+nfs3_call_state_init (struct nfs3_state *s, rpcsvc_request_t *req, xlator_t *v);
+
+extern int
+nfs3_fh_validate (struct nfs3_fh *fh);
+
+extern fattr3
+nfs3_stat_to_fattr3 (struct iatt *buf);
+
+#define acl3_validate_nfs3_state(request, state, status, label, retval) \
+ do { \
+ state = rpcsvc_request_program_private (request); \
+ if (!state) { \
+ gf_log (GF_ACL, GF_LOG_ERROR, "NFSv3 state " \
+ "missing from RPC request"); \
+ rpcsvc_request_seterr (req, SYSTEM_ERR); \
+ status = NFS3ERR_SERVERFAULT; \
+ goto label; \
+ } \
+ } while (0); \
+
+#define acl3_validate_gluster_fh(handle, status, errlabel) \
+ do { \
+ if (!nfs3_fh_validate (handle)) { \
+ gf_log (GF_ACL, GF_LOG_ERROR, "Bad Handle"); \
+ status = NFS3ERR_BADHANDLE; \
+ goto errlabel; \
+ } \
+ } while (0) \
+
+
+extern xlator_t *
+nfs3_fh_to_xlator (struct nfs3_state *nfs3, struct nfs3_fh *fh);
+
+#define acl3_map_fh_to_volume(nfs3state, handle, req, volume, status, label) \
+ do { \
+ char exportid[256], gfid[256]; \
+ rpc_transport_t *trans = NULL; \
+ volume = nfs3_fh_to_xlator ((nfs3state), handle); \
+ if (!volume) { \
+ uuid_unparse (handle->exportid, exportid); \
+ uuid_unparse (handle->gfid, gfid); \
+ trans = rpcsvc_request_transport (req); \
+ gf_log (GF_ACL, GF_LOG_ERROR, "Failed to map " \
+ "FH to vol: client=%s, exportid=%s, gfid=%s",\
+ trans->peerinfo.identifier, exportid, \
+ gfid); \
+ gf_log (GF_ACL, GF_LOG_ERROR, \
+ "Stale nfs client %s must be trying to "\
+ "connect to a deleted volume, please " \
+ "unmount it.", trans->peerinfo.identifier);\
+ status = NFS3ERR_STALE; \
+ goto label; \
+ } else { \
+ gf_log (GF_ACL, GF_LOG_TRACE, "FH to Volume: %s"\
+ ,volume->name); \
+ rpcsvc_request_set_private (req, volume); \
+ } \
+ } while (0); \
+
+#define acl3_volume_started_check(nfs3state, vlm, rtval, erlbl) \
+ do { \
+ if ((!nfs_subvolume_started (nfs_state (nfs3state->nfsx), vlm))){\
+ gf_log (GF_ACL, GF_LOG_ERROR, "Volume is disabled: %s",\
+ vlm->name); \
+ rtval = RPCSVC_ACTOR_IGNORE; \
+ goto erlbl; \
+ } \
+ } while (0) \
+
+#define acl3_check_fh_resolve_status(cst, nfstat, erlabl) \
+ do { \
+ xlator_t *xlatorp = NULL; \
+ char buf[256], gfid[256]; \
+ rpc_transport_t *trans = NULL; \
+ if ((cst)->resolve_ret < 0) { \
+ trans = rpcsvc_request_transport (cst->req); \
+ xlatorp = nfs3_fh_to_xlator (cst->nfs3state, \
+ &cst->resolvefh); \
+ uuid_unparse (cst->resolvefh.gfid, gfid); \
+ snprintf (buf, sizeof (buf), "(%s) %s : %s", \
+ trans->peerinfo.identifier, \
+ xlatorp ? xlatorp->name : "ERR", \
+ gfid); \
+ gf_log (GF_ACL, GF_LOG_ERROR, "Unable to resolve FH"\
+ ": %s", buf); \
+ nfstat = nfs3_errno_to_nfsstat3 (cst->resolve_errno);\
+ goto erlabl; \
+ } \
+ } while (0) \
+
+#define acl3_handle_call_state_init(nfs3state, calls, rq, v, opstat, errlabel)\
+ do { \
+ calls = nfs3_call_state_init ((nfs3state), (rq), v); \
+ if (!calls) { \
+ gf_log (GF_ACL, GF_LOG_ERROR, "Failed to " \
+ "init call state"); \
+ opstat = NFS3ERR_SERVERFAULT; \
+ rpcsvc_request_seterr (req, SYSTEM_ERR); \
+ goto errlabel; \
+ } \
+ } while (0) \
+
+
+int
+acl3svc_submit_reply (rpcsvc_request_t *req, void *arg, acl3_serializer sfunc)
+{
+ struct iovec outmsg = {0, };
+ struct iobuf *iob = NULL;
+ struct nfs3_state *nfs3 = NULL;
+ int ret = -1;
+ ssize_t msglen = 0;
+ struct iobref *iobref = NULL;
+
+ if (!req)
+ return -1;
+
+ nfs3 = (struct nfs3_state *)rpcsvc_request_program_private (req);
+ if (!nfs3) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "mount state not found");
+ goto ret;
+ }
+
+ /* First, get the io buffer into which the reply in arg will
+ * be serialized.
+ */
+ iob = iobuf_get (nfs3->iobpool);
+ if (!iob) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Failed to get iobuf");
+ goto ret;
+ }
+
+ iobuf_to_iovec (iob, &outmsg);
+ /* Use the given serializer to translate the give C structure in arg
+ * to XDR format which will be written into the buffer in outmsg.
+ */
+ msglen = sfunc (outmsg, arg);
+ if (msglen < 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Failed to encode message");
+ goto ret;
+ }
+ outmsg.iov_len = msglen;
+
+ iobref = iobref_new ();
+ if (iobref == NULL) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Failed to get iobref");
+ goto ret;
+ }
+
+ ret = iobref_add (iobref, iob);
+ if (ret) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Failed to add iob to iobref");
+ goto ret;
+ }
+
+ /* Then, submit the message for transmission. */
+ ret = rpcsvc_submit_message (req, &outmsg, 1, NULL, 0, iobref);
+ if (ret == -1) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Reply submission failed");
+ goto ret;
+ }
+
+ ret = 0;
+ret:
+ if (iob)
+ iobuf_unref (iob);
+ if (iobref)
+ iobref_unref (iobref);
+
+ return ret;
+}
+
+
+int
+acl3svc_null (rpcsvc_request_t *req)
+{
+ struct iovec dummyvec = {0, };
+
+ if (!req) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Got NULL request!");
+ return 0;
+ }
+ rpcsvc_submit_generic (req, &dummyvec, 1, NULL, 0, NULL);
+ return 0;
+}
+
+int
+acl3_getacl_reply (rpcsvc_request_t *req, getaclreply *reply)
+{
+ acl3svc_submit_reply (req, (void *)reply,
+ (acl3_serializer)xdr_serialize_getaclreply);
+ return 0;
+}
+
+int
+acl3_setacl_reply (rpcsvc_request_t *req, setaclreply *reply)
+{
+ acl3svc_submit_reply (req, (void *)reply,
+ (acl3_serializer)xdr_serialize_setaclreply);
+ return 0;
+}
+
+
+int
+acl3_getacl_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ nfsstat3 stat = NFS3ERR_SERVERFAULT;
+ nfs3_call_state_t *cs = NULL;
+ data_t *data = NULL;
+ getaclreply *getaclreply = NULL;
+ int aclcount = 0;
+ int defacl = 1; /* DEFAULT ACL */
+
+ if (!frame->local) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Invalid argument,"
+ " frame->local NULL");
+ return EINVAL;
+ }
+ cs = frame->local;
+ getaclreply = &cs->args.getaclreply;
+ if (op_ret < 0) {
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
+ goto err;
+ }
+
+ getaclreply->aclentry.aclentry_val = cs->aclentry;
+ getaclreply->daclentry.daclentry_val = cs->daclentry;
+
+ /* getfacl: NFS USER ACL */
+ data = dict_get (dict, POSIX_ACL_ACCESS_XATTR);
+ if (data && data->data) {
+ aclcount = acl3_nfs_acl_from_xattr (cs->aclentry,
+ data->data,
+ data->len,
+ !defacl);
+ if (aclcount < 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR,
+ "Failed to get USER ACL");
+ stat = nfs3_errno_to_nfsstat3 (-aclcount);
+ goto err;
+ }
+
+ getaclreply->aclcount = aclcount;
+ getaclreply->aclentry.aclentry_len = aclcount;
+ }
+
+ /* getfacl: NFS DEFAULT ACL */
+ data = dict_get (dict, POSIX_ACL_DEFAULT_XATTR);
+ if (data && data->data) {
+ aclcount = acl3_nfs_acl_from_xattr (cs->daclentry,
+ data->data,
+ data->len,
+ defacl);
+ if (aclcount < 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR,
+ "Failed to get DEFAULT ACL");
+ stat = nfs3_errno_to_nfsstat3 (-aclcount);
+ goto err;
+ }
+
+ getaclreply->daclcount = aclcount;
+ getaclreply->daclentry.daclentry_len = aclcount;
+ }
+
+ acl3_getacl_reply (cs->req, getaclreply);
+ nfs3_call_state_wipe (cs);
+ return 0;
+
+err:
+ if (getaclreply)
+ getaclreply->status = stat;
+ acl3_getacl_reply (cs->req, getaclreply);
+ nfs3_call_state_wipe (cs);
+ return 0;
+}
+
+int
+acl3_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
+{
+ nfsstat3 stat = NFS3ERR_SERVERFAULT;
+ nfs3_call_state_t *cs = NULL;
+ getaclreply *getaclreply = NULL;
+ int ret = -1;
+ nfs_user_t nfu = {0, };
+ uint64_t deviceid = 0;
+
+ if (!frame->local) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Invalid argument,"
+ " frame->local NULL");
+ return EINVAL;
+ }
+
+ cs = frame->local;
+ getaclreply = &cs->args.getaclreply;
+
+ if (op_ret == -1) {
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
+ goto err;
+ }
+
+ /* Fill the attrs before xattrs */
+ getaclreply->attr_follows = TRUE;
+ deviceid = nfs3_request_xlator_deviceid (cs->req);
+ nfs3_map_deviceid_to_statdev (buf, deviceid);
+ getaclreply->attr = nfs3_stat_to_fattr3 (buf);
+
+ nfs_request_user_init (&nfu, cs->req);
+ ret = nfs_getxattr (cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+ NULL, NULL, acl3_getacl_cbk, cs);
+ if (ret < 0) {
+ stat = nfs3_errno_to_nfsstat3 (-ret);
+ goto err;
+ }
+ return 0;
+err:
+ getaclreply->status = stat;
+ acl3_getacl_reply (cs->req, getaclreply);
+ nfs3_call_state_wipe (cs);
+ return 0;
+}
+
+
+int
+acl3_getacl_resume (void *carg)
+{
+ int ret = -1;
+ nfs3_call_state_t *cs = NULL;
+ nfsstat3 stat = NFS3ERR_SERVERFAULT;
+ nfs_user_t nfu = {0, };
+
+ if (!carg)
+ return ret;
+
+ cs = (nfs3_call_state_t *)carg;
+ acl3_check_fh_resolve_status (cs, stat, acl3err);
+ nfs_request_user_init (&nfu, cs->req);
+
+ ret = nfs_stat (cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+ acl3_stat_cbk, cs);
+ stat = -ret;
+acl3err:
+ if (ret < 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "unable to open_and_resume");
+ cs->args.getaclreply.status = nfs3_errno_to_nfsstat3 (stat);
+ acl3_getacl_reply (cs->req, &cs->args.getaclreply);
+ nfs3_call_state_wipe (cs);
+ }
+
+ return ret;
+}
+
+
+int
+acl3svc_getacl (rpcsvc_request_t *req)
+{
+ xlator_t *vol = NULL;
+ struct nfs_state *nfs = NULL;
+ nfs3_state_t *nfs3 = NULL;
+ nfs3_call_state_t *cs = NULL;
+ int ret = RPCSVC_ACTOR_ERROR;
+ nfsstat3 stat = NFS3ERR_SERVERFAULT;
+ struct nfs3_fh fh, *fhp = NULL;
+ getaclargs getaclargs;
+ getaclreply getaclreply;
+
+ if (!req)
+ return ret;
+
+ acl3_validate_nfs3_state (req, nfs3, stat, rpcerr, ret);
+ nfs = nfs_state (nfs3->nfsx);
+ memset (&getaclargs, 0, sizeof (getaclargs));
+ memset (&getaclreply, 0, sizeof (getaclreply));
+ getaclargs.fh.n_bytes = (char *)&fh;
+ if (xdr_to_getaclargs(req->msg[0], &getaclargs) <= 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Error decoding args");
+ rpcsvc_request_seterr (req, GARBAGE_ARGS);
+ goto rpcerr;
+ }
+
+ /* Validate ACL mask */
+ if (getaclargs.mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) {
+ stat = NFS3ERR_INVAL;
+ goto acl3err;
+ }
+
+ fhp = &fh;
+ acl3_validate_gluster_fh (&fh, stat, acl3err);
+ acl3_map_fh_to_volume (nfs->nfs3state, fhp, req, vol, stat, acl3err);
+ acl3_volume_started_check (nfs3, vol, ret, rpcerr);
+ acl3_handle_call_state_init (nfs->nfs3state, cs, req,
+ vol, stat, acl3err);
+
+ cs->vol = vol;
+ cs->args.getaclreply.mask = getaclargs.mask;
+
+ ret = nfs3_fh_resolve_and_resume (cs, fhp, NULL, acl3_getacl_resume);
+ stat = nfs3_errno_to_nfsstat3 (-ret);
+
+acl3err:
+ if (ret < 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "unable to resolve and resume");
+ getaclreply.status = stat;
+ acl3_getacl_reply (req, &getaclreply);
+ nfs3_call_state_wipe (cs);
+ return 0;
+ }
+
+rpcerr:
+ return ret;
+}
+
+int
+acl3_setacl_cbk (call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ dict_t *xdata)
+{
+ nfs3_call_state_t *cs = NULL;
+ cs = frame->local;
+ if (op_ret < 0) {
+ nfsstat3 status = nfs3_cbk_errno_status (op_ret, op_errno);
+ cs->args.setaclreply.status = status;
+ }
+
+ acl3_setacl_reply (cs->req, &cs->args.setaclreply);
+
+ return 0;
+}
+
+int
+acl3_setacl_resume (void *carg)
+{
+ int ret = -1;
+ nfs3_call_state_t *cs = NULL;
+ nfsstat3 stat = NFS3ERR_SERVERFAULT;
+ nfs_user_t nfu = {0, };
+ dict_t *xattr = NULL;
+
+ if (!carg)
+ return ret;
+ cs = (nfs3_call_state_t *)carg;
+ acl3_check_fh_resolve_status (cs, stat, acl3err);
+ nfs_request_user_init (&nfu, cs->req);
+ xattr = dict_new();
+ if (cs->aclcount)
+ ret = dict_set_static_bin (xattr, POSIX_ACL_ACCESS_XATTR,
+ cs->aclxattr,
+ posix_acl_xattr_size (cs->aclcount));
+ if (cs->daclcount)
+ ret = dict_set_static_bin (xattr, POSIX_ACL_DEFAULT_XATTR,
+ cs->daclxattr,
+ posix_acl_xattr_size (cs->daclcount));
+
+ ret = nfs_setxattr (cs->nfsx, cs->vol, &nfu, &cs->resolvedloc, xattr,
+ 0, NULL, acl3_setacl_cbk, cs);
+ dict_unref (xattr);
+
+acl3err:
+ if (ret < 0) {
+ stat = -ret;
+ gf_log (GF_ACL, GF_LOG_ERROR, "unable to open_and_resume");
+ cs->args.setaclreply.status = nfs3_errno_to_nfsstat3 (stat);
+ acl3_setacl_reply (cs->req, &cs->args.setaclreply);
+ nfs3_call_state_wipe (cs);
+ }
+
+ return ret;
+}
+
+
+int
+acl3svc_setacl (rpcsvc_request_t *req)
+{
+ xlator_t *vol = NULL;
+ struct nfs_state *nfs = NULL;
+ nfs3_state_t *nfs3 = NULL;
+ nfs3_call_state_t *cs = NULL;
+ int ret = RPCSVC_ACTOR_ERROR;
+ nfsstat3 stat = NFS3ERR_SERVERFAULT;
+ struct nfs3_fh fh;
+ struct nfs3_fh *fhp = NULL;
+ setaclargs setaclargs;
+ setaclreply setaclreply;
+ aclentry *daclentry = NULL;
+ aclentry *aclentry = NULL;
+ int aclerrno = 0;
+ int defacl = 1;
+
+ if (!req)
+ return ret;
+ aclentry = GF_CALLOC (NFS_ACL_MAX_ENTRIES, sizeof(*aclentry),
+ gf_nfs_mt_arr);
+ if (!aclentry) {
+ goto rpcerr;
+ }
+ daclentry = GF_CALLOC (NFS_ACL_MAX_ENTRIES, sizeof(*daclentry),
+ gf_nfs_mt_arr);
+ if (!daclentry) {
+ goto rpcerr;
+ }
+
+ acl3_validate_nfs3_state (req, nfs3, stat, rpcerr, ret);
+ nfs = nfs_state (nfs3->nfsx);
+ memset (&setaclargs, 0, sizeof (setaclargs));
+ memset (&setaclreply, 0, sizeof (setaclreply));
+ memset (&fh, 0, sizeof (fh));
+ setaclargs.fh.n_bytes = (char *)&fh;
+ setaclargs.aclentry.aclentry_val = aclentry;
+ setaclargs.daclentry.daclentry_val = daclentry;
+ if (xdr_to_setaclargs(req->msg[0], &setaclargs) <= 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Error decoding args");
+ rpcsvc_request_seterr (req, GARBAGE_ARGS);
+ goto rpcerr;
+ }
+
+ /* Validate ACL mask */
+ if (setaclargs.mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) {
+ stat = NFS3ERR_INVAL;
+ goto acl3err;
+ }
+
+ fhp = &fh;
+ acl3_validate_gluster_fh (fhp, stat, acl3err);
+ acl3_map_fh_to_volume (nfs->nfs3state, fhp, req, vol, stat, acl3err);
+ acl3_volume_started_check (nfs3, vol, ret, rpcerr);
+ acl3_handle_call_state_init (nfs->nfs3state, cs, req,
+ vol, stat, acl3err);
+
+ cs->vol = vol;
+ cs->aclcount = setaclargs.aclcount;
+ cs->daclcount = setaclargs.daclcount;
+
+ /* setfacl: NFS USER ACL */
+ aclerrno = acl3_nfs_acl_to_xattr (aclentry,
+ cs->aclxattr,
+ cs->aclcount,
+ !defacl);
+ if (aclerrno < 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Failed to set USER ACL");
+ stat = nfs3_errno_to_nfsstat3 (-aclerrno);
+ goto acl3err;
+ }
+
+ /* setfacl: NFS DEFAULT ACL */
+ aclerrno = acl3_nfs_acl_to_xattr (daclentry,
+ cs->daclxattr,
+ cs->daclcount,
+ defacl);
+ if (aclerrno < 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Failed to set DEFAULT ACL");
+ stat = nfs3_errno_to_nfsstat3 (-aclerrno);
+ goto acl3err;
+ }
+
+ ret = nfs3_fh_resolve_and_resume (cs, fhp, NULL, acl3_setacl_resume);
+ stat = nfs3_errno_to_nfsstat3 (-ret);
+
+acl3err:
+ if (ret < 0) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "unable to resolve and resume");
+ setaclreply.status = stat;
+ acl3_setacl_reply (req, &setaclreply);
+ nfs3_call_state_wipe (cs);
+ GF_FREE(aclentry);
+ GF_FREE(daclentry);
+ return 0;
+ }
+
+rpcerr:
+ if (ret < 0)
+ nfs3_call_state_wipe (cs);
+ if (aclentry)
+ GF_FREE (aclentry);
+ if (daclentry)
+ GF_FREE (daclentry);
+ return ret;
+}
+
+
+
+rpcsvc_actor_t acl3svc_actors[ACL3_PROC_COUNT] = {
+ {"NULL", ACL3_NULL, acl3svc_null, NULL, 0},
+ {"GETACL", ACL3_GETACL, acl3svc_getacl, NULL, 0},
+ {"SETACL", ACL3_SETACL, acl3svc_setacl, NULL, 0},
+};
+
+rpcsvc_program_t acl3prog = {
+ .progname = "ACL3",
+ .prognum = ACL_PROGRAM,
+ .progver = ACLV3_VERSION,
+ .progport = GF_NFS3_PORT,
+ .actors = acl3svc_actors,
+ .numactors = ACL3_PROC_COUNT,
+ .min_auth = AUTH_NULL,
+};
+
+rpcsvc_program_t *
+acl3svc_init(xlator_t *nfsx)
+{
+ struct nfs3_state *ns = NULL;
+ struct nfs_state *nfs = NULL;
+ dict_t *options = NULL;
+ int ret = -1;
+ char *portstr = NULL;
+ static gf_boolean_t acl3_inited = _gf_false;
+
+ /* Already inited */
+ if (acl3_inited)
+ return &acl3prog;
+
+ nfs = (struct nfs_state*)nfsx->private;
+
+ ns = nfs->nfs3state;
+ if (!ns) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "ACL3 init failed");
+ goto err;
+ }
+ acl3prog.private = ns;
+
+ options = dict_new ();
+
+ ret = gf_asprintf (&portstr, "%d", GF_ACL3_PORT);
+ if (ret == -1)
+ goto err;
+
+ ret = dict_set_dynstr (options, "transport.socket.listen-port",
+ portstr);
+ if (ret == -1)
+ goto err;
+ ret = dict_set_str (options, "transport-type", "socket");
+ if (ret == -1) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "dict_set_str error");
+ goto err;
+ }
+
+ if (nfs->allow_insecure) {
+ ret = dict_set_str (options, "rpc-auth-allow-insecure", "on");
+ if (ret == -1) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "dict_set_str error");
+ goto err;
+ }
+ ret = dict_set_str (options, "rpc-auth.ports.insecure", "on");
+ if (ret == -1) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "dict_set_str error");
+ goto err;
+ }
+ }
+
+ ret = dict_set_str (options, "transport.address-family", "inet");
+ if (ret == -1) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "dict_set_str error");
+ goto err;
+ }
+
+ ret = rpcsvc_create_listeners (nfs->rpcsvc, options, "ACL");
+ if (ret == -1) {
+ gf_log (GF_ACL, GF_LOG_ERROR, "Unable to create listeners");
+ dict_unref (options);
+ goto err;
+ }
+
+ acl3_inited = _gf_true;
+ return &acl3prog;
+err:
+ return NULL;
+}
+
+static int
+acl3_nfs_acl_to_xattr (aclentry *ace, /* ACL entries to be read */
+ void *xattrbuf, /* XATTR buf to be populated */
+ int aclcount, /* No of ACLs to be read */
+ int defacl) /* 1 if DEFAULT ACL */
+{
+ int idx = 0;
+ posix_acl_xattr_header *xheader = NULL;
+ posix_acl_xattr_entry *xentry = NULL;
+
+ if ((!ace) || (!xattrbuf))
+ return (-EINVAL);
+
+ /* ACL count is ZERO, nothing to do */
+ if (!aclcount)
+ return (0);
+
+ if ((aclcount < 0) || (aclcount > NFS_ACL_MAX_ENTRIES))
+ return (-EINVAL);
+
+ xheader = (posix_acl_xattr_header *) (xattrbuf);
+ xentry = (posix_acl_xattr_entry *) (xheader + 1);
+
+ /*
+ * For "default ACL", NFSv3 handles the 'type' differently
+ * i.e. by logical OR'ing 'type' with NFS_ACL_DEFAULT.
+ * Which the backend File system does not understand and
+ * that needs to be masked OFF.
+ */
+ xheader->version = POSIX_ACL_XATTR_VERSION;
+
+ for (idx = 0; idx < aclcount; idx++) {
+ xentry->tag = ace->type;
+ if (defacl)
+ xentry->tag &= ~NFS_ACL_DEFAULT;
+ xentry->perm = ace->perm;
+
+ switch (xentry->tag) {
+ case POSIX_ACL_USER:
+ case POSIX_ACL_GROUP:
+ if (xentry->perm & ~S_IRWXO)
+ return (-EINVAL);
+ xentry->id = ace->uid;
+ break;
+ case POSIX_ACL_USER_OBJ:
+ case POSIX_ACL_GROUP_OBJ:
+ case POSIX_ACL_OTHER:
+ if (xentry->perm & ~S_IRWXO)
+ return (-EINVAL);
+ xentry->id = POSIX_ACL_UNDEFINED_ID;
+ break;
+ case POSIX_ACL_MASK:
+ /* Solaris sometimes sets additional bits in
+ * the mask.
+ */
+ xentry->perm &= S_IRWXO;
+ xentry->id = POSIX_ACL_UNDEFINED_ID;
+ break;
+ default:
+ return (-EINVAL);
+ }
+
+ xentry++;
+ ace++;
+ }
+
+ /* SUCCESS */
+ return (0);
+}
+
+static int
+acl3_nfs_acl_from_xattr (aclentry *ace, /* ACL entries to be filled */
+ void *xattrbuf, /* XATTR buf to be read */
+ int bufsize, /* Size of XATTR buffer */
+ int defacl) /* 1 if DEFAULT ACL */
+{
+ int idx = 0;
+ ssize_t aclcount = 0;
+ posix_acl_xattr_header *xheader = NULL;
+ posix_acl_xattr_entry *xentry = NULL;
+
+ if ((!xattrbuf) || (!ace))
+ return (-EINVAL);
+
+ aclcount = posix_acl_xattr_count (bufsize);
+ if ((aclcount < 0) || (aclcount > NFS_ACL_MAX_ENTRIES))
+ return (-EINVAL);
+
+ xheader = (posix_acl_xattr_header *) (xattrbuf);
+ xentry = (posix_acl_xattr_entry *) (xheader + 1);
+
+ /* Check for supported POSIX ACL xattr version */
+ if (xheader->version != POSIX_ACL_XATTR_VERSION)
+ return (-ENOSYS);
+
+ for (idx = 0; idx < (int)aclcount; idx++) {
+ ace->type = xentry->tag;
+ if (defacl) {
+ /*
+ * SET the NFS_ACL_DEFAULT flag for default
+ * ACL which was masked OFF during setfacl().
+ */
+ ace->type |= NFS_ACL_DEFAULT;
+ }
+ ace->perm = (xentry->perm & S_IRWXO);
+
+ switch (xentry->tag) {
+ case POSIX_ACL_USER:
+ case POSIX_ACL_GROUP:
+ ace->uid = xentry->id;
+ break;
+ case POSIX_ACL_USER_OBJ:
+ case POSIX_ACL_GROUP_OBJ:
+ case POSIX_ACL_MASK:
+ case POSIX_ACL_OTHER:
+ ace->uid = POSIX_ACL_UNDEFINED_ID;
+ break;
+ default:
+ return (-EINVAL);
+ }
+
+
+ xentry++;
+ ace++;
+ }
+
+ /* SUCCESS: ACL count */
+ return aclcount;
+}
diff --git a/xlators/nfs/server/src/acl3.h b/xlators/nfs/server/src/acl3.h
new file mode 100644
index 000000000..220bc9e78
--- /dev/null
+++ b/xlators/nfs/server/src/acl3.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2012 Red Hat, Inc. <http://www.redhat.com>
+ * This file is part of GlusterFS.
+ *
+ * This file is licensed to you under your choice of the GNU Lesser
+ * General Public License, version 3 or any later version (LGPLv3 or
+ * later), or the GNU General Public License, version 2 (GPLv2), in all
+ * cases as published by the Free Software Foundation.
+ */
+
+#ifndef _ACL3_H
+#define _ACL3_H
+
+#include "glusterfs-acl.h"
+
+#define ACL3_NULL 0
+#define ACL3_GETACL 1
+#define ACL3_SETACL 2
+#define ACL3_PROC_COUNT 3
+
+#define GF_ACL3_PORT 38469
+#define GF_ACL GF_NFS"-ACL"
+
+/* Flags for the getacl/setacl mode */
+#define NFS_ACL 0x0001
+#define NFS_ACLCNT 0x0002
+#define NFS_DFACL 0x0004
+#define NFS_DFACLCNT 0x0008
+
+/*
+ * NFSv3, identifies the default ACL by NFS_ACL_DEFAULT. Gluster
+ * NFS needs to mask it OFF before sending it upto POSIX layer
+ * or File system layer.
+ */
+#define NFS_ACL_DEFAULT 0x1000
+
+#define NFS_ACL_MAX_ENTRIES 1024
+
+rpcsvc_program_t *
+acl3svc_init(xlator_t *nfsx);
+
+#endif
diff --git a/xlators/nfs/server/src/mount3.c b/xlators/nfs/server/src/mount3.c
index efd74aa58..47ff3845e 100644
--- a/xlators/nfs/server/src/mount3.c
+++ b/xlators/nfs/server/src/mount3.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -39,26 +30,71 @@
#include "nfs-mem-types.h"
#include "nfs.h"
#include "common-utils.h"
-
+#include "store.h"
#include <errno.h>
#include <sys/socket.h>
#include <sys/uio.h>
+
+#define IPv4_ADDR_SIZE 32
+
+/* Macro to typecast the parameter to struct sockaddr_in
+ */
+#define SA(addr) ((struct sockaddr_in*)(addr))
+
+/* Macro will mask the ip address with netmask.
+ */
+#define MASKED_IP(ipv4addr, netmask) \
+ (ntohl(SA(ipv4addr)->sin_addr.s_addr) & (netmask))
+
+/* Macro will compare two IP address after applying the mask
+ */
+#define COMPARE_IPv4_ADDRS(ip1, ip2, netmask) \
+ ((MASKED_IP(ip1, netmask)) == (MASKED_IP(ip2, netmask)))
+
+/* This macro will assist in freeing up entire link list
+ * of host_auth_spec structure.
+ */
+#define FREE_HOSTSPEC(exp) do { \
+ struct host_auth_spec *host= exp->hostspec; \
+ while (NULL != host){ \
+ struct host_auth_spec* temp = host; \
+ host = host->next; \
+ if (NULL != temp->host_addr) { \
+ GF_FREE (temp->host_addr); \
+ } \
+ GF_FREE (temp); \
+ } \
+ exp->hostspec = NULL; \
+ } while (0)
+
typedef ssize_t (*mnt3_serializer) (struct iovec outmsg, void *args);
extern void *
mount3udp_thread (void *argv);
+static inline void
+mnt3_export_free (struct mnt3_export *exp)
+{
+ if (!exp)
+ return;
+
+ if (exp->exptype == MNT3_EXPTYPE_DIR)
+ FREE_HOSTSPEC (exp);
+ GF_FREE (exp->expname);
+ GF_FREE (exp);
+}
/* Generic reply function for MOUNTv3 specific replies. */
int
mnt3svc_submit_reply (rpcsvc_request_t *req, void *arg, mnt3_serializer sfunc)
{
- struct iovec outmsg = {0, };
- struct iobuf *iob = NULL;
- struct mount3_state *ms = NULL;
- int ret = -1;
+ struct iovec outmsg = {0, };
+ struct iobuf *iob = NULL;
+ struct mount3_state *ms = NULL;
+ int ret = -1;
+ ssize_t msglen = 0;
struct iobref *iobref = NULL;
if (!req)
@@ -84,7 +120,12 @@ mnt3svc_submit_reply (rpcsvc_request_t *req, void *arg, mnt3_serializer sfunc)
/* Use the given serializer to translate the give C structure in arg
* to XDR format which will be written into the buffer in outmsg.
*/
- outmsg.iov_len = sfunc (outmsg, arg);
+ msglen = sfunc (outmsg, arg);
+ if (msglen < 0) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Failed to encode message");
+ goto ret;
+ }
+ outmsg.iov_len = msglen;
iobref = iobref_new ();
if (iobref == NULL) {
@@ -92,12 +133,14 @@ mnt3svc_submit_reply (rpcsvc_request_t *req, void *arg, mnt3_serializer sfunc)
goto ret;
}
- iobref_add (iobref, iob);
+ ret = iobref_add (iobref, iob);
+ if (ret) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Failed to add iob to iobref");
+ goto ret;
+ }
/* Then, submit the message for transmission. */
ret = rpcsvc_submit_message (req, &outmsg, 1, NULL, 0, iobref);
- iobuf_unref (iob);
- iobref_unref (iobref);
if (ret == -1) {
gf_log (GF_MNT, GF_LOG_ERROR, "Reply submission failed");
goto ret;
@@ -105,6 +148,11 @@ mnt3svc_submit_reply (rpcsvc_request_t *req, void *arg, mnt3_serializer sfunc)
ret = 0;
ret:
+ if (NULL != iob)
+ iobuf_unref (iob);
+ if (NULL != iobref)
+ iobref_unref (iobref);
+
return ret;
}
@@ -188,14 +236,278 @@ mnt3svc_set_mountres3 (mountstat3 stat, struct nfs3_fh *fh, int *authflavor,
return res;
}
+/* Read the rmtab from the store_handle and append (or not) the entries to the
+ * mountlist.
+ *
+ * Requires the store_handle to be locked.
+ */
+static int
+__mount_read_rmtab (gf_store_handle_t *sh, struct list_head *mountlist,
+ gf_boolean_t append)
+{
+ int ret = 0;
+ unsigned int idx = 0;
+ struct mountentry *me = NULL, *tmp = NULL;
+ /* me->hostname is a char[MNTPATHLEN] */
+ char key[MNTPATHLEN + 11];
+
+ GF_ASSERT (sh && mountlist);
+
+ if (!gf_store_locked_local (sh)) {
+ gf_log (GF_MNT, GF_LOG_WARNING, "Not reading unlocked %s",
+ sh->path);
+ return -1;
+ }
+
+ if (!append) {
+ list_for_each_entry_safe (me, tmp, mountlist, mlist) {
+ list_del (&me->mlist);
+ GF_FREE (me);
+ }
+ me = NULL;
+ }
+
+ for (;;) {
+ char *value = NULL;
+
+ if (me && append) {
+ /* do not add duplicates */
+ list_for_each_entry (tmp, mountlist, mlist) {
+ if (!strcmp(tmp->hostname, me->hostname) &&
+ !strcmp(tmp->exname, me->exname)) {
+ GF_FREE (me);
+ goto dont_add;
+ }
+ }
+ list_add_tail (&me->mlist, mountlist);
+ } else if (me) {
+ list_add_tail (&me->mlist, mountlist);
+ }
+
+dont_add:
+ me = GF_CALLOC (1, sizeof (*me), gf_nfs_mt_mountentry);
+ if (!me) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Out of memory");
+ ret = -1;
+ goto out;
+ }
+
+ INIT_LIST_HEAD (&me->mlist);
+
+ snprintf (key, 9 + MNTPATHLEN, "hostname-%d", idx);
+ ret = gf_store_retrieve_value (sh, key, &value);
+ if (ret)
+ break;
+ strncpy (me->hostname, value, MNTPATHLEN);
+ GF_FREE (value);
+
+ snprintf (key, 11 + MNTPATHLEN, "mountpoint-%d", idx);
+ ret = gf_store_retrieve_value (sh, key, &value);
+ if (ret)
+ break;
+ strncpy (me->exname, value, MNTPATHLEN);
+ GF_FREE (value);
+
+ idx++;
+ gf_log (GF_MNT, GF_LOG_TRACE, "Read entries %s:%s", me->hostname, me->exname);
+ }
+ gf_log (GF_MNT, GF_LOG_DEBUG, "Read %d entries from '%s'", idx, sh->path);
+ GF_FREE (me);
+out:
+ return ret;
+}
+
+/* Overwrite the contents of the rwtab with te in-memory client list.
+ * Fail gracefully if the stora_handle is not locked.
+ */
+static void
+__mount_rewrite_rmtab(struct mount3_state *ms, gf_store_handle_t *sh)
+{
+ struct mountentry *me = NULL;
+ char key[16];
+ int fd, ret;
+ unsigned int idx = 0;
+
+ if (!gf_store_locked_local (sh)) {
+ gf_log (GF_MNT, GF_LOG_WARNING, "Not modifying unlocked %s",
+ sh->path);
+ return;
+ }
+
+ fd = gf_store_mkstemp (sh);
+ if (fd == -1) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Failed to open %s", sh->path);
+ return;
+ }
+
+ list_for_each_entry (me, &ms->mountlist, mlist) {
+ snprintf (key, 16, "hostname-%d", idx);
+ ret = gf_store_save_value (fd, key, me->hostname);
+ if (ret)
+ goto fail;
+
+ snprintf (key, 16, "mountpoint-%d", idx);
+ ret = gf_store_save_value (fd, key, me->exname);
+ if (ret)
+ goto fail;
+
+ idx++;
+ }
+
+ gf_log (GF_MNT, GF_LOG_DEBUG, "Updated rmtab with %d entries", idx);
+
+ close (fd);
+ if (gf_store_rename_tmppath (sh))
+ gf_log (GF_MNT, GF_LOG_ERROR, "Failed to overwrite rwtab %s",
+ sh->path);
+
+ return;
+
+fail:
+ gf_log (GF_MNT, GF_LOG_ERROR, "Failed to update %s", sh->path);
+ close (fd);
+ gf_store_unlink_tmppath (sh);
+}
+
+/* Read the rmtab into a clean ms->mountlist.
+ */
+static void
+mount_read_rmtab (struct mount3_state *ms)
+{
+ gf_store_handle_t *sh = NULL;
+ struct nfs_state *nfs = NULL;
+ int ret;
+
+ nfs = (struct nfs_state *)ms->nfsx->private;
+
+ ret = gf_store_handle_new (nfs->rmtab, &sh);
+ if (ret) {
+ gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'",
+ nfs->rmtab);
+ return;
+ }
+
+ if (gf_store_lock (sh)) {
+ gf_log (GF_MNT, GF_LOG_WARNING, "Failed to lock '%s'",
+ nfs->rmtab);
+ goto out;
+ }
+
+ __mount_read_rmtab (sh, &ms->mountlist, _gf_false);
+ gf_store_unlock (sh);
+
+out:
+ gf_store_handle_destroy (sh);
+}
+
+/* Write the ms->mountlist to the rmtab.
+ *
+ * The rmtab could be empty, or it can exists and have been updated by a
+ * different storage server without our knowing.
+ *
+ * 1. takes the store_handle lock on the current rmtab
+ * - blocks if an other storage server rewrites the rmtab at the same time
+ * 2. [if new_rmtab] takes the store_handle lock on the new rmtab
+ * 3. reads/merges the entries from the current rmtab
+ * 4. [if new_rmtab] reads/merges the entries from the new rmtab
+ * 5. [if new_rmtab] writes the new rmtab
+ * 6. [if not new_rmtab] writes the current rmtab
+ * 7 [if new_rmtab] replaces nfs->rmtab to point to the new location
+ * 8. [if new_rmtab] releases the store_handle lock of the new rmtab
+ * 9. releases the store_handle lock of the old rmtab
+ */
+void
+mount_rewrite_rmtab (struct mount3_state *ms, char *new_rmtab)
+{
+ gf_store_handle_t *sh = NULL, *nsh = NULL;
+ struct nfs_state *nfs = NULL;
+ int ret;
+ char *rmtab = NULL;
+
+ nfs = (struct nfs_state *)ms->nfsx->private;
+
+ ret = gf_store_handle_new (nfs->rmtab, &sh);
+ if (ret) {
+ gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'",
+ nfs->rmtab);
+ return;
+ }
+
+ if (gf_store_lock (sh)) {
+ gf_log (GF_MNT, GF_LOG_WARNING, "Not rewriting '%s'",
+ nfs->rmtab);
+ goto free_sh;
+ }
+
+ if (new_rmtab) {
+ ret = gf_store_handle_new (new_rmtab, &nsh);
+ if (ret) {
+ gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'",
+ new_rmtab);
+ goto unlock_sh;
+ }
+
+ if (gf_store_lock (nsh)) {
+ gf_log (GF_MNT, GF_LOG_WARNING, "Not rewriting '%s'",
+ new_rmtab);
+ goto free_nsh;
+ }
+ }
+
+ /* always read the currently used rmtab */
+ __mount_read_rmtab (sh, &ms->mountlist, _gf_true);
+
+ if (new_rmtab) {
+ /* read the new rmtab and write changes to the new location */
+ __mount_read_rmtab (nsh, &ms->mountlist, _gf_true);
+ __mount_rewrite_rmtab (ms, nsh);
+
+ /* replace the nfs->rmtab reference to the new rmtab */
+ rmtab = gf_strdup(new_rmtab);
+ if (rmtab == NULL) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Out of memory, keeping "
+ "%s as rmtab", nfs->rmtab);
+ } else {
+ GF_FREE (nfs->rmtab);
+ nfs->rmtab = rmtab;
+ }
+
+ gf_store_unlock (nsh);
+ } else {
+ /* rewrite the current (unchanged location) rmtab */
+ __mount_rewrite_rmtab (ms, sh);
+ }
+
+free_nsh:
+ if (new_rmtab)
+ gf_store_handle_destroy (nsh);
+unlock_sh:
+ gf_store_unlock (sh);
+free_sh:
+ gf_store_handle_destroy (sh);
+}
+/* Add a new NFS-client to the ms->mountlist and update the rmtab if we can.
+ *
+ * A NFS-client will only be removed from the ms->mountlist in case the
+ * NFS-client sends a unmount request. It is possible that a NFS-client
+ * crashed/rebooted had network loss or something else prevented the NFS-client
+ * to unmount cleanly. In this case, a duplicate entry would be added to the
+ * ms->mountlist, which is wrong and we should prevent.
+ *
+ * It is fully acceptible that the ms->mountlist is not 100% correct, this is a
+ * common issue for all(?) NFS-servers.
+ */
int
mnt3svc_update_mountlist (struct mount3_state *ms, rpcsvc_request_t *req,
char *expname)
{
struct mountentry *me = NULL;
+ struct mountentry *cur = NULL;
int ret = -1;
char *colon = NULL;
+ struct nfs_state *nfs = NULL;
+ gf_store_handle_t *sh = NULL;
if ((!ms) || (!req) || (!expname))
return -1;
@@ -205,14 +517,24 @@ mnt3svc_update_mountlist (struct mount3_state *ms, rpcsvc_request_t *req,
if (!me)
return -1;
- strcpy (me->exname, expname);
+ nfs = (struct nfs_state *)ms->nfsx->private;
+
+ ret = gf_store_handle_new (nfs->rmtab, &sh);
+ if (ret) {
+ gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'",
+ nfs->rmtab);
+ goto free_err;
+ }
+
+ strncpy (me->exname, expname, MNTPATHLEN);
+
INIT_LIST_HEAD (&me->mlist);
/* Must get the IP or hostname of the client so we
* can map it into the mount entry.
*/
ret = rpcsvc_transport_peername (req->trans, me->hostname, MNTPATHLEN);
if (ret == -1)
- goto free_err;
+ goto free_err2;
colon = strrchr (me->hostname, ':');
if (colon) {
@@ -220,10 +542,37 @@ mnt3svc_update_mountlist (struct mount3_state *ms, rpcsvc_request_t *req,
}
LOCK (&ms->mountlock);
{
+ /* in case locking fails, we just don't write the rmtab */
+ if (gf_store_lock (sh)) {
+ gf_log (GF_MNT, GF_LOG_WARNING, "Failed to lock '%s'"
+ ", changes will not be written", nfs->rmtab);
+ } else {
+ __mount_read_rmtab (sh, &ms->mountlist, _gf_false);
+ }
+
+ /* do not add duplicates */
+ list_for_each_entry (cur, &ms->mountlist, mlist) {
+ if (!strcmp(cur->hostname, me->hostname) &&
+ !strcmp(cur->exname, me->exname)) {
+ GF_FREE (me);
+ goto dont_add;
+ }
+ }
list_add_tail (&me->mlist, &ms->mountlist);
+
+ /* only write the rmtab in case it was locked */
+ if (gf_store_locked_local (sh))
+ __mount_rewrite_rmtab (ms, sh);
}
+dont_add:
+ if (gf_store_locked_local (sh))
+ gf_store_unlock (sh);
+
UNLOCK (&ms->mountlock);
+free_err2:
+ gf_store_handle_destroy (sh);
+
free_err:
if (ret == -1)
GF_FREE (me);
@@ -242,6 +591,7 @@ __mnt3_get_volume_id (struct mount3_state *ms, xlator_t *mntxl,
if ((!ms) || (!mntxl))
return ret;
+ LOCK (&ms->mountlock);
list_for_each_entry (exp, &ms->exportlist, explist) {
if (exp->vol == mntxl) {
uuid_copy (volumeid, exp->volumeid);
@@ -251,6 +601,7 @@ __mnt3_get_volume_id (struct mount3_state *ms, xlator_t *mntxl,
}
out:
+ UNLOCK (&ms->mountlock);
return ret;
}
@@ -271,7 +622,7 @@ mnt3svc_lookup_mount_cbk (call_frame_t *frame, void *cookie,
rpcsvc_t *svc = NULL;
xlator_t *mntxl = NULL;
uuid_t volumeid = {0, };
- char fhstr[1024];
+ char fhstr[1024], *path = NULL;
req = (rpcsvc_request_t *)frame->local;
@@ -293,7 +644,15 @@ mnt3svc_lookup_mount_cbk (call_frame_t *frame, void *cookie,
if (status != MNT3_OK)
goto xmit_res;
- mnt3svc_update_mountlist (ms, req, mntxl->name);
+ path = GF_CALLOC (PATH_MAX, sizeof (char), gf_nfs_mt_char);
+ if (!path) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Out of memory");
+ goto xmit_res;
+ }
+
+ snprintf (path, PATH_MAX, "/%s", mntxl->name);
+ mnt3svc_update_mountlist (ms, req, path);
+ GF_FREE (path);
if (gf_nfs_dvm_off (nfs_state (ms->nfsx))) {
fh = nfs3_fh_build_indexed_root_fh (ms->nfsx->children, mntxl);
goto xmit_res;
@@ -303,7 +662,7 @@ mnt3svc_lookup_mount_cbk (call_frame_t *frame, void *cookie,
fh = nfs3_fh_build_uuid_root_fh (volumeid);
xmit_res:
- nfs3_fh_to_str (&fh, fhstr);
+ nfs3_fh_to_str (&fh, fhstr, sizeof (fhstr));
gf_log (GF_MNT, GF_LOG_DEBUG, "MNT reply: fh %s, status: %d", fhstr,
status);
if (op_ret == 0) {
@@ -467,8 +826,8 @@ mnt3_resolve_state_wipe (mnt3_resolve_t *mres)
/* Sets up the component argument to contain the next component in the path and
* sets up path as an absolute path starting from the next component.
*/
-char *
-__setup_next_component (char *path, char *component)
+static char *
+setup_next_component (char *path, size_t plen, char *component, size_t clen)
{
char *comp = NULL;
char *nextcomp = NULL;
@@ -476,7 +835,7 @@ __setup_next_component (char *path, char *component)
if ((!path) || (!component))
return NULL;
- strcpy (component, path);
+ strncpy (component, path, clen);
comp = index (component, (int)'/');
if (!comp)
goto err;
@@ -484,7 +843,7 @@ __setup_next_component (char *path, char *component)
comp++;
nextcomp = index (comp, (int)'/');
if (nextcomp) {
- strcpy (path, nextcomp);
+ strncpy (path, nextcomp, plen);
*nextcomp = '\0';
} else
path[0] = '\0';
@@ -514,7 +873,9 @@ __mnt3_resolve_export_subdir_comp (mnt3_resolve_t *mres)
if (!mres)
return ret;
- nextcomp = __setup_next_component (mres->remainingdir, dupsubdir);
+ nextcomp = setup_next_component (mres->remainingdir,
+ sizeof (mres->remainingdir),
+ dupsubdir, sizeof (dupsubdir));
if (!nextcomp)
goto err;
@@ -526,7 +887,7 @@ __mnt3_resolve_export_subdir_comp (mnt3_resolve_t *mres)
if ((ret < 0) && (ret != -2)) {
gf_log (GF_MNT, GF_LOG_ERROR, "Failed to resolve and create "
"inode: parent gfid %s, entry %s",
- uuid_utoa (mres->resolveloc.inode->gfid), nextcomp);
+ uuid_utoa (gfid), nextcomp);
ret = -EFAULT;
goto err;
}
@@ -554,6 +915,7 @@ mnt3_resolve_subdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
rpcsvc_t *svc = NULL;
mountres3 res = {0, };
xlator_t *mntxl = NULL;
+ char *path = NULL;
mres = frame->local;
mntxl = (xlator_t *)cookie;
@@ -571,15 +933,23 @@ mnt3_resolve_subdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
if (strlen (mres->remainingdir) <= 0) {
op_ret = -1;
mntstat = MNT3_OK;
+ path = GF_CALLOC (PATH_MAX, sizeof (char), gf_nfs_mt_char);
+ if (!path) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation "
+ "failed");
+ goto err;
+ }
+ snprintf (path, PATH_MAX, "/%s%s", mres->exp->vol->name,
+ mres->resolveloc.path);
mnt3svc_update_mountlist (mres->mstate, mres->req,
- mres->exp->expname);
- goto err;
+ path);
+ GF_FREE (path);
+ } else {
+ mres->parentfh = fh;
+ op_ret = __mnt3_resolve_export_subdir_comp (mres);
+ if (op_ret < 0)
+ mntstat = mnt3svc_errno_to_mnterr (-op_ret);
}
-
- mres->parentfh = fh;
- op_ret = __mnt3_resolve_export_subdir_comp (mres);
- if (op_ret < 0)
- mntstat = mnt3svc_errno_to_mnterr (-op_ret);
err:
if (op_ret == -1) {
gf_log (GF_MNT, GF_LOG_DEBUG, "Mount reply status: %d",
@@ -622,7 +992,9 @@ __mnt3_resolve_subdir (mnt3_resolve_t *mres)
if (!mres)
return ret;
- firstcomp = __setup_next_component (mres->remainingdir, dupsubdir);
+ firstcomp = setup_next_component (mres->remainingdir,
+ sizeof (mres->remainingdir),
+ dupsubdir, sizeof (dupsubdir));
if (!firstcomp)
goto err;
@@ -645,6 +1017,132 @@ err:
}
+/**
+ * This function will verify if the client is allowed to mount
+ * the directory or not. Client's IP address will be compared with
+ * allowed IP list or range present in mnt3_export structure.
+ *
+ * @param req - RPC request. This structure contains client's IP address.
+ * @param export - mnt3_export structure. Contains allowed IP list/range.
+ *
+ * @return 0 - on Success and -EACCES on failure.
+ */
+int
+mnt3_verify_auth (rpcsvc_request_t *req, struct mnt3_export *export)
+{
+ int retvalue = -EACCES;
+ int ret = 0;
+ int shiftbits = 0;
+ uint32_t ipv4netmask = 0;
+ uint32_t routingprefix = 0;
+ struct host_auth_spec *host = NULL;
+ struct sockaddr_in *client_addr = NULL;
+ struct sockaddr_in *allowed_addr = NULL;
+ struct addrinfo *allowed_addrinfo = NULL;
+
+ /* Sanity check */
+ if ((NULL == req) ||
+ (NULL == req->trans) ||
+ (NULL == export) ||
+ (NULL == export->hostspec)) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Invalid argument");
+ return retvalue;
+ }
+
+ host = export->hostspec;
+
+
+ /* Client's IP address. */
+ client_addr = (struct sockaddr_in *)(&(req->trans->peerinfo.sockaddr));
+
+ /* Try to see if the client IP matches the allowed IP list.*/
+ while (NULL != host){
+ GF_ASSERT (host->host_addr);
+
+ if (NULL != allowed_addrinfo) {
+ freeaddrinfo (allowed_addrinfo);
+ allowed_addrinfo = NULL;
+ }
+
+ /* Get the addrinfo for the allowed host (host_addr). */
+ ret = getaddrinfo (host->host_addr,
+ NULL,
+ NULL,
+ &allowed_addrinfo);
+ if (0 != ret){
+ gf_log (GF_MNT, GF_LOG_ERROR, "getaddrinfo: %s\n",
+ gai_strerror (ret));
+ host = host->next;
+
+ /* Failed to get IP addrinfo. Continue to check other
+ * allowed IPs in the list.
+ */
+ continue;
+ }
+
+ allowed_addr = (struct sockaddr_in *)(allowed_addrinfo->ai_addr);
+
+ if (NULL == allowed_addr) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Invalid structure");
+ break;
+ }
+
+ if (AF_INET == allowed_addr->sin_family){
+ if (IPv4_ADDR_SIZE < host->routeprefix) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "invalid IP "
+ "configured for export-dir AUTH");
+ host = host->next;
+ continue;
+ }
+
+ /* -1 means no route prefix is provided. In this case
+ * the IP should be an exact match. Which is same as
+ * providing a route prefix of IPv4_ADDR_SIZE.
+ */
+ if (-1 == host->routeprefix) {
+ routingprefix = IPv4_ADDR_SIZE;
+ } else {
+ routingprefix = host->routeprefix;
+ }
+
+ /* Create a mask from the routing prefix. User provided
+ * CIDR address is split into IP address (host_addr) and
+ * routing prefix (routeprefix). This CIDR address may
+ * denote a single, distinct interface address or the
+ * beginning address of an entire network.
+ *
+ * e.g. the IPv4 block 192.168.100.0/24 represents the
+ * 256 IPv4 addresses from 192.168.100.0 to
+ * 192.168.100.255.
+ * Therefore to check if an IP matches 192.168.100.0/24
+ * we should mask the IP with FFFFFF00 and compare it
+ * with host address part of CIDR.
+ */
+ shiftbits = IPv4_ADDR_SIZE - routingprefix;
+ ipv4netmask = 0xFFFFFFFFUL << shiftbits;
+
+ /* Mask both the IPs and then check if they match
+ * or not. */
+ if (COMPARE_IPv4_ADDRS (allowed_addr,
+ client_addr,
+ ipv4netmask)){
+ retvalue = 0;
+ break;
+ }
+ }
+
+ /* Client IP didn't match the allowed IP.
+ * Check with the next allowed IP.*/
+ host = host->next;
+ }
+
+ if (NULL != allowed_addrinfo) {
+ freeaddrinfo (allowed_addrinfo);
+ }
+
+ return retvalue;
+}
+
int
mnt3_resolve_subdir (rpcsvc_request_t *req, struct mount3_state *ms,
struct mnt3_export *exp, char *subdir)
@@ -656,6 +1154,16 @@ mnt3_resolve_subdir (rpcsvc_request_t *req, struct mount3_state *ms,
if ((!req) || (!ms) || (!exp) || (!subdir))
return ret;
+ /* Need to check AUTH */
+ if (NULL != exp->hostspec) {
+ ret = mnt3_verify_auth (req, exp);
+ if (0 != ret) {
+ gf_log (GF_MNT,GF_LOG_ERROR,
+ "AUTH verification failed");
+ return ret;
+ }
+ }
+
mres = GF_CALLOC (1, sizeof (mnt3_resolve_t), gf_nfs_mt_mnt3_resolve);
if (!mres) {
gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed");
@@ -665,7 +1173,7 @@ mnt3_resolve_subdir (rpcsvc_request_t *req, struct mount3_state *ms,
mres->exp = exp;
mres->mstate = ms;
mres->req = req;
- strcpy (mres->remainingdir, subdir);
+ strncpy (mres->remainingdir, subdir, MNTPATHLEN);
if (gf_nfs_dvm_off (nfs_state (ms->nfsx)))
pfh = nfs3_fh_build_indexed_root_fh (mres->mstate->nfsx->children, mres->exp->vol);
else
@@ -740,6 +1248,7 @@ mnt3_mntpath_to_export (struct mount3_state *ms, char *dirpath)
if ((!ms) || (!dirpath))
return NULL;
+ LOCK (&ms->mountlock);
list_for_each_entry (exp, &ms->exportlist, explist) {
/* Search for the an exact match with the volume */
@@ -753,6 +1262,7 @@ mnt3_mntpath_to_export (struct mount3_state *ms, char *dirpath)
gf_log (GF_MNT, GF_LOG_DEBUG, "Export not found");
foundexp:
+ UNLOCK (&ms->mountlock);
return found;
}
@@ -781,8 +1291,7 @@ mnt3_check_client_net (struct mount3_state *ms, rpcsvc_request_t *req,
gai_strerror (ret));
}
- ret = rpcsvc_transport_peer_check (svc->options, targetxl->name,
- trans);
+ ret = rpcsvc_auth_check (svc, targetxl->name, trans);
if (ret == RPCSVC_AUTH_REJECT) {
gf_log (GF_MNT, GF_LOG_INFO, "Peer %s not allowed", peer);
goto err;
@@ -809,7 +1318,8 @@ mnt3_parse_dir_exports (rpcsvc_request_t *req, struct mount3_state *ms,
char volname[1024];
struct mnt3_export *exp = NULL;
char *volname_ptr = NULL;
- int ret = -1;
+ int ret = -ENOENT;
+ struct nfs_state *nfs = NULL;
if ((!ms) || (!subdir))
return -1;
@@ -823,10 +1333,26 @@ mnt3_parse_dir_exports (rpcsvc_request_t *req, struct mount3_state *ms,
if (!exp)
goto err;
+ nfs = (struct nfs_state *)ms->nfsx->private;
+ if (!nfs)
+ goto err;
+
+ if (!nfs_subvolume_started (nfs, exp->vol)) {
+ gf_log (GF_MNT, GF_LOG_DEBUG,
+ "Volume %s not started", exp->vol->name);
+ goto err;
+ }
+
+ if (mnt3_check_client_net (ms, req, exp->vol) == RPCSVC_AUTH_REJECT) {
+ gf_log (GF_MNT, GF_LOG_DEBUG, "Client mount not allowed");
+ ret = -EACCES;
+ goto err;
+ }
+
ret = mnt3_resolve_subdir (req, ms, exp, subdir);
if (ret < 0) {
- gf_log (GF_MNT, GF_LOG_ERROR, "Failed to resolve export dir: %s"
- , subdir);
+ gf_log (GF_MNT, GF_LOG_ERROR,
+ "Failed to resolve export dir: %s", subdir);
goto err;
}
@@ -866,10 +1392,6 @@ mnt3_find_export (rpcsvc_request_t *req, char *path, struct mnt3_export **e)
}
ret = mnt3_parse_dir_exports (req, ms, path);
- if (ret == 0) {
- ret = -2;
- goto err;
- }
err:
return ret;
@@ -907,17 +1429,26 @@ mnt3svc_mnt (rpcsvc_request_t *req)
goto rpcerr;
}
- ret = 0;
nfs = (struct nfs_state *)ms->nfsx->private;
gf_log (GF_MNT, GF_LOG_DEBUG, "dirpath: %s", path);
ret = mnt3_find_export (req, path, &exp);
- if (ret == -2) {
- ret = 0;
- goto rpcerr;
- } else if (ret < 0) {
- ret = -1;
- mntstat = MNT3ERR_NOENT;
+ if (ret < 0) {
+ mntstat = mnt3svc_errno_to_mnterr (-ret);
goto mnterr;
+ } else if (!exp) {
+ /*
+ * SPECIAL CASE: exp is NULL if "path" is subdir in
+ * call to mnt3_find_export().
+ *
+ * This is subdir mount, we are already DONE!
+ * nfs_subvolume_started() and mnt3_check_client_net()
+ * validation are done in mnt3_parse_dir_exports()
+ * which is invoked through mnt3_find_export().
+ *
+ * TODO: All mount should happen thorugh mnt3svc_mount()
+ * It needs more clean up.
+ */
+ return (0);
}
if (!nfs_subvolume_started (nfs, exp->vol)) {
@@ -977,6 +1508,9 @@ __build_mountlist (struct mount3_state *ms, int *count)
if ((!ms) || (!count))
return NULL;
+ /* read rmtab, other peers might have updated it */
+ mount_read_rmtab(ms);
+
*count = 0;
gf_log (GF_MNT, GF_LOG_DEBUG, "Building mount list:");
list_for_each_entry (me, &ms->mountlist, mlist) {
@@ -987,6 +1521,8 @@ __build_mountlist (struct mount3_state *ms, int *count)
" failed");
goto free_list;
}
+ if (!first)
+ first = mlist;
mlist->ml_directory = GF_CALLOC (namelen + 2, sizeof (char),
gf_nfs_mt_char);
@@ -996,8 +1532,7 @@ __build_mountlist (struct mount3_state *ms, int *count)
goto free_list;
}
- strcpy (mlist->ml_directory, "/");
- strcat (mlist->ml_directory, me->exname);
+ strcpy (mlist->ml_directory, me->exname);
namelen = strlen (me->hostname);
mlist->ml_hostname = GF_CALLOC (namelen + 2, sizeof (char),
@@ -1018,9 +1553,6 @@ __build_mountlist (struct mount3_state *ms, int *count)
} else
prev = mlist;
- if (!first)
- first = mlist;
-
(*count)++;
}
@@ -1097,67 +1629,71 @@ rpcerr:
int
-__mnt3svc_umount (struct mount3_state *ms, char *dirpath, char *hostname)
+mnt3svc_umount (struct mount3_state *ms, char *dirpath, char *hostname)
{
struct mountentry *me = NULL;
- char *exname = NULL;
int ret = -1;
+ gf_store_handle_t *sh = NULL;
+ struct nfs_state *nfs = NULL;
if ((!ms) || (!dirpath) || (!hostname))
return -1;
- if (list_empty (&ms->mountlist))
- return 0;
-
- if (dirpath[0] == '/')
- exname = dirpath+1;
- else
- exname = dirpath;
+ nfs = (struct nfs_state *)ms->nfsx->private;
- list_for_each_entry (me, &ms->mountlist, mlist) {
- if ((strcmp (me->exname, exname) == 0) &&
- (strcmp (me->hostname, hostname) == 0)) {
- ret = 0;
- break;
- }
+ ret = gf_store_handle_new (nfs->rmtab, &sh);
+ if (ret) {
+ gf_log (GF_MNT, GF_LOG_WARNING, "Failed to open '%s'",
+ nfs->rmtab);
+ return 0;
}
- /* Need this check here because at the end of the search me might still
- * be pointing to the last entry, which may not be the one we're
- * looking for.
- */
- if (ret == -1) {/* Not found in list. */
- gf_log (GF_MNT, GF_LOG_DEBUG, "Export not found");
- goto ret;
+ ret = gf_store_lock (sh);
+ if (ret) {
+ goto out_free;
}
- if (!me)
- goto ret;
+ LOCK (&ms->mountlock);
+ {
+ __mount_read_rmtab (sh, &ms->mountlist, _gf_false);
+ if (list_empty (&ms->mountlist)) {
+ ret = 0;
+ goto out_unlock;
+ }
- gf_log (GF_MNT, GF_LOG_DEBUG, "Unmounting: dir %s, host: %s",
- me->exname, me->hostname);
- list_del (&me->mlist);
- GF_FREE (me);
- ret = 0;
-ret:
- return ret;
-}
+ ret = -1;
+ list_for_each_entry (me, &ms->mountlist, mlist) {
+ if ((strcmp (me->exname, dirpath) == 0) &&
+ (strcmp (me->hostname, hostname) == 0)) {
+ ret = 0;
+ break;
+ }
+ }
+ /* Need this check here because at the end of the search me
+ * might still be pointing to the last entry, which may not be
+ * the one we're looking for.
+ */
+ if (ret == -1) {/* Not found in list. */
+ gf_log (GF_MNT, GF_LOG_TRACE, "Export not found");
+ goto out_unlock;
+ }
+ if (!me)
+ goto out_unlock;
-int
-mnt3svc_umount (struct mount3_state *ms, char *dirpath, char *hostname)
-{
- int ret = -1;
- if ((!ms) || (!dirpath) || (!hostname))
- return -1;
+ gf_log (GF_MNT, GF_LOG_DEBUG, "Unmounting: dir %s, host: %s",
+ me->exname, me->hostname);
- LOCK (&ms->mountlock);
- {
- ret = __mnt3svc_umount (ms, dirpath, hostname);
+ list_del (&me->mlist);
+ GF_FREE (me);
+ __mount_rewrite_rmtab (ms, sh);
}
+out_unlock:
UNLOCK (&ms->mountlock);
-
+ gf_store_unlock (sh);
+out_free:
+ gf_store_handle_destroy (sh);
return ret;
}
@@ -1305,6 +1841,10 @@ mnt3_xlchildren_to_exports (rpcsvc_t *svc, struct mount3_state *ms)
return NULL;
nfs = (struct nfs_state *)ms->nfsx->private;
+ if (!nfs)
+ return NULL;
+
+ LOCK (&ms->mountlock);
list_for_each_entry(ent, &ms->exportlist, explist) {
/* If volume is not started yet, do not list it for tools like
@@ -1320,7 +1860,8 @@ mnt3_xlchildren_to_exports (rpcsvc_t *svc, struct mount3_state *ms)
" failed");
goto free_list;
}
-
+ if (!first)
+ first = elist;
elist->ex_dir = GF_CALLOC (namelen + 2, sizeof (char),
gf_nfs_mt_char);
if (!elist->ex_dir) {
@@ -1328,16 +1869,10 @@ mnt3_xlchildren_to_exports (rpcsvc_t *svc, struct mount3_state *ms)
" failed");
goto free_list;
}
-
strcpy (elist->ex_dir, ent->expname);
addrstr = rpcsvc_volume_allowed (svc->options,
ent->vol->name);
- if (addrstr)
- addrstr = gf_strdup (addrstr);
- else
- addrstr = gf_strdup ("No Access");
-
elist->ex_groups = GF_CALLOC (1, sizeof (struct groupnode),
gf_nfs_mt_groupnode);
if (!elist->ex_groups) {
@@ -1345,21 +1880,29 @@ mnt3_xlchildren_to_exports (rpcsvc_t *svc, struct mount3_state *ms)
" failed");
goto free_list;
}
+ /*This check has to be done after checking
+ * elist->ex_groups allocation check to avoid resource leak;
+ */
+ if (addrstr)
+ addrstr = gf_strdup (addrstr);
+ else
+ addrstr = gf_strdup ("No Access");
+ if (!addrstr) {
+ goto free_list;
+ }
elist->ex_groups->gr_name = addrstr;
if (prev) {
prev->ex_next = elist;
prev = elist;
} else
prev = elist;
-
- if (!first)
- first = elist;
}
ret = 0;
free_list:
+ UNLOCK (&ms->mountlock);
if (ret == -1) {
xdr_free_exports_list (first);
first = NULL;
@@ -1465,12 +2008,13 @@ mount3udp_add_mountlist (char *host, dirpath *expname)
while (*export == '/')
export++;
- strcpy (me->exname, export);
- strcpy (me->hostname, host);
+ strncpy (me->exname, export, MNTPATHLEN);
+ strncpy (me->hostname, host, MNTPATHLEN);
INIT_LIST_HEAD (&me->mlist);
LOCK (&ms->mountlock);
{
list_add_tail (&me->mlist, &ms->mountlist);
+ mount_rewrite_rmtab(ms, NULL);
}
UNLOCK (&ms->mountlock);
return 0;
@@ -1486,11 +2030,155 @@ mount3udp_delete_mountlist (char *hostname, dirpath *expname)
export = (char *)expname;
while (*export == '/')
export++;
- __mnt3svc_umount (ms, export, hostname);
+ mnt3svc_umount (ms, export, hostname);
return 0;
}
+/**
+ * This function will parse the hostip (IP addres, IP range, or hostname)
+ * and fill the host_auth_spec structure.
+ *
+ * @param hostspec - struct host_auth_spec
+ * @param hostip - IP address, IP range (CIDR format) or hostname
+ *
+ * @return 0 - on success and -1 on failure
+ */
+int
+mnt3_export_fill_hostspec (struct host_auth_spec* hostspec, const char* hostip)
+{
+ char *ipdupstr = NULL;
+ char *savptr = NULL;
+ char *ip = NULL;
+ char *token = NULL;
+ int ret = -1;
+
+ /* Create copy of the string so that the source won't change
+ */
+ ipdupstr = gf_strdup (hostip);
+ if (NULL == ipdupstr) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed");
+ goto err;
+ }
+
+ ip = strtok_r (ipdupstr, "/", &savptr);
+ hostspec->host_addr = gf_strdup (ip);
+ if (NULL == hostspec->host_addr) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed");
+ goto err;
+ }
+
+ /* Check if the IP is in <IP address> / <Range> format.
+ * If yes, then strip the range and store it separately.
+ */
+ token = strtok_r (NULL, "/", &savptr);
+
+ if (NULL == token) {
+ hostspec->routeprefix = -1;
+ } else {
+ hostspec->routeprefix = atoi (token);
+ }
+
+ // success
+ ret = 0;
+err:
+ if (NULL != ipdupstr) {
+ GF_FREE (ipdupstr);
+ }
+ return ret;
+}
+
+
+/**
+ * This function will parse the AUTH parameter passed along with
+ * "export-dir" option. If AUTH parameter is present then it will be
+ * stripped from exportpath and stored in mnt3_export (exp) structure.
+ *
+ * @param exp - mnt3_export structure. Holds information needed for mount.
+ * @param exportpath - Value of "export-dir" key. Holds both export path
+ * and AUTH parameter for the path.
+ * exportpath format: <abspath>[(hostdesc[|hostspec|...])]
+ *
+ * @return This function will return 0 on success and -1 on failure.
+ */
+int
+mnt3_export_parse_auth_param (struct mnt3_export* exp, char* exportpath)
+{
+ char *token = NULL;
+ char *savPtr = NULL;
+ char *hostip = NULL;
+ struct host_auth_spec *host = NULL;
+ int ret = 0;
+
+ /* Using exportpath directly in strtok_r because we want
+ * to strip off AUTH parameter from exportpath. */
+ token = strtok_r (exportpath, "(", &savPtr);
+
+ /* Get the next token, which will be the AUTH parameter. */
+ token = strtok_r (NULL, ")", &savPtr);
+
+ if (NULL == token) {
+ /* If AUTH is not present then we should return success. */
+ return 0;
+ }
+
+ /* Free any previously allocated hostspec structure. */
+ if (NULL != exp->hostspec) {
+ GF_FREE (exp->hostspec);
+ exp->hostspec = NULL;
+ }
+
+ exp->hostspec = GF_CALLOC (1,
+ sizeof (*(exp->hostspec)),
+ gf_nfs_mt_auth_spec);
+ if (NULL == exp->hostspec){
+ gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed");
+ return -1;
+ }
+
+ /* AUTH parameter can have multiple entries. For each entry
+ * a host_auth_spec structure is created. */
+ host = exp->hostspec;
+
+ hostip = strtok_r (token, "|", &savPtr);
+
+ /* Parse all AUTH parameters separated by '|' */
+ while (NULL != hostip){
+ ret = mnt3_export_fill_hostspec (host, hostip);
+ if (0 != ret) {
+ gf_log(GF_MNT, GF_LOG_WARNING,
+ "Failed to parse hostspec: %s", hostip);
+ goto err;
+ }
+
+ hostip = strtok_r (NULL, "|", &savPtr);
+ if (NULL == hostip) {
+ break;
+ }
+
+ host->next = GF_CALLOC (1, sizeof (*(host)),
+ gf_nfs_mt_auth_spec);
+ if (NULL == host->next){
+ gf_log (GF_MNT,GF_LOG_ERROR,
+ "Memory allocation failed");
+ goto err;
+ }
+ host = host->next;
+ }
+
+ /* In case of success return from here */
+ return 0;
+err:
+ /* In case of failure free up hostspec structure. */
+ FREE_HOSTSPEC (exp);
+ return -1;
+}
+
+/**
+ * exportpath will also have AUTH options (ip address, subnet address or
+ * hostname) mentioned.
+ * exportpath format: <abspath>[(hostdesc[|hostspec|...])]
+ */
struct mnt3_export *
mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath,
uuid_t volumeid)
@@ -1508,6 +2196,20 @@ mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath,
return NULL;
}
+ if (NULL != exportpath) {
+ /* If exportpath is not NULL then we should check if AUTH
+ * parameter is present or not. If AUTH parameter is present
+ * then it will be stripped and stored in mnt3_export (exp)
+ * structure.
+ */
+ if (0 != mnt3_export_parse_auth_param (exp, exportpath)){
+ gf_log (GF_MNT, GF_LOG_ERROR,
+ "Failed to parse auth param");
+ goto err;
+ }
+ }
+
+
INIT_LIST_HEAD (&exp->explist);
if (exportpath)
alloclen = strlen (xl->name) + 2 + strlen (exportpath);
@@ -1517,8 +2219,6 @@ mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath,
exp->expname = GF_CALLOC (alloclen, sizeof (char), gf_nfs_mt_char);
if (!exp->expname) {
gf_log (GF_MNT, GF_LOG_ERROR, "Memory allocation failed");
- GF_FREE (exp);
- exp = NULL;
goto err;
}
@@ -1535,8 +2235,9 @@ mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath,
ret = snprintf (exp->expname, alloclen, "/%s", xl->name);
}
if (ret < 0) {
- gf_log (xl->name, GF_LOG_WARNING,
- "failed to get the export name");
+ gf_log (xl->name, GF_LOG_ERROR,
+ "Failed to set the export name");
+ goto err;
}
/* Just copy without discrimination, we'll determine whether to
* actually use it when a mount request comes in and a file handle
@@ -1544,7 +2245,16 @@ mnt3_init_export_ent (struct mount3_state *ms, xlator_t *xl, char *exportpath,
*/
uuid_copy (exp->volumeid, volumeid);
exp->vol = xl;
+
+ /* On success we should return from here*/
+ return exp;
err:
+ /* On failure free exp and it's members.*/
+ if (NULL != exp) {
+ mnt3_export_free (exp);
+ exp = NULL;
+ }
+
return exp;
}
@@ -1705,8 +2415,11 @@ __mnt3_init_volume_export (struct mount3_state *ms, dict_t *opts)
goto err;
}
- gf_string2boolean (optstr, &boolt);
- ret = 0;
+ ret = gf_string2boolean (optstr, &boolt);
+ if (ret < 0) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Failed to convert"
+ " string to boolean");
+ }
err:
if (boolt == _gf_false) {
@@ -1745,8 +2458,11 @@ __mnt3_init_dir_export (struct mount3_state *ms, dict_t *opts)
goto err;
}
- gf_string2boolean (optstr, &boolt);
- ret = 0;
+ ret = gf_string2boolean (optstr, &boolt);
+ if (ret < 0) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Failed to convert"
+ " string to boolean");
+ }
err:
if (boolt == _gf_false) {
@@ -1844,18 +2560,20 @@ out:
}
rpcsvc_actor_t mnt3svc_actors[MOUNT3_PROC_COUNT] = {
- {"NULL", MOUNT3_NULL, mnt3svc_null, NULL, 0},
- {"MNT", MOUNT3_MNT, mnt3svc_mnt, NULL, 0},
- {"DUMP", MOUNT3_DUMP, mnt3svc_dump, NULL, 0},
- {"UMNT", MOUNT3_UMNT, mnt3svc_umnt, NULL, 0},
- {"UMNTALL", MOUNT3_UMNTALL, mnt3svc_umntall, NULL, 0},
- {"EXPORT", MOUNT3_EXPORT, mnt3svc_export, NULL, 0}
+ {"NULL", MOUNT3_NULL, mnt3svc_null, NULL, 0, DRC_NA},
+ {"MNT", MOUNT3_MNT, mnt3svc_mnt, NULL, 0, DRC_NA},
+ {"DUMP", MOUNT3_DUMP, mnt3svc_dump, NULL, 0, DRC_NA},
+ {"UMNT", MOUNT3_UMNT, mnt3svc_umnt, NULL, 0, DRC_NA},
+ {"UMNTALL", MOUNT3_UMNTALL, mnt3svc_umntall, NULL, 0, DRC_NA},
+ {"EXPORT", MOUNT3_EXPORT, mnt3svc_export, NULL, 0, DRC_NA}
};
/* Static init parts are assigned here, dynamic ones are done in
* mnt3svc_init and mnt3_init_state.
+ * Making MOUNT3 a synctask so that the blocking DNS calls during rpc auth
+ * gets offloaded to syncenv, keeping the main/poll thread unblocked
*/
rpcsvc_program_t mnt3prog = {
.progname = "MOUNT3",
@@ -1865,6 +2583,7 @@ rpcsvc_program_t mnt3prog = {
.actors = mnt3svc_actors,
.numactors = MOUNT3_PROC_COUNT,
.min_auth = AUTH_NULL,
+ .synctask = _gf_true,
};
@@ -1920,7 +2639,7 @@ mnt3svc_init (xlator_t *nfsx)
}
}
- rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name);
+ ret= rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name);
if (ret == -1) {
gf_log (GF_NFS, GF_LOG_ERROR, "Unable to create listeners");
dict_unref (options);
@@ -1937,12 +2656,12 @@ err:
rpcsvc_actor_t mnt1svc_actors[MOUNT1_PROC_COUNT] = {
- {"NULL", MOUNT1_NULL, mnt3svc_null, NULL, 0},
- {{0, 0}, },
- {"DUMP", MOUNT1_DUMP, mnt3svc_dump, NULL, 0},
- {"UMNT", MOUNT1_UMNT, mnt3svc_umnt, NULL, 0},
- {{0, 0}, },
- {"EXPORT", MOUNT1_EXPORT, mnt3svc_export, NULL, 0}
+ {"NULL", MOUNT1_NULL, mnt3svc_null, NULL, 0, DRC_NA},
+ {"MNT", MOUNT1_MNT, NULL, NULL, 0, DRC_NA },
+ {"DUMP", MOUNT1_DUMP, mnt3svc_dump, NULL, 0, DRC_NA},
+ {"UMNT", MOUNT1_UMNT, mnt3svc_umnt, NULL, 0, DRC_NA},
+ {"UMNTALL", MOUNT1_UMNTALL, NULL, NULL, 0, DRC_NA},
+ {"EXPORT", MOUNT1_EXPORT, mnt3svc_export, NULL, 0, DRC_NA}
};
rpcsvc_program_t mnt1prog = {
@@ -1953,6 +2672,7 @@ rpcsvc_program_t mnt1prog = {
.actors = mnt1svc_actors,
.numactors = MOUNT1_PROC_COUNT,
.min_auth = AUTH_NULL,
+ .synctask = _gf_true,
};
@@ -2007,7 +2727,7 @@ mnt1svc_init (xlator_t *nfsx)
}
}
- rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name);
+ ret = rpcsvc_create_listeners (nfs->rpcsvc, options, nfsx->name);
if (ret == -1) {
gf_log (GF_NFS, GF_LOG_ERROR, "Unable to create listeners");
dict_unref (options);
@@ -2018,3 +2738,44 @@ mnt1svc_init (xlator_t *nfsx)
err:
return NULL;
}
+
+int
+mount_reconfigure_state (xlator_t *nfsx, dict_t *options)
+{
+ int ret = -1;
+ struct nfs_state *nfs = NULL;
+ struct mount3_state *ms = NULL;
+ struct mnt3_export *exp = NULL;
+ struct mnt3_export *texp = NULL;
+
+ if ((!nfsx) || (!options))
+ return (-1);
+
+ nfs = (struct nfs_state *)nfs_state (nfsx);
+ if (!nfs)
+ return (-1);
+
+ ms = nfs->mstate;
+ if (!ms)
+ return (-1);
+
+ /*
+ * Free() up the old export list. mnt3_init_options() will
+ * rebuild the export list from scratch. Do it with locking
+ * to avoid unnecessary race conditions.
+ */
+ LOCK (&ms->mountlock);
+ list_for_each_entry_safe (exp, texp, &ms->exportlist, explist) {
+ list_del (&exp->explist);
+ mnt3_export_free (exp);
+ }
+ ret = mnt3_init_options (ms, options);
+ UNLOCK (&ms->mountlock);
+
+ if (ret < 0) {
+ gf_log (GF_MNT, GF_LOG_ERROR, "Options reconfigure failed");
+ return (-1);
+ }
+
+ return (0);
+}
diff --git a/xlators/nfs/server/src/mount3.h b/xlators/nfs/server/src/mount3.h
index c0eae3644..7fc16ed57 100644
--- a/xlators/nfs/server/src/mount3.h
+++ b/xlators/nfs/server/src/mount3.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _MOUNT3_H_
@@ -53,6 +44,12 @@ mnt1svc_init (xlator_t *nfsx);
extern int
mount_init_state (xlator_t *nfsx);
+extern int
+mount_reconfigure_state (xlator_t *nfsx, dict_t *options);
+
+void
+mount_rewrite_rmtab (struct mount3_state *ms, char *new_rmtab);
+
/* Data structure used to store the list of mounts points currently
* in use by NFS clients.
*/
@@ -68,6 +65,13 @@ struct mountentry {
#define MNT3_EXPTYPE_VOLUME 1
#define MNT3_EXPTYPE_DIR 2
+/* Structure to hold export-dir AUTH parameter */
+struct host_auth_spec {
+ char *host_addr; /* Allowed IP or host name */
+ int routeprefix; /* Routing prefix */
+ struct host_auth_spec *next; /* Pointer to next AUTH struct */
+};
+
struct mnt3_export {
struct list_head explist;
@@ -75,6 +79,11 @@ struct mnt3_export {
* is exported or the subdirectory in the volume.
*/
char *expname;
+ /*
+ * IP address, hostname or subnets who are allowed to connect to expname
+ * subvolume or subdirectory
+ */
+ struct host_auth_spec* hostspec;
xlator_t *vol;
int exptype;
@@ -101,8 +110,8 @@ struct mount3_state {
gf_lock_t mountlock;
/* Set to 0 if exporting full volumes is disabled. On by default. */
- int export_volumes;
- int export_dirs;
+ gf_boolean_t export_volumes;
+ gf_boolean_t export_dirs;
};
#define gf_mnt3_export_dirs(mst) ((mst)->export_dirs)
diff --git a/xlators/nfs/server/src/mount3udp_svc.c b/xlators/nfs/server/src/mount3udp_svc.c
index aa38b1cc4..fb59e282c 100644
--- a/xlators/nfs/server/src/mount3udp_svc.c
+++ b/xlators/nfs/server/src/mount3udp_svc.c
@@ -2,19 +2,10 @@
Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
diff --git a/xlators/nfs/server/src/nfs-common.c b/xlators/nfs/server/src/nfs-common.c
index d35df87bc..f74396ee8 100644
--- a/xlators/nfs/server/src/nfs-common.c
+++ b/xlators/nfs/server/src/nfs-common.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -94,7 +85,7 @@ nfs_mntpath_to_xlator (xlator_list_t *cl, char *path)
if ((!cl) || (!path))
return NULL;
- strcpy (volname, path);
+ strncpy (volname, path, MNTPATHLEN);
pathlen = strlen (volname);
gf_log (GF_NFS, GF_LOG_TRACE, "Subvolume search: %s", path);
if (volname[0] == '/')
@@ -221,6 +212,8 @@ nfs_inode_loc_fill (inode_t *inode, loc_t *loc, int how)
snprintf (tmp_path, sizeof (tmp_path), "<gfid:%s>",
uuid_utoa (loc->gfid));
resolvedpath = gf_strdup (tmp_path);
+ } else {
+ parent = inode_parent (inode, loc->pargfid, NULL);
}
ret = nfs_loc_fill (loc, inode, parent, resolvedpath);
@@ -415,6 +408,9 @@ nfs_hash_gfid (uuid_t gfid)
uint32_t b1 = 0;
uint32_t b2 = 0;
+ if (__is_root_gfid (gfid))
+ return 0x1;
+
memcpy (&msb64, &gfid[8], 8);
memcpy (&lsb64, &gfid[0], 8);
@@ -432,3 +428,38 @@ nfs_hash_gfid (uuid_t gfid)
}
+void
+nfs_fix_generation (xlator_t *this, inode_t *inode)
+{
+ uint64_t raw_ctx = 0;
+ struct nfs_inode_ctx *ictx = NULL;
+ struct nfs_state *priv = NULL;
+ int ret = -1;
+
+ if (!inode) {
+ return;
+ }
+ priv = this->private;
+
+ if (inode_ctx_get(inode,this,&raw_ctx) == 0) {
+ ictx = (struct nfs_inode_ctx *)raw_ctx;
+ ictx->generation = priv->generation;
+ }
+ else {
+ ictx = GF_CALLOC (1, sizeof (struct nfs_inode_ctx),
+ gf_nfs_mt_inode_ctx);
+ if (!ictx) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not allocate nfs inode ctx");
+ return;
+ }
+ INIT_LIST_HEAD(&ictx->shares);
+ ictx->generation = priv->generation;
+ ret = inode_ctx_put (inode, this, (uint64_t)ictx);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not store nfs inode ctx");
+ return;
+ }
+ }
+}
diff --git a/xlators/nfs/server/src/nfs-common.h b/xlators/nfs/server/src/nfs-common.h
index 88fc14961..2e97f1563 100644
--- a/xlators/nfs/server/src/nfs-common.h
+++ b/xlators/nfs/server/src/nfs-common.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NFS_COMMON_H_
@@ -84,4 +75,7 @@ nfs_hash_gfid (uuid_t gfid);
extern int
nfs_gfid_loc_fill (inode_table_t *itable, uuid_t gfid, loc_t *loc, int how);
+
+void
+nfs_fix_generation (xlator_t *this, inode_t *inode);
#endif
diff --git a/xlators/nfs/server/src/nfs-fops.c b/xlators/nfs/server/src/nfs-fops.c
index e2eedf433..56d4cba47 100644
--- a/xlators/nfs/server/src/nfs-fops.c
+++ b/xlators/nfs/server/src/nfs-fops.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -39,27 +30,47 @@
#include <libgen.h>
#include <semaphore.h>
+static int gf_auth_max_groups_nfs_log = 0;
+
void
nfs_fix_groups (xlator_t *this, call_stack_t *root)
{
struct passwd mypw;
char mystrs[1024];
struct passwd *result;
+#ifdef GF_DARWIN_HOST_OS
+ /* BSD/DARWIN does not correctly uses gid_t in getgrouplist */
+ int mygroups[GF_MAX_AUX_GROUPS];
+#else
gid_t mygroups[GF_MAX_AUX_GROUPS];
+#endif
int ngroups;
int i;
+ int max_groups;
struct nfs_state *priv = this->private;
const gid_list_t *agl;
- gid_list_t gl;
+ gid_list_t gl;
if (!priv->server_aux_gids) {
return;
}
- agl = gid_cache_lookup(&priv->gid_cache, root->uid);
+ /* RPC enforces the GF_AUTH_GLUSTERFS_MAX_GROUPS limit */
+ max_groups = GF_AUTH_GLUSTERFS_MAX_GROUPS(root->lk_owner.len);
+
+ agl = gid_cache_lookup(&priv->gid_cache, root->uid, 0, 0);
if (agl) {
- for (ngroups = 0; ngroups < agl->gl_count; ngroups++)
+ if (agl->gl_count > max_groups) {
+ GF_LOG_OCCASIONALLY (gf_auth_max_groups_nfs_log,
+ this->name, GF_LOG_WARNING,
+ "too many groups, reducing %d -> %d",
+ agl->gl_count, max_groups);
+ }
+
+ for (ngroups = 0; ngroups < agl->gl_count
+ && ngroups <= max_groups; ngroups++) {
root->groups[ngroups] = agl->gl_list[ngroups];
+ }
root->ngrps = ngroups;
gid_cache_release(&priv->gid_cache, agl);
return;
@@ -93,12 +104,24 @@ nfs_fix_groups (xlator_t *this, call_stack_t *root)
if (gl.gl_list) {
/* It's not fatal if the alloc failed. */
gl.gl_id = root->uid;
+ gl.gl_uid = 0;
+ gl.gl_gid = 0;
gl.gl_count = ngroups;
memcpy(gl.gl_list, mygroups, sizeof(gid_t) * ngroups);
if (gid_cache_add(&priv->gid_cache, &gl) != 1)
GF_FREE(gl.gl_list);
}
+ /* RPC enforces the GF_AUTH_GLUSTERFS_MAX_GROUPS limit */
+ if (ngroups > max_groups) {
+ GF_LOG_OCCASIONALLY (gf_auth_max_groups_nfs_log,
+ this->name, GF_LOG_WARNING,
+ "too many groups, reducing %d -> %d",
+ ngroups, max_groups);
+
+ ngroups = max_groups;
+ }
+
/* Copy data to the frame. */
for (i = 0; i < ngroups; ++i) {
gf_log (this->name, GF_LOG_TRACE,
@@ -191,6 +214,12 @@ nfs_create_frame (xlator_t *xl, nfs_user_t *nfu)
frame = create_frame (xl, (call_pool_t *)xl->ctx->pool);
if (!frame)
goto err;
+ if (call_stack_alloc_groups (frame->root, nfu->ngrps) != 0) {
+ STACK_DESTROY (frame->root);
+ frame = NULL;
+ goto err;
+ }
+
frame->root->pid = NFS_PID;
frame->root->uid = nfu->uid;
frame->root->gid = nfu->gids[NFS_PRIMGID_IDX];
@@ -318,6 +347,9 @@ nfs_gfid_dict (inode_t *inode)
uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
dyngfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char);
+ if (dyngfid == NULL)
+ return (NULL);
+
uuid_generate (newgfid);
if (uuid_compare (inode->gfid, rootgfid) == 0)
@@ -328,16 +360,17 @@ nfs_gfid_dict (inode_t *inode)
dictgfid = dict_new ();
if (!dictgfid) {
gf_log (GF_NFS, GF_LOG_ERROR, "Failed to create gfid dict");
- goto out;
+ GF_FREE (dyngfid);
+ return (NULL);
}
ret = dict_set_bin (dictgfid, "gfid-req", dyngfid, sizeof (uuid_t));
if (ret < 0) {
+ GF_FREE (dyngfid);
dict_unref (dictgfid);
- dictgfid = NULL;
+ return (NULL);
}
-out:
return dictgfid;
}
@@ -353,9 +386,6 @@ out:
} \
} while (0) \
-
-
-
/* Fops Layer Explained
* The fops layer has three types of functions. They can all be identified by
* their names. Here are the three patterns:
@@ -389,6 +419,10 @@ nfs_fop_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct nfs_fop_local *local = NULL;
fop_lookup_cbk_t progcbk;
+ if (op_ret == 0) {
+ nfs_fix_generation(this,inode);
+ }
+
nfl_to_prog_data (local, progcbk, frame);
nfs_fop_restore_root_ino (local, op_ret, buf, NULL, NULL, postparent);
if (progcbk)
@@ -759,6 +793,10 @@ nfs_fop_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct nfs_fop_local *nfl = NULL;
fop_create_cbk_t progcbk = NULL;
+ if (op_ret == 0) {
+ nfs_fix_generation(this,inode);
+ }
+
nfl_to_prog_data (nfl, progcbk, frame);
nfs_fop_restore_root_ino (nfl, op_ret, buf, NULL, preparent,
postparent);
@@ -861,6 +899,10 @@ nfs_fop_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct nfs_fop_local *nfl = NULL;
fop_mkdir_cbk_t progcbk = NULL;
+ if (op_ret == 0) {
+ nfs_fix_generation(this,inode);
+ }
+
nfl_to_prog_data (nfl, progcbk, frame);
nfs_fop_restore_root_ino (nfl, op_ret, buf, NULL,preparent, postparent);
if (progcbk)
@@ -910,6 +952,10 @@ nfs_fop_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct nfs_fop_local *nfl = NULL;
fop_symlink_cbk_t progcbk = NULL;
+ if (op_ret == 0) {
+ nfs_fix_generation(this,inode);
+ }
+
nfl_to_prog_data (nfl, progcbk, frame);
nfs_fop_restore_root_ino (nfl, op_ret,buf, NULL, preparent, postparent);
if (progcbk)
@@ -1006,6 +1052,10 @@ nfs_fop_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct nfs_fop_local *nfl = NULL;
fop_mknod_cbk_t progcbk = NULL;
+ if (op_ret == 0) {
+ nfs_fix_generation(this,inode);
+ }
+
nfl_to_prog_data (nfl, progcbk, frame);
nfs_fop_restore_root_ino (nfl, op_ret,buf, NULL, preparent, postparent);
if (progcbk)
@@ -1153,6 +1203,10 @@ nfs_fop_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct nfs_fop_local *nfl = NULL;
fop_link_cbk_t progcbk = NULL;
+ if (op_ret == 0) {
+ nfs_fix_generation(this,inode);
+ }
+
nfl_to_prog_data (nfl, progcbk, frame);
nfs_fop_restore_root_ino (nfl, op_ret, buf, NULL, preparent,
postparent);
@@ -1345,7 +1399,7 @@ nfs_fop_write (xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
iobref_add (nfl->iobref, srciob);
*/
STACK_WIND_COOKIE (frame, nfs_fop_writev_cbk, xl, xl,xl->fops->writev,
- fd, vector, count, offset, 0, srciobref, NULL);
+ fd, vector, count, offset, fd->flags, srciobref, NULL);
ret = 0;
err:
if (ret < 0) {
@@ -1505,6 +1559,95 @@ err:
return ret;
}
+int32_t
+nfs_fop_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ struct nfs_fop_local *nfl = NULL;
+ fop_getxattr_cbk_t progcbk = NULL;
+
+ nfl_to_prog_data (nfl, progcbk, frame);
+
+ if (progcbk)
+ progcbk (frame, cookie, this, op_ret, op_errno, dict, xdata);
+
+ nfs_stack_destroy (nfl, frame);
+ return 0;
+}
+
+
+int
+nfs_fop_getxattr (xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+ char *name, dict_t *xdata, fop_getxattr_cbk_t cbk, void *local)
+{
+ call_frame_t *frame = NULL;
+ int ret = -EFAULT;
+ struct nfs_fop_local *nfl = NULL;
+
+ if ((!xl) || (!loc) || (!nfu))
+ return ret;
+
+ nfs_fop_handle_frame_create (frame, nfsx, nfu, ret, err);
+ nfs_fop_handle_local_init (frame, nfsx, nfl, cbk, local, ret, err);
+
+ STACK_WIND_COOKIE (frame, nfs_fop_getxattr_cbk, xl, xl, xl->fops->getxattr,
+ loc, name, NULL);
+ ret = 0;
+err:
+ if (ret < 0) {
+ if (frame)
+ nfs_stack_destroy (nfl, frame);
+ }
+
+ return ret;
+}
+
+
+int32_t
+nfs_fop_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ struct nfs_fop_local *nfl = NULL;
+ fop_setxattr_cbk_t progcbk = NULL;
+
+ nfl_to_prog_data (nfl, progcbk, frame);
+
+ if (progcbk)
+ progcbk (frame, cookie, this, op_ret, op_errno, xdata);
+
+ nfs_stack_destroy (nfl, frame);
+ return 0;
+}
+
+
+int
+nfs_fop_setxattr (xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu,
+ loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata,
+ fop_setxattr_cbk_t cbk, void *local)
+{
+ call_frame_t *frame = NULL;
+ int ret = -EFAULT;
+ struct nfs_fop_local *nfl = NULL;
+
+ if ((!xl) || (!loc) || (!nfu))
+ return ret;
+
+ nfs_fop_handle_frame_create (frame, nfsx, nfu, ret, err);
+ nfs_fop_handle_local_init (frame, nfsx, nfl, cbk, local, ret, err);
+
+ STACK_WIND_COOKIE (frame, nfs_fop_setxattr_cbk, xl, xl, xl->fops->setxattr,
+ loc, dict, flags, xdata);
+ ret = 0;
+err:
+ if (ret < 0) {
+ if (frame)
+ nfs_stack_destroy (nfl, frame);
+ }
+
+ return ret;
+}
+
int32_t
nfs_fop_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
diff --git a/xlators/nfs/server/src/nfs-fops.h b/xlators/nfs/server/src/nfs-fops.h
index b82805939..44e99c66b 100644
--- a/xlators/nfs/server/src/nfs-fops.h
+++ b/xlators/nfs/server/src/nfs-fops.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NFS_FOPS_H_
@@ -120,7 +111,7 @@ nfs_fop_local_wipe (xlator_t *xl, struct nfs_fop_local *l);
nflocal = nfs_fop_local_init (nf); \
if (nflocal) { \
nflocal->proglocal = plocal; \
- nflocal->progcbk = pcbk; \
+ nflocal->progcbk = *VOID(&pcbk); \
nflocal->nfsx = nf; \
if (fram) \
((call_frame_t *)fram)->local = nflocal;\
@@ -245,4 +236,13 @@ extern int
nfs_fop_lk (xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
int cmd, struct gf_flock *flock, fop_lk_cbk_t cbk, void *local);
+extern int
+nfs_fop_getxattr (xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+ char *name, dict_t *xdata, fop_getxattr_cbk_t cbk, void *local);
+
+extern int
+nfs_fop_setxattr (xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu,
+ loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata,
+ fop_setxattr_cbk_t cbk, void *local);
+
#endif
diff --git a/xlators/nfs/server/src/nfs-generics.c b/xlators/nfs/server/src/nfs-generics.c
index 553cc7f39..cb32b7f1b 100644
--- a/xlators/nfs/server/src/nfs-generics.c
+++ b/xlators/nfs/server/src/nfs-generics.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -166,6 +157,22 @@ nfs_lk (xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
}
int
+nfs_getxattr (xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+ char *name, dict_t *xdata, fop_getxattr_cbk_t cbk, void *local)
+{
+ return nfs_fop_getxattr (nfsx, xl, nfu, loc, name, xdata, cbk, local);
+}
+
+int
+nfs_setxattr (xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu,
+ loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata,
+ fop_setxattr_cbk_t cbk, void *local)
+{
+ return nfs_fop_setxattr (nfsx, xl, nfu, loc, dict, flags, xdata, cbk,
+ local);
+}
+
+int
nfs_fsync (xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
int32_t datasync, fop_fsync_cbk_t cbk, void *local)
{
diff --git a/xlators/nfs/server/src/nfs-generics.h b/xlators/nfs/server/src/nfs-generics.h
index 11f191f5f..01876d68e 100644
--- a/xlators/nfs/server/src/nfs-generics.h
+++ b/xlators/nfs/server/src/nfs-generics.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NFS_GENERICS_H_
@@ -164,4 +155,14 @@ nfs_access (xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *pathloc,
extern int
nfs_lk (xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, fd_t *fd,
int cmd, struct gf_flock *flock, fop_lk_cbk_t cbk, void *local);
+
+extern int
+nfs_getxattr (xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu, loc_t *loc,
+ char *name, dict_t *xdata, fop_getxattr_cbk_t cbk, void *local);
+
+extern int
+nfs_setxattr (xlator_t *nfsx, xlator_t *xl, nfs_user_t *nfu,
+ loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata,
+ fop_setxattr_cbk_t cbk, void *local);
+
#endif
diff --git a/xlators/nfs/server/src/nfs-inodes.c b/xlators/nfs/server/src/nfs-inodes.c
index c9bf37206..63d5e8a19 100644
--- a/xlators/nfs/server/src/nfs-inodes.c
+++ b/xlators/nfs/server/src/nfs-inodes.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -36,7 +27,7 @@
do { \
nflocal = fram->local; \
fram->local = nflocal->proglocal; \
- pcbk = nflocal->progcbk; \
+ *VOID(&pcbk) = nflocal->progcbk; \
nfs_fop_local_wipe (nflocal->nfsx, nflocal); \
} while (0) \
@@ -57,10 +48,10 @@ nfl_inodes_init (struct nfs_fop_local *nfl, inode_t *inode, inode_t *parent,
nfl->newparent = inode_ref (newparent);
if (name)
- strcpy (nfl->path, name);
+ strncpy (nfl->path, name, NFS_NAME_MAX);
if (newname)
- strcpy (nfl->newpath, newname);
+ strncpy (nfl->newpath, newname, NFS_NAME_MAX);
return;
}
@@ -572,9 +563,7 @@ nfs_inode_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct nfs_fop_local *nfl = NULL;
fop_open_cbk_t progcbk = NULL;
- if ((op_ret == -1) && (fd))
- fd_unref (fd);
- else
+ if (op_ret != -1)
fd_bind (fd);
inodes_nfl_to_prog_data (nfl, progcbk, frame);
diff --git a/xlators/nfs/server/src/nfs-inodes.h b/xlators/nfs/server/src/nfs-inodes.h
index 7c962b339..ba7a57124 100644
--- a/xlators/nfs/server/src/nfs-inodes.h
+++ b/xlators/nfs/server/src/nfs-inodes.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NFS_INODES_H_
diff --git a/xlators/nfs/server/src/nfs-mem-types.h b/xlators/nfs/server/src/nfs-mem-types.h
index de25b08a8..450b6f2fe 100644
--- a/xlators/nfs/server/src/nfs-mem-types.h
+++ b/xlators/nfs/server/src/nfs-mem-types.h
@@ -2,19 +2,10 @@
Copyright (c) 2008-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -53,6 +44,9 @@ enum gf_nfs_mem_types_ {
gf_nfs_mt_nlm4_nlmclnt,
gf_nfs_mt_nlm4_share,
gf_nfs_mt_aux_gids,
+ gf_nfs_mt_inode_ctx,
+ gf_nfs_mt_auth_spec,
+ gf_nfs_mt_arr,
gf_nfs_mt_end
};
#endif
diff --git a/xlators/nfs/server/src/nfs.c b/xlators/nfs/server/src/nfs.c
index 0a5a9d1e3..918e86312 100644
--- a/xlators/nfs/server/src/nfs.c
+++ b/xlators/nfs/server/src/nfs.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
/* This is the primary translator source for NFS.
@@ -42,9 +33,210 @@
#include "nfs3-helpers.h"
#include "nlm4.h"
#include "options.h"
+#include "acl3.h"
+#include "rpc-drc.h"
+#include "syscall.h"
+
+#define STRINGIFY(val) #val
+#define TOSTRING(val) STRINGIFY(val)
#define OPT_SERVER_AUX_GIDS "nfs.server-aux-gids"
#define OPT_SERVER_GID_CACHE_TIMEOUT "nfs.server.aux-gid-timeout"
+#define OPT_SERVER_RPC_STATD "nfs.rpc-statd"
+#define OPT_SERVER_RPC_STATD_PIDFILE "nfs.rpc-statd-pidfile"
+#define OPT_SERVER_RPC_STATD_NOTIFY_PIDFILE "nfs.rpc-statd-notify-pidfile"
+
+/* TODO: DATADIR should be based on configure's $(localstatedir) */
+#define DATADIR "/var/lib/glusterd"
+#define NFS_DATADIR DATADIR "/nfs"
+
+/* Forward declaration */
+int nfs_add_initer (struct list_head *list, nfs_version_initer_t init);
+
+static int
+nfs_init_version (xlator_t *this, nfs_version_initer_t init)
+{
+ int ret = -1;
+ struct nfs_initer_list *version = NULL;
+ struct nfs_initer_list *tmp = NULL;
+ rpcsvc_program_t *prog = NULL;
+ struct list_head *versions = NULL;
+ struct nfs_state *nfs = NULL;
+ gf_boolean_t found = _gf_false;
+
+ if ((!this) || (!this->private) || (!init))
+ return (-1);
+
+ nfs = (struct nfs_state *)this->private;
+
+ ret = nfs_add_initer (&nfs->versions, init);
+ if (ret == -1) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "Failed to add protocol initializer");
+ goto err;
+ }
+
+ versions = &nfs->versions;
+ list_for_each_entry_safe (version, tmp, versions, list) {
+ prog = version->program;
+ if (version->init == init) {
+ prog = init(this);
+ if (!prog) {
+ ret = -1;
+ goto err;
+ }
+ version->program = prog;
+ found = _gf_true;
+ break;
+ }
+ }
+
+ /* program not added */
+ if (!found) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "Program: %s NOT found", prog->progname);
+ goto err;
+ }
+
+ /* Check if nfs.port is configured */
+ if (nfs->override_portnum)
+ prog->progport = nfs->override_portnum;
+
+ gf_log (GF_NFS, GF_LOG_DEBUG, "Starting program: %s", prog->progname);
+
+ ret = rpcsvc_program_register (nfs->rpcsvc, prog);
+ if (ret == -1) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "Program: %s init failed",
+ prog->progname);
+ goto err;
+ }
+
+ /* Registration with portmapper is disabled, Nothing to do */
+ if (!nfs->register_portmap)
+ goto err;
+
+ ret = rpcsvc_program_register_portmap (prog, prog->progport);
+ if (ret == -1) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "Program %s registration failed",
+ prog->progname);
+ goto err;
+ }
+ ret = 0; /* All well */
+err:
+ return ret;
+}
+
+static int
+nfs_deinit_version (struct nfs_state *nfs, nfs_version_initer_t init)
+{
+ int ret = -1;
+ struct nfs_initer_list *version = NULL;
+ struct nfs_initer_list *tmp = NULL;
+ rpcsvc_program_t *prog = NULL;
+ struct list_head *versions = NULL;
+
+ if ((!nfs) || (!init))
+ return (-1);
+
+ versions = &nfs->versions;
+ list_for_each_entry_safe (version, tmp, versions, list) {
+ prog = version->program;
+ if (version->init == init) {
+ prog = version->program;
+ ret = rpcsvc_program_unregister (nfs->rpcsvc, prog);
+ if (ret != 0)
+ return (-1);
+ list_del (&version->list);
+ GF_FREE (version);
+ return (0);
+ }
+ }
+
+ return (-1);
+}
+
+static int
+nfs_reconfigure_acl3 (xlator_t *this)
+{
+ struct nfs_state *nfs = NULL;
+
+ if ((!this) || (!this->private))
+ return (-1);
+
+ nfs = (struct nfs_state *)this->private;
+
+ /* ACL is enabled */
+ if (nfs->enable_acl)
+ return nfs_init_version (this, acl3svc_init);
+
+ /* ACL is disabled */
+ return nfs_deinit_version (nfs, acl3svc_init);
+}
+
+static int
+nfs_reconfigure_nlm4 (xlator_t *this)
+{
+ struct nfs_state *nfs = NULL;
+
+ if ((!this) || (!this->private))
+ return (-1);
+
+ nfs = (struct nfs_state *)this->private;
+
+ /* NLM is enabled */
+ if (nfs->enable_nlm)
+ return nfs_init_version (this, nlm4svc_init);
+
+ /* NLM is disabled */
+ return nfs_deinit_version (nfs, nlm4svc_init);
+}
+
+static int
+nfs_program_register_portmap_all (struct nfs_state *nfs)
+{
+ struct list_head *versions = NULL;
+ struct nfs_initer_list *version = NULL;
+ struct nfs_initer_list *tmp = NULL;
+ rpcsvc_program_t *prog = NULL;
+
+ if (nfs == NULL)
+ return (-1);
+
+ versions = &nfs->versions;
+ list_for_each_entry_safe (version, tmp, versions, list) {
+ prog = version->program;
+ if (prog == NULL)
+ continue;
+ if (nfs->override_portnum)
+ prog->progport = nfs->override_portnum;
+ (void) rpcsvc_program_register_portmap (prog, prog->progport);
+ }
+
+ return (0);
+}
+
+static int
+nfs_program_unregister_portmap_all (struct nfs_state *nfs)
+{
+ struct list_head *versions = NULL;
+ struct nfs_initer_list *version = NULL;
+ struct nfs_initer_list *tmp = NULL;
+ rpcsvc_program_t *prog = NULL;
+
+ if (nfs == NULL)
+ return (-1);
+
+ versions = &nfs->versions;
+ list_for_each_entry_safe (version, tmp, versions, list) {
+ prog = version->program;
+ if (prog == NULL)
+ continue;
+ (void) rpcsvc_program_unregister_portmap (prog);
+ }
+
+ return (0);
+}
/* Every NFS version must call this function with the init function
* for its particular version.
@@ -122,7 +314,7 @@ nfs_init_versions (struct nfs_state *nfs, xlator_t *this)
ret = -1;
goto err;
}
-// prog->actorxl = this;
+
version->program = prog;
if (nfs->override_portnum)
prog->progport = nfs->override_portnum;
@@ -131,17 +323,21 @@ nfs_init_versions (struct nfs_state *nfs, xlator_t *this)
ret = rpcsvc_program_register (nfs->rpcsvc, prog);
if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Program init failed");
+ gf_log (GF_NFS, GF_LOG_ERROR, "Program: %s init failed",
+ prog->progname);
goto err;
}
- if (rpcsvc_register_portmap_enabled(nfs->rpcsvc)) {
+ if (nfs->register_portmap) {
ret = rpcsvc_program_register_portmap (prog,
prog->progport);
if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Program registration failed");
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "Program %s registration failed",
+ prog->progname);
goto err;
}
}
+
}
ret = 0;
@@ -158,22 +354,22 @@ nfs_add_all_initiators (struct nfs_state *nfs)
/* Add the initializers for all versions. */
ret = nfs_add_initer (&nfs->versions, mnt3svc_init);
if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add protocol"
- " initializer");
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add "
+ "MOUNT3 protocol initializer");
goto ret;
}
ret = nfs_add_initer (&nfs->versions, mnt1svc_init);
if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add protocol"
- " initializer");
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add "
+ "MOUNT1 protocol initializer");
goto ret;
}
ret = nfs_add_initer (&nfs->versions, nfs3svc_init);
if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add protocol"
- " initializer");
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add "
+ "NFS3 protocol initializer");
goto ret;
}
@@ -186,6 +382,15 @@ nfs_add_all_initiators (struct nfs_state *nfs)
}
}
+ if (nfs->enable_acl == _gf_true) {
+ ret = nfs_add_initer (&nfs->versions, acl3svc_init);
+ if (ret == -1) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add "
+ "ACL protocol initializer");
+ goto ret;
+ }
+ }
+
ret = 0;
ret:
return ret;
@@ -519,10 +724,11 @@ nfs_init_state (xlator_t *this)
if (!this)
return NULL;
- if ((!this->children) || (!this->children->xlator)) {
- gf_log (GF_NFS, GF_LOG_ERROR, "nfs must have at least one"
- " child subvolume");
- return NULL;
+ if (!this->children) {
+ gf_log (GF_NFS, GF_LOG_INFO,
+ "NFS is manually disabled: Exiting");
+ /* Nothing for nfs process to do, exit cleanly */
+ kill (getpid (), SIGTERM);
}
nfs = GF_CALLOC (1, sizeof (*nfs), gf_nfs_mt_nfs_state);
@@ -578,19 +784,17 @@ nfs_init_state (xlator_t *this)
}
nfs->enable_nlm = _gf_true;
- if (!dict_get_str (this->options, "nfs.nlm", &optstr)) {
-
- ret = gf_string2boolean (optstr, &boolt);
- if (ret < 0) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Failed to parse"
- " bool string");
- goto free_foppool;
- }
+ ret = dict_get_str_boolean (this->options, "nfs.nlm", _gf_true);
+ if (ret == _gf_false) {
+ gf_log (GF_NFS, GF_LOG_INFO, "NLM is manually disabled");
+ nfs->enable_nlm = _gf_false;
+ }
- if (boolt == _gf_false) {
- gf_log (GF_NFS, GF_LOG_INFO, "NLM is manually disabled");
- nfs->enable_nlm = _gf_false;
- }
+ nfs->enable_acl = _gf_true;
+ ret = dict_get_str_boolean (this->options, "nfs.acl", _gf_true);
+ if (ret == _gf_false) {
+ gf_log (GF_NFS, GF_LOG_INFO, "ACL is manually disabled");
+ nfs->enable_acl = _gf_false;
}
nfs->enable_ino32 = 0;
@@ -674,6 +878,15 @@ nfs_init_state (xlator_t *this)
nfs->mount_udp = 1;
}
+ nfs->rmtab = gf_strdup (NFS_DATADIR "/rmtab");
+ if (dict_get(this->options, "nfs.mount-rmtab")) {
+ ret = dict_get_str (this->options, "nfs.mount-rmtab", &nfs->rmtab);
+ if (ret == -1) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to parse dict");
+ goto free_foppool;
+ }
+ }
+
/* support both options rpc-auth.ports.insecure and
* rpc-auth-allow-insecure for backward compatibility
*/
@@ -733,32 +946,57 @@ nfs_init_state (xlator_t *this)
goto free_foppool;
}
}
+ GF_OPTION_INIT (OPT_SERVER_RPC_STATD, nfs->rpc_statd, path, free_foppool);
+
+ GF_OPTION_INIT (OPT_SERVER_RPC_STATD_PIDFILE, nfs->rpc_statd_pid_file, path, free_foppool);
GF_OPTION_INIT (OPT_SERVER_AUX_GIDS, nfs->server_aux_gids,
bool, free_foppool);
- GF_OPTION_INIT (OPT_SERVER_GID_CACHE_TIMEOUT, nfs->server_aux_gids_max_age,
+ GF_OPTION_INIT (OPT_SERVER_GID_CACHE_TIMEOUT,
+ nfs->server_aux_gids_max_age,
uint32, free_foppool);
- if (gid_cache_init(&nfs->gid_cache, nfs->server_aux_gids_max_age) < 0) {
- gf_log(GF_NFS, GF_LOG_ERROR, "Failed to initialize group cache.");
- goto free_foppool;
- }
+ if (gid_cache_init(&nfs->gid_cache, nfs->server_aux_gids_max_age) < 0) {
+ gf_log(GF_NFS, GF_LOG_ERROR, "Failed to initialize group cache.");
+ goto free_foppool;
+ }
+
+ ret = sys_access (nfs->rpc_statd, X_OK);
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_WARNING, "%s not enough permissions to"
+ " access. Disabling NLM", nfs->rpc_statd);
+ nfs->enable_nlm = _gf_false;
+ }
- if (stat("/sbin/rpc.statd", &stbuf) == -1) {
- gf_log (GF_NFS, GF_LOG_WARNING, "/sbin/rpc.statd not found. "
- "Disabling NLM");
+ ret = sys_stat (nfs->rpc_statd, &stbuf);
+ if (ret || !S_ISREG (stbuf.st_mode)) {
+ gf_log (GF_NFS, GF_LOG_WARNING, "%s not a regular file."
+ " Disabling NLM", nfs->rpc_statd);
nfs->enable_nlm = _gf_false;
}
- nfs->rpcsvc = rpcsvc_init (this, this->ctx, this->options, 0);
+ nfs->rpcsvc = rpcsvc_init (this, this->ctx,
+ this->options, fopspoolsize);
if (!nfs->rpcsvc) {
ret = -1;
gf_log (GF_NFS, GF_LOG_ERROR, "RPC service init failed");
goto free_foppool;
}
+ ret = rpcsvc_set_outstanding_rpc_limit (nfs->rpcsvc,
+ this->options,
+ RPCSVC_DEF_NFS_OUTSTANDING_RPC_LIMIT);
+ if (ret < 0) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "Failed to configure outstanding-rpc-limit");
+ goto free_foppool;
+ }
+
+ nfs->register_portmap = rpcsvc_register_portmap_enabled (nfs->rpcsvc);
+
this->private = (void *)nfs;
INIT_LIST_HEAD (&nfs->versions);
+ nfs->generation = 1965;
ret = 0;
@@ -777,7 +1015,293 @@ free_rpcsvc:
return nfs;
}
+int
+nfs_drc_init (xlator_t *this)
+{
+ int ret = -1;
+ rpcsvc_t *svc = NULL;
+
+ GF_VALIDATE_OR_GOTO (GF_NFS, this, out);
+ GF_VALIDATE_OR_GOTO (GF_NFS, this->private, out);
+
+ svc = ((struct nfs_state *)(this->private))->rpcsvc;
+ if (!svc)
+ goto out;
+
+ ret = rpcsvc_drc_init (svc, this->options);
+
+ out:
+ return ret;
+}
+
+int
+nfs_reconfigure_state (xlator_t *this, dict_t *options)
+{
+ int ret = 0;
+ int keyindx = 0;
+ char *rmtab = NULL;
+ char *rpc_statd = NULL;
+ gf_boolean_t optbool;
+ uint32_t optuint32;
+ struct nfs_state *nfs = NULL;
+ char *blacklist_keys[] = {
+ "nfs.port",
+ "nfs.transport-type",
+ "nfs.mem-factor",
+ NULL};
+
+ GF_VALIDATE_OR_GOTO (GF_NFS, this, out);
+ GF_VALIDATE_OR_GOTO (GF_NFS, this->private, out);
+ GF_VALIDATE_OR_GOTO (GF_NFS, options, out);
+
+ nfs = (struct nfs_state *)this->private;
+
+ /* Black listed options can't be reconfigured, they need
+ * NFS to be restarted. There are two cases 1. SET 2. UNSET.
+ * 1. SET */
+ while (blacklist_keys[keyindx]) {
+ if (dict_get (options, blacklist_keys[keyindx])) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "Reconfiguring %s needs NFS restart",
+ blacklist_keys[keyindx]);
+ goto out;
+ }
+ keyindx ++;
+ }
+
+ /* UNSET for nfs.mem-factor */
+ if ((!dict_get (options, "nfs.mem-factor")) &&
+ (nfs->memfactor != GF_NFS_DEFAULT_MEMFACTOR)) {
+ gf_log (GF_NFS, GF_LOG_INFO,
+ "Reconfiguring nfs.mem-factor needs NFS restart");
+ goto out;
+ }
+
+ /* UNSET for nfs.port */
+ if ((!dict_get (options, "nfs.port")) &&
+ (nfs->override_portnum)) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "Reconfiguring nfs.port needs NFS restart");
+ goto out;
+ }
+
+ /* reconfig nfs.rpc-statd... */
+ rpc_statd = GF_RPC_STATD_PROG;
+ if (dict_get (options, OPT_SERVER_RPC_STATD_PIDFILE)) {
+ ret = dict_get_str (options, "nfs.rpc-statd", &rpc_statd);
+ if (ret < 0) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to read "
+ "reconfigured option: nfs.rpc-statd");
+ goto out;
+ }
+ }
+
+ if (strcmp(nfs->rpc_statd, rpc_statd) != 0) {
+ gf_log (GF_NFS, GF_LOG_INFO,
+ "Reconfiguring nfs.rpc-statd needs NFS restart");
+ goto out;
+ }
+
+ /* reconfig nfs.mount-rmtab */
+ rmtab = NFS_DATADIR "/rmtab";
+ if (dict_get (options, "nfs.mount-rmtab")) {
+ ret = dict_get_str (options, "nfs.mount-rmtab", &rmtab);
+ if (ret < 0) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to read "
+ "reconfigured option: nfs.mount-rmtab");
+ goto out;
+ }
+ gf_path_strip_trailing_slashes (rmtab);
+ }
+ if (strcmp (nfs->rmtab, rmtab) != 0) {
+ mount_rewrite_rmtab (nfs->mstate, rmtab);
+ gf_log (GF_NFS, GF_LOG_INFO,
+ "Reconfigured nfs.mount-rmtab path: %s",
+ nfs->rmtab);
+ }
+
+ GF_OPTION_RECONF (OPT_SERVER_AUX_GIDS, optbool,
+ options, bool, out);
+ if (nfs->server_aux_gids != optbool) {
+ nfs->server_aux_gids = optbool;
+ gf_log(GF_NFS, GF_LOG_INFO, "Reconfigured %s with value %d",
+ OPT_SERVER_AUX_GIDS, optbool);
+ }
+
+ GF_OPTION_RECONF (OPT_SERVER_GID_CACHE_TIMEOUT, optuint32,
+ options, uint32, out);
+ if (nfs->server_aux_gids_max_age != optuint32) {
+ nfs->server_aux_gids_max_age = optuint32;
+ gid_cache_reconf (&nfs->gid_cache, optuint32);
+ gf_log(GF_NFS, GF_LOG_INFO, "Reconfigured %s with value %d",
+ OPT_SERVER_GID_CACHE_TIMEOUT, optuint32);
+ }
+
+ /* reconfig nfs.dynamic-volumes */
+ ret = dict_get_str_boolean (options, "nfs.dynamic-volumes",
+ GF_NFS_DVM_OFF);
+ switch (ret) {
+ case GF_NFS_DVM_ON:
+ case GF_NFS_DVM_OFF:
+ optbool = ret;
+ break;
+ default:
+ optbool = GF_NFS_DVM_OFF;
+ break;
+ }
+ if (nfs->dynamicvolumes != optbool) {
+ nfs->dynamicvolumes = optbool;
+ gf_log(GF_NFS, GF_LOG_INFO, "Reconfigured nfs.dynamic-volumes"
+ " with value %d", optbool);
+ }
+
+ optbool = _gf_false;
+ if (dict_get (options, "nfs.enable-ino32")) {
+ ret = dict_get_str_boolean (options, "nfs.enable-ino32",
+ _gf_false);
+ if (ret < 0) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "Failed to read reconfigured option: "
+ "nfs.enable-ino32");
+ goto out;
+ }
+ optbool = ret;
+ }
+ if (nfs->enable_ino32 != optbool) {
+ nfs->enable_ino32 = optbool;
+ gf_log(GF_NFS, GF_LOG_INFO, "Reconfigured nfs.enable-ino32"
+ " with value %d", optbool);
+ }
+
+ /* nfs.nlm is enabled by default */
+ ret = dict_get_str_boolean (options, "nfs.nlm", _gf_true);
+ if (ret < 0) {
+ optbool = _gf_true;
+ } else {
+ optbool = ret;
+ }
+ if (nfs->enable_nlm != optbool) {
+ gf_log (GF_NFS, GF_LOG_INFO, "NLM is manually %s",
+ (optbool ? "enabled":"disabled"));
+ nfs->enable_nlm = optbool;
+ nfs_reconfigure_nlm4 (this);
+ }
+
+ /* nfs.acl is enabled by default */
+ ret = dict_get_str_boolean (options, "nfs.acl", _gf_true);
+ if (ret < 0) {
+ optbool = _gf_true;
+ } else {
+ optbool = ret;
+ }
+ if (nfs->enable_acl != optbool) {
+ gf_log (GF_NFS, GF_LOG_INFO, "ACL is manually %s",
+ (optbool ? "enabled":"disabled"));
+ nfs->enable_acl = optbool;
+ nfs_reconfigure_acl3 (this);
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
+ * reconfigure() for NFS server xlator.
+ */
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+ int ret = 0;
+ struct nfs_state *nfs = NULL;
+ gf_boolean_t regpmap = _gf_true;
+
+ if ((!this) || (!this->private) || (!options))
+ return (-1);
+
+ nfs = (struct nfs_state *)this->private;
+
+ /* Reconfigure nfs options */
+ ret = nfs_reconfigure_state(this, options);
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "nfs reconfigure state failed");
+ return (-1);
+ }
+
+ /* Reconfigure nfs3 options */
+ ret = nfs3_reconfigure_state(this, options);
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "nfs3 reconfigure state failed");
+ return (-1);
+ }
+
+ /* Reconfigure mount options */
+ ret = mount_reconfigure_state(this, options);
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "mount reconfigure state failed");
+ return (-1);
+ }
+
+ /* Reconfigure rpc layer */
+ ret = rpcsvc_reconfigure_options (nfs->rpcsvc, options);
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "rpcsvc reconfigure options failed");
+ return (-1);
+ }
+
+ /* Reconfigure rpc.outstanding-rpc-limit */
+ ret = rpcsvc_set_outstanding_rpc_limit (nfs->rpcsvc,
+ options,
+ RPCSVC_DEF_NFS_OUTSTANDING_RPC_LIMIT);
+ if (ret < 0) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "Failed to reconfigure outstanding-rpc-limit");
+ return (-1);
+ }
+
+ regpmap = rpcsvc_register_portmap_enabled(nfs->rpcsvc);
+ if (nfs->register_portmap != regpmap) {
+ nfs->register_portmap = regpmap;
+ if (regpmap) {
+ (void) nfs_program_register_portmap_all (nfs);
+ } else {
+ (void) nfs_program_unregister_portmap_all (nfs);
+ }
+ }
+
+ /* Reconfigure drc */
+ ret = rpcsvc_drc_reconfigure (nfs->rpcsvc, options);
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_ERROR,
+ "rpcsvc DRC reconfigure failed");
+ return (-1);
+ }
+
+ return (0);
+}
+
+/* Main init() routine for NFS server xlator. It inits NFS v3 protocol
+ * and its dependent protocols e.g. ACL, MOUNT v3 (mount3), NLM and
+ * DRC.
+ *
+ * Usage: glusterfsd:
+ * glusterfs_process_volfp() =>
+ * glusterfs_graph_activate() =>
+ * glusterfs_graph_init() =>
+ * xlator_init () => NFS init() routine
+ *
+ * If init() routine fails, the glusterfsd cleans up the NFS process
+ * by invoking cleanup_and_exit().
+ *
+ * RETURN:
+ * 0 (SUCCESS) if all protocol specific inits PASS.
+ * -1 (FAILURE) if any of them FAILS.
+ */
int
init (xlator_t *this) {
@@ -785,57 +1309,52 @@ init (xlator_t *this) {
int ret = -1;
if (!this)
- return -1;
+ return (-1);
nfs = nfs_init_state (this);
if (!nfs) {
gf_log (GF_NFS, GF_LOG_ERROR, "Failed to init nfs option");
- return -1;
+ return (-1);
}
ret = nfs_add_all_initiators (nfs);
- if (ret == -1) {
+ if (ret) {
gf_log (GF_NFS, GF_LOG_ERROR, "Failed to add initiators");
- goto err;
+ return (-1);
}
ret = nfs_init_subvolumes (nfs, this->children);
- if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_CRITICAL, "Failed to init NFS "
- "exports");
- goto err;
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_CRITICAL, "Failed to init NFS exports");
+ return (-1);
}
ret = mount_init_state (this);
- if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_CRITICAL, "Failed to init Mount"
- "state");
- goto err;
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_CRITICAL, "Failed to init Mount state");
+ return (-1);
}
ret = nlm4_init_state (this);
- if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_CRITICAL, "Failed to init NLM"
- "state");
- goto err;
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_CRITICAL, "Failed to init NLM state");
+ return (-1);
}
ret = nfs_init_versions (nfs, this);
- if (ret == -1) {
- gf_log (GF_NFS, GF_LOG_ERROR, "Failed to initialize "
- "protocols");
- /* Do not return an error on this. If we dont return
- * an error, the process keeps running and it helps
- * to point out where the log is by doing ps ax|grep gluster.
- */
- ret = 0;
- goto err;
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to initialize protocols");
+ return (-1);
}
- gf_log (GF_NFS, GF_LOG_INFO, "NFS service started");
-err:
+ ret = nfs_drc_init (this);
+ if (ret) {
+ gf_log (GF_NFS, GF_LOG_ERROR, "Failed to initialize DRC");
+ return (-1);
+ }
- return ret;
+ gf_log (GF_NFS, GF_LOG_INFO, "NFS service started");
+ return (0); /* SUCCESS */
}
@@ -843,24 +1362,26 @@ int
notify (xlator_t *this, int32_t event, void *data, ...)
{
xlator_t *subvol = NULL;
+ struct nfs_state *priv = NULL;
subvol = (xlator_t *)data;
gf_log (GF_NFS, GF_LOG_TRACE, "Notification received: %d",
event);
- switch (event)
- {
- case GF_EVENT_CHILD_UP:
- {
- nfs_startup_subvolume (this, subvol);
- break;
- }
- case GF_EVENT_PARENT_UP:
- {
- default_notify (this, GF_EVENT_PARENT_UP, data);
- break;
- }
+ switch (event) {
+ case GF_EVENT_CHILD_UP:
+ nfs_startup_subvolume (this, subvol);
+ break;
+
+ case GF_EVENT_CHILD_MODIFIED:
+ priv = this->private;
+ ++(priv->generation);
+ break;
+
+ case GF_EVENT_PARENT_UP:
+ default_notify (this, GF_EVENT_PARENT_UP, data);
+ break;
}
return 0;
@@ -882,14 +1403,14 @@ fini (xlator_t *this)
int32_t
nfs_forget (xlator_t *this, inode_t *inode)
{
- uint64_t ctx = 0;
- struct list_head *head = NULL;
+ uint64_t ctx = 0;
+ struct nfs_inode_ctx *ictx = NULL;
if (inode_ctx_del (inode, this, &ctx))
return -1;
- head = (struct list_head *)ctx;
- GF_FREE (head);
+ ictx = (struct nfs_inode_ctx *)ctx;
+ GF_FREE (ictx);
return 0;
}
@@ -985,7 +1506,22 @@ nlm_priv (xlator_t *this);
int32_t
nfs_priv (xlator_t *this)
{
- return nlm_priv (this);
+ int32_t ret = -1;
+
+ /* DRC needs the global drc structure, xl is of no use to it. */
+ ret = rpcsvc_drc_priv (((struct nfs_state *)(this->private))->rpcsvc->drc);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Statedump of DRC failed");
+ goto out;
+ }
+
+ ret = nlm_priv (this);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG, "Statedump of NLM failed");
+ goto out;
+ }
+ out:
+ return ret;
}
@@ -993,7 +1529,7 @@ struct xlator_cbks cbks = {
.forget = nfs_forget,
};
-struct xlator_fops fops = { };
+struct xlator_fops fops;
struct xlator_dumpops dumpops = {
.priv = nfs_priv,
@@ -1008,29 +1544,53 @@ struct xlator_dumpops dumpops = {
struct volume_options options[] = {
{ .key = {"nfs3.read-size"},
.type = GF_OPTION_TYPE_SIZET,
- .description = "Size in which the client should issue read requests"
- " to the Gluster NFSv3 server. Must be a multiple of"
- " 4KB."
+ .min = GF_NFS3_RTMIN,
+ .max = GF_NFS3_RTMAX,
+ .default_value = TOSTRING(GF_NFS3_RTPREF),
+ .description = "Size in which the client should issue read requests "
+ "to the Gluster NFSv3 server. Must be a multiple of "
+ "4KB (4096). Min and Max supported values are 4KB "
+ "(4096) and 1MB (1048576) respectively. If the "
+ "specified value is within the supported range but "
+ "not a multiple of 4096, it is rounded up to the "
+ "nearest multiple of 4096."
},
{ .key = {"nfs3.write-size"},
.type = GF_OPTION_TYPE_SIZET,
- .description = "Size in which the client should issue write requests"
- " to the Gluster NFSv3 server. Must be a multiple of"
- " 4KB."
+ .min = GF_NFS3_WTMIN,
+ .max = GF_NFS3_WTMAX,
+ .default_value = TOSTRING(GF_NFS3_WTPREF),
+ .description = "Size in which the client should issue write requests "
+ "to the Gluster NFSv3 server. Must be a multiple of "
+ "1KB (1024). Min and Max supported values are "
+ "4KB (4096) and 1MB(1048576) respectively. If the "
+ "specified value is within the supported range but "
+ "not a multiple of 4096, it is rounded up to the "
+ "nearest multiple of 4096."
},
{ .key = {"nfs3.readdir-size"},
.type = GF_OPTION_TYPE_SIZET,
+ .min = GF_NFS3_DTMIN,
+ .max = GF_NFS3_DTMAX,
+ .default_value = TOSTRING(GF_NFS3_DTPREF),
.description = "Size in which the client should issue directory "
- " reading requests."
+ "reading requests to the Gluster NFSv3 server. Must "
+ "be a multiple of 1KB (1024). Min and Max supported "
+ "values are 4KB (4096) and 1MB (1048576) respectively."
+ "If the specified value is within the supported range "
+ "but not a multiple of 4096, it is rounded up to the "
+ "nearest multiple of 4096."
},
{ .key = {"nfs3.*.volume-access"},
.type = GF_OPTION_TYPE_STR,
.value = {"read-only", "read-write"},
+ .default_value = "read-write",
.description = "Type of access desired for this subvolume: "
" read-only, read-write(default)"
},
{ .key = {"nfs3.*.trusted-write"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
.description = "On an UNSTABLE write from client, return STABLE flag"
" to force client to not send a COMMIT request. In "
"some environments, combined with a replicated "
@@ -1045,6 +1605,7 @@ struct volume_options options[] = {
},
{ .key = {"nfs3.*.trusted-sync"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
.description = "All writes and COMMIT requests are treated as async."
" This implies that no write requests are guaranteed"
" to be on server disks when the write reply is "
@@ -1054,25 +1615,40 @@ struct volume_options options[] = {
},
{ .key = {"nfs3.*.export-dir"},
.type = GF_OPTION_TYPE_PATH,
+ .default_value = "",
.description = "By default, all subvolumes of nfs are exported as "
"individual exports. There are cases where a "
"subdirectory or subdirectories in the volume need to "
"be exported separately. This option can also be used "
"in conjunction with nfs3.export-volumes option to "
"restrict exports only to the subdirectories specified"
- " through this option. Must be an absolute path."
+ " through this option. Must be an absolute path. Along"
+ " with path allowed list of IPs/hostname can be "
+ "associated with each subdirectory. If provided "
+ "connection will allowed only from these IPs. By "
+ "default connections from all IPs are allowed. "
+ "Format: <dir>[(hostspec[|hostspec|...])][,...]. Where"
+ " hostspec can be an IP address, hostname or an IP "
+ "range in CIDR notation. "
+ "e.g. /foo(192.168.1.0/24|host1|10.1.1.8),/host2."
+ " NOTE: Care must be taken while configuring this "
+ "option as invalid entries and/or unreachable DNS "
+ "servers can introduce unwanted delay in all the mount"
+ " calls."
},
{ .key = {"nfs3.export-dirs"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "By default, all subvolumes of nfs are exported as "
"individual exports. There are cases where a "
"subdirectory or subdirectories in the volume need to "
"be exported separately. Enabling this option allows "
"any directory on a volumes to be exported separately."
- " Directory exports are enabled by default."
+ "Directory exports are enabled by default."
},
{ .key = {"nfs3.export-volumes"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "Enable or disable exporting whole volumes, instead "
"if used in conjunction with nfs3.export-dir, can "
"allow setting up only subdirectories as exports. On "
@@ -1080,37 +1656,42 @@ struct volume_options options[] = {
},
{ .key = {"rpc-auth.auth-unix"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "Disable or enable the AUTH_UNIX authentication type."
- "Must always be enabled for better interoperability."
- "However, can be disabled if needed. Enabled by"
+ "Must always be enabled for better interoperability. "
+ "However, can be disabled if needed. Enabled by "
"default"
},
{ .key = {"rpc-auth.auth-null"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "Disable or enable the AUTH_NULL authentication type."
"Must always be enabled. This option is here only to"
" avoid unrecognized option warnings"
},
{ .key = {"rpc-auth.auth-unix.*"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "Disable or enable the AUTH_UNIX authentication type "
"for a particular exported volume overriding defaults"
" and general setting for AUTH_UNIX scheme. Must "
- "always be enabled for better interoperability."
- "However, can be disabled if needed. Enabled by"
+ "always be enabled for better interoperability. "
+ "However, can be disabled if needed. Enabled by "
"default."
},
{ .key = {"rpc-auth.auth-unix.*.allow"},
.type = GF_OPTION_TYPE_STR,
+ .default_value = "on",
.description = "Disable or enable the AUTH_UNIX authentication type "
"for a particular exported volume overriding defaults"
" and general setting for AUTH_UNIX scheme. Must "
- "always be enabled for better interoperability."
- "However, can be disabled if needed. Enabled by"
+ "always be enabled for better interoperability. "
+ "However, can be disabled if needed. Enabled by "
"default."
},
{ .key = {"rpc-auth.auth-null.*"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "Disable or enable the AUTH_NULL authentication type "
"for a particular exported volume overriding defaults"
" and general setting for AUTH_NULL. Must always be "
@@ -1118,35 +1699,40 @@ struct volume_options options[] = {
"unrecognized option warnings."
},
{ .key = {"rpc-auth.addr.allow"},
- .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
+ .type = GF_OPTION_TYPE_CLIENT_AUTH_ADDR,
+ .default_value = "all",
.description = "Allow a comma separated list of addresses and/or"
" hostnames to connect to the server. By default, all"
" connections are allowed. This allows users to "
"define a general rule for all exported volumes."
},
{ .key = {"rpc-auth.addr.reject"},
- .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
+ .type = GF_OPTION_TYPE_CLIENT_AUTH_ADDR,
+ .default_value = "none",
.description = "Reject a comma separated list of addresses and/or"
" hostnames from connecting to the server. By default,"
" all connections are allowed. This allows users to"
"define a general rule for all exported volumes."
},
{ .key = {"rpc-auth.addr.*.allow"},
- .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
+ .type = GF_OPTION_TYPE_CLIENT_AUTH_ADDR,
+ .default_value = "all",
.description = "Allow a comma separated list of addresses and/or"
" hostnames to connect to the server. By default, all"
" connections are allowed. This allows users to "
"define a rule for a specific exported volume."
},
{ .key = {"rpc-auth.addr.*.reject"},
- .type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
+ .type = GF_OPTION_TYPE_CLIENT_AUTH_ADDR,
+ .default_value = "none",
.description = "Reject a comma separated list of addresses and/or"
" hostnames from connecting to the server. By default,"
- " all connections are allowed. This allows users to"
+ " all connections are allowed. This allows users to "
"define a rule for a specific exported volume."
},
{ .key = {"rpc-auth.ports.insecure"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
.description = "Allow client connections from unprivileged ports. By "
"default only privileged ports are allowed. This is a"
"global setting in case insecure ports are to be "
@@ -1154,31 +1740,35 @@ struct volume_options options[] = {
},
{ .key = {"rpc-auth.ports.*.insecure"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
.description = "Allow client connections from unprivileged ports. By "
"default only privileged ports are allowed. Use this"
" option to enable or disable insecure ports for "
- "a specific subvolume and to override the global setting "
- " set by the previous option."
+ "a specific subvolume and to override the global "
+ "setting set by the previous option."
},
{ .key = {"rpc-auth.addr.namelookup"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
.description = "Users have the option of turning on name lookup for"
" incoming client connections using this option. Use this "
"option to turn on name lookups during address-based "
"authentication. Turning this on will enable you to"
- " use hostnames in rpc-auth.addr.* filters. In some "
+ " use hostnames in nfs.rpc-auth-* filters. In some "
"setups, the name server can take too long to reply to DNS "
- "queries resulting in timeouts of mount requests. By default, "
- " name lookup is off"
+ "queries resulting in timeouts of mount requests. By "
+ "default, name lookup is off"
},
{ .key = {"nfs.dynamic-volumes"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
.description = "Internal option set to tell gnfs to use a different"
" scheme for encoding file handles when DVM is being"
" used."
},
{ .key = {"nfs3.*.volume-id"},
.type = GF_OPTION_TYPE_STR,
+ .default_value = "",
.description = "When nfs.dynamic-volumes is set, gnfs expects every "
"subvolume to have this option set for it, so that "
"gnfs can use this option to identify the volume. "
@@ -1187,22 +1777,34 @@ struct volume_options options[] = {
},
{ .key = {"nfs.enable-ino32"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
.description = "For nfs clients or apps that do not support 64-bit "
"inode numbers, use this option to make NFS return "
- "32-bit inode numbers instead. Disabled by default, so "
- "NFS returns 64-bit inode numbers."
+ "32-bit inode numbers instead. Disabled by default, so"
+ " NFS returns 64-bit inode numbers."
},
{ .key = {"rpc.register-with-portmap"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "For systems that need to run multiple nfs servers, "
"only one registration is possible with "
"portmap service. Use this option to turn off portmap "
"registration for Gluster NFS. On by default"
},
+ { .key = {"rpc.outstanding-rpc-limit"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = RPCSVC_MIN_OUTSTANDING_RPC_LIMIT,
+ .max = RPCSVC_MAX_OUTSTANDING_RPC_LIMIT,
+ .default_value = TOSTRING(RPCSVC_DEF_NFS_OUTSTANDING_RPC_LIMIT),
+ .description = "Parameter to throttle the number of incoming RPC "
+ "requests from a client. 0 means no limit (can "
+ "potentially run out of memory)"
+ },
{ .key = {"nfs.port"},
.type = GF_OPTION_TYPE_INT,
.min = 1,
.max = 0xffff,
+ .default_value = TOSTRING(GF_NFS3_PORT),
.description = "Use this option on systems that need Gluster NFS to "
"be associated with a non-default port number."
},
@@ -1210,6 +1812,7 @@ struct volume_options options[] = {
.type = GF_OPTION_TYPE_INT,
.min = 1,
.max = 1024,
+ .default_value = TOSTRING(GF_NFS_DEFAULT_MEMFACTOR),
.description = "Use this option to make NFS be faster on systems by "
"using more memory. This option specifies a multiple "
"that determines the total amount of memory used. "
@@ -1220,12 +1823,14 @@ struct volume_options options[] = {
},
{ .key = {"nfs.*.disable"},
.type = GF_OPTION_TYPE_BOOL,
- .description = "This option is used to start or stop NFS server"
- "for individual volume."
+ .default_value = "false",
+ .description = "This option is used to start or stop the NFS server "
+ "for individual volumes."
},
{ .key = {"nfs.nlm"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
.description = "This option, if set to 'off', disables NLM server "
"by not registering the service with the portmapper."
" Set it to 'on' to re-enable it. Default value: 'on'"
@@ -1233,9 +1838,32 @@ struct volume_options options[] = {
{ .key = {"nfs.mount-udp"},
.type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
.description = "set the option to 'on' to enable mountd on UDP. "
- "Needed by Solaris NFS clients if NLM support is"
- "needed"
+ "Required for some Solaris and AIX NFS clients. "
+ "The need for enabling this option often depends "
+ "on the usage of NLM."
+ },
+ { .key = {"nfs.mount-rmtab"},
+ .type = GF_OPTION_TYPE_PATH,
+ .default_value = DATADIR "/rmtab",
+ .description = "Set the location of the cache file that is used to "
+ "list all the NFS-clients that have connected "
+ "through the MOUNT protocol. If this is on shared "
+ "storage, all GlusterFS servers will update and "
+ "output (with 'showmount') the same list."
+ },
+ { .key = {OPT_SERVER_RPC_STATD},
+ .type = GF_OPTION_TYPE_PATH,
+ .default_value = GF_RPC_STATD_PROG,
+ .description = "The executable of RPC statd utility. "
+ "Defaults to " GF_RPC_STATD_PROG
+ },
+ { .key = {OPT_SERVER_RPC_STATD_PIDFILE},
+ .type = GF_OPTION_TYPE_PATH,
+ .default_value = GF_RPC_STATD_PIDFILE,
+ .description = "The pid file of RPC statd utility. "
+ "Defaults to " GF_RPC_STATD_PIDFILE
},
{ .key = {OPT_SERVER_AUX_GIDS},
.type = GF_OPTION_TYPE_BOOL,
@@ -1255,7 +1883,23 @@ struct volume_options options[] = {
.description = "Number of seconds to cache auxiliary-GID data, when "
OPT_SERVER_AUX_GIDS " is set."
},
-
+ { .key = {"nfs.acl"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "This option is used to control ACL support for NFS."
+ },
+ { .key = {"nfs.drc"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = "on",
+ .description = "Enable Duplicate Request Cache in gNFS server to "
+ "improve correctness of non-idempotent operations like "
+ "write, delete, link, et al"
+ },
+ { .key = {"nfs.drc-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "0x20000",
+ .description = "Sets the number of non-idempotent "
+ "requests to cache in drc"
+ },
{ .key = {NULL} },
};
-
diff --git a/xlators/nfs/server/src/nfs.h b/xlators/nfs/server/src/nfs.h
index c3deba00a..fc745fbbd 100644
--- a/xlators/nfs/server/src/nfs.h
+++ b/xlators/nfs/server/src/nfs.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef __NFS_H__
@@ -45,7 +36,7 @@
#define GF_NFS_MAX_MEMFACTOR 30
#define GF_NFS_DVM_ON 1
-#define GF_NFS_DVM_OFF 2
+#define GF_NFS_DVM_OFF 0
/* This corresponds to the max 16 number of group IDs that are sent through an
* RPC request. Since NFS is the only one going to set this, we can be safe
@@ -86,11 +77,22 @@ struct nfs_state {
unsigned int override_portnum;
int allow_insecure;
int enable_nlm;
+ int enable_acl;
int mount_udp;
+ char *rmtab;
struct rpc_clnt *rpc_clnt;
gf_boolean_t server_aux_gids;
uint32_t server_aux_gids_max_age;
gid_cache_t gid_cache;
+ uint32_t generation;
+ gf_boolean_t register_portmap;
+ char *rpc_statd;
+ char *rpc_statd_pid_file;
+};
+
+struct nfs_inode_ctx {
+ struct list_head shares;
+ uint32_t generation;
};
#define gf_nfs_dvm_on(nfsstt) (((struct nfs_state *)nfsstt)->dynamicvolumes == GF_NFS_DVM_ON)
diff --git a/xlators/nfs/server/src/nfs3-fh.c b/xlators/nfs/server/src/nfs3-fh.c
index eb8223924..e199c56dc 100644
--- a/xlators/nfs/server/src/nfs3-fh.c
+++ b/xlators/nfs/server/src/nfs3-fh.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -46,6 +37,12 @@ nfs3_fh_validate (struct nfs3_fh *fh)
if (fh->ident[1] != GF_NFSFH_IDENT1)
return 0;
+ if (fh->ident[2] != GF_NFSFH_IDENT2)
+ return 0;
+
+ if (fh->ident[3] != GF_NFSFH_IDENT3)
+ return 0;
+
return 1;
}
@@ -58,6 +55,8 @@ nfs3_fh_init (struct nfs3_fh *fh, struct iatt *buf)
fh->ident[0] = GF_NFSFH_IDENT0;
fh->ident[1] = GF_NFSFH_IDENT1;
+ fh->ident[2] = GF_NFSFH_IDENT2;
+ fh->ident[3] = GF_NFSFH_IDENT3;
uuid_copy (fh->gfid, buf->ia_gfid);
}
@@ -112,17 +111,17 @@ nfs3_fh_is_root_fh (struct nfs3_fh *fh)
void
-nfs3_fh_to_str (struct nfs3_fh *fh, char *str)
+nfs3_fh_to_str (struct nfs3_fh *fh, char *str, size_t len)
{
- char gfid[512];
- char exportid[512];
+ char gfid[GF_UUID_BUF_SIZE];
+ char exportid[GF_UUID_BUF_SIZE];
if ((!fh) || (!str))
return;
- sprintf (str, "FH: exportid %s, gfid %s",
- uuid_utoa_r (fh->exportid, exportid),
- uuid_utoa_r (fh->gfid, gfid));
+ snprintf (str, len, "FH: exportid %s, gfid %s",
+ uuid_utoa_r (fh->exportid, exportid),
+ uuid_utoa_r (fh->gfid, gfid));
}
void
@@ -161,6 +160,8 @@ nfs3_build_fh (inode_t *inode, uuid_t exportid, struct nfs3_fh *newfh)
newfh->ident[0] = GF_NFSFH_IDENT0;
newfh->ident[1] = GF_NFSFH_IDENT1;
+ newfh->ident[2] = GF_NFSFH_IDENT2;
+ newfh->ident[3] = GF_NFSFH_IDENT3;
uuid_copy (newfh->gfid, inode->gfid);
uuid_copy (newfh->exportid, exportid);
return 0;
diff --git a/xlators/nfs/server/src/nfs3-fh.h b/xlators/nfs/server/src/nfs3-fh.h
index a9002afe7..1049cdc96 100644
--- a/xlators/nfs/server/src/nfs3-fh.h
+++ b/xlators/nfs/server/src/nfs3-fh.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NFS_FH_H_
@@ -35,7 +26,9 @@
* handles for now. This will change if and when we need v4. */
#define GF_NFSFH_IDENT0 ':'
#define GF_NFSFH_IDENT1 'O'
-#define GF_NFSFH_IDENT_SIZE (sizeof(char) * 2)
+#define GF_NFSFH_IDENT2 'G'
+#define GF_NFSFH_IDENT3 'L'
+#define GF_NFSFH_IDENT_SIZE (sizeof(char) * 4)
#define GF_NFSFH_STATIC_SIZE (GF_NFSFH_IDENT_SIZE + (2*sizeof (uuid_t)))
#define nfs3_fh_exportid_to_index(exprtid) ((uint16_t)exprtid[15])
@@ -45,9 +38,9 @@
struct nfs3_fh {
/* Used to ensure that a bunch of bytes are actually a GlusterFS NFS
- * file handle. Should contain ":O"
+ * file handle. Should contain ":OGL"
*/
- char ident[2];
+ char ident[4];
/* UUID that identifies an export. The value stored in exportid
* depends on the usage of gluster nfs. If the DVM is enabled using
@@ -63,6 +56,11 @@ struct nfs3_fh {
/* File/dir gfid. */
uuid_t gfid;
+ /* This structure must be exactly NFS3_FHSIZE (64) bytes long.
+ Having the structure shorter results in buffer overflows
+ during XDR decoding.
+ */
+ unsigned char padding[NFS3_FHSIZE - GF_NFSFH_STATIC_SIZE];
} __attribute__((__packed__));
#define GF_NFS3FH_STATIC_INITIALIZER {{0},}
@@ -90,7 +88,7 @@ extern void
nfs3_log_fh (struct nfs3_fh *fh);
extern void
-nfs3_fh_to_str (struct nfs3_fh *fh, char *str);
+nfs3_fh_to_str (struct nfs3_fh *fh, char *str, size_t len);
extern int
nfs3_fh_build_parent_fh (struct nfs3_fh *child, struct iatt *newstat,
diff --git a/xlators/nfs/server/src/nfs3-helpers.c b/xlators/nfs/server/src/nfs3-helpers.c
index 253fe0754..f67cccf1a 100644
--- a/xlators/nfs/server/src/nfs3-helpers.c
+++ b/xlators/nfs/server/src/nfs3-helpers.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -94,7 +85,7 @@ struct nfs3stat_strerror nfs3stat_strerror_table[] = {
{ NFS3ERR_SERVERFAULT, "Error occurred on the server or IO Error" },
{ NFS3ERR_BADTYPE, "Type not supported by the server" },
{ NFS3ERR_JUKEBOX, "Cannot complete server initiated request" },
- { -1, "IO Error" },
+ { NFS3ERR_END_OF_LIST, "IO Error" },
};
@@ -265,6 +256,20 @@ nfs3_errno_to_nfsstat3 (int errnum)
return stat;
}
+/*
+ * Special case: If op_ret is -1, it's very unusual op_errno being
+ * 0 which means something came wrong from upper layer(s). If it
+ * happens by any means, then set NFS3 status to NFS3ERR_SERVERFAULT.
+ */
+inline nfsstat3
+nfs3_cbk_errno_status (int32_t op_ret, int32_t op_errno)
+{
+ if ((op_ret == -1) && (op_errno == 0)) {
+ return NFS3ERR_SERVERFAULT;
+ }
+
+ return nfs3_errno_to_nfsstat3 (op_errno);
+}
void
nfs3_fill_lookup3res_error (lookup3res *res, nfsstat3 stat,
@@ -284,6 +289,9 @@ nfs3_stat_to_fattr3 (struct iatt *buf)
{
fattr3 fa = {0, };
+ if (buf == NULL)
+ goto out;
+
if (IA_ISDIR (buf->ia_type))
fa.type = NF3DIR;
else if (IA_ISREG (buf->ia_type))
@@ -353,6 +361,7 @@ nfs3_stat_to_fattr3 (struct iatt *buf)
fa.mtime.seconds = buf->ia_mtime;
fa.mtime.nseconds = buf->ia_mtime_nsec;
+out:
return fa;
}
@@ -496,7 +505,7 @@ nfs3_fill_fsinfo3res (struct nfs3_state *nfs3, fsinfo3res *res,
resok.wtpref = nfs3->writesize;
resok.wtmult = GF_NFS3_WTMULT;
resok.dtpref = nfs3->readdirsize;
- resok.maxfilesize = GF_NFS3_MAXFILE;
+ resok.maxfilesize = GF_NFS3_MAXFILESIZE;
resok.time_delta = tdelta;
resok.properties = GF_NFS3_FS_PROP;
@@ -534,7 +543,7 @@ char *
nfsstat3_strerror(int stat)
{
int i;
- for(i = 0; nfs3stat_strerror_table[i].stat != -1; i++) {
+ for(i = 0; nfs3stat_strerror_table[i].stat != NFS3ERR_END_OF_LIST ; i++) {
if (nfs3stat_strerror_table[i].stat == stat)
return nfs3stat_strerror_table[i].strerror;
}
@@ -591,7 +600,8 @@ nfs3_request_to_accessbits (int32_t accbits)
return acc_request;
}
void
-nfs3_fill_access3res (access3res *res, nfsstat3 status, int32_t accbits)
+nfs3_fill_access3res (access3res *res, nfsstat3 status, int32_t accbits,
+ int32_t reqaccbits)
{
uint32_t accres = 0;
@@ -602,7 +612,8 @@ nfs3_fill_access3res (access3res *res, nfsstat3 status, int32_t accbits)
accres = nfs3_accessbits (accbits);
- res->access3res_u.resok.access = accres;
+ /* do not answer what was not asked */
+ res->access3res_u.resok.access = accres & reqaccbits;
}
void
@@ -1605,13 +1616,14 @@ err:
void
nfs3_stat_to_errstr (uint32_t xid, char *op, nfsstat3 stat, int pstat,
- char *errstr)
+ char *errstr, size_t len)
{
if ((!op) || (!errstr))
return;
- sprintf (errstr, "XID: %x, %s: NFS: %d(%s), POSIX: %d(%s)", xid, op,
- stat, nfsstat3_strerror (stat), pstat, strerror (pstat));
+ snprintf (errstr, len, "XID: %x, %s: NFS: %d(%s), POSIX: %d(%s)",
+ xid, op,stat, nfsstat3_strerror (stat), pstat,
+ strerror (pstat));
}
void
@@ -1619,10 +1631,10 @@ nfs3_log_common_call (uint32_t xid, char *op, struct nfs3_fh *fh)
{
char fhstr[1024];
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
- nfs3_fh_to_str (fh, fhstr);
+ nfs3_fh_to_str (fh, fhstr, sizeof (fhstr));
gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, %s: args: %s", xid, op,
fhstr);
}
@@ -1634,9 +1646,9 @@ nfs3_log_fh_entry_call (uint32_t xid, char *op, struct nfs3_fh *fh,
{
char fhstr[1024];
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
- nfs3_fh_to_str (fh, fhstr);
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
+ nfs3_fh_to_str (fh, fhstr, sizeof (fhstr));
gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, %s: args: %s, name: %s", xid,
op, fhstr, name);
}
@@ -1649,10 +1661,10 @@ nfs3_log_rename_call (uint32_t xid, struct nfs3_fh *src, char *sname,
char sfhstr[1024];
char dfhstr[1024];
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
- nfs3_fh_to_str (src, sfhstr);
- nfs3_fh_to_str (dst, dfhstr);
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
+ nfs3_fh_to_str (src, sfhstr, sizeof (sfhstr));
+ nfs3_fh_to_str (dst, dfhstr, sizeof (dfhstr));
gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, RENAME: args: Src: %s, "
"name: %s, Dst: %s, name: %s", xid, sfhstr, sname, dfhstr,
dname);
@@ -1670,9 +1682,9 @@ nfs3_log_create_call (uint32_t xid, struct nfs3_fh *fh, char *name,
char unchkd[] = "UNCHECKED";
char guarded[] = "GUARDED";
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
- nfs3_fh_to_str (fh, fhstr);
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
+ nfs3_fh_to_str (fh, fhstr, sizeof (fhstr));
if (mode == EXCLUSIVE)
modestr = exclmode;
else if (mode == GUARDED)
@@ -1695,9 +1707,9 @@ nfs3_log_mknod_call (uint32_t xid, struct nfs3_fh *fh, char *name, int type)
char sock[] = "SOCK";
char fifo[] = "FIFO";
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
- nfs3_fh_to_str (fh, fhstr);
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
+ nfs3_fh_to_str (fh, fhstr, sizeof (fhstr));
if (type == NF3CHR)
modestr = chr;
else if (type == NF3BLK)
@@ -1718,9 +1730,9 @@ nfs3_log_symlink_call (uint32_t xid, struct nfs3_fh *fh, char *name, char *tgt)
{
char fhstr[1024];
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
- nfs3_fh_to_str (fh, fhstr);
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
+ nfs3_fh_to_str (fh, fhstr, sizeof (fhstr));
gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, SYMLINK: args: %s, name: %s,"
" target: %s", xid, fhstr, name, tgt);
}
@@ -1733,10 +1745,10 @@ nfs3_log_link_call (uint32_t xid, struct nfs3_fh *fh, char *name,
char dfhstr[1024];
char tfhstr[1024];
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
- nfs3_fh_to_str (fh, dfhstr);
- nfs3_fh_to_str (tgt, tfhstr);
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
+ nfs3_fh_to_str (fh, dfhstr, sizeof (dfhstr));
+ nfs3_fh_to_str (tgt, tfhstr, sizeof (tfhstr));
gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, LINK: args: %s, name: %s,"
" target: %s", xid, dfhstr, name, tfhstr);
}
@@ -1748,9 +1760,9 @@ nfs3_log_rw_call (uint32_t xid, char *op, struct nfs3_fh *fh, offset3 offt,
{
char fhstr[1024];
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
- nfs3_fh_to_str (fh, fhstr);
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
+ nfs3_fh_to_str (fh, fhstr, sizeof (fhstr));
if (stablewrite == -1)
gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, %s: args: %s, offset:"
" %"PRIu64", count: %"PRIu32, xid, op, fhstr, offt,
@@ -3381,11 +3393,11 @@ void
nfs3_log_common_res (uint32_t xid, int op, nfsstat3 stat, int pstat)
{
char errstr[1024];
- int ll = nfs3_loglevel (op, stat);
+ int ll = nfs3_loglevel (op, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
- nfs3_stat_to_errstr (xid, nfs3op_strings[op].str, stat, pstat, errstr);
+ if (THIS->ctx->log.loglevel < ll)
+ return;
+ nfs3_stat_to_errstr (xid, nfs3op_strings[op].str, stat, pstat, errstr, sizeof (errstr));
gf_log (GF_NFS3, ll, "%s", errstr);
}
@@ -3393,14 +3405,14 @@ void
nfs3_log_readlink_res (uint32_t xid, nfsstat3 stat, int pstat, char *linkpath)
{
char errstr[1024];
- int ll = nfs3_loglevel (NFS3_READLINK, stat);
+ int ll = nfs3_loglevel (NFS3_READLINK, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
+ if (THIS->ctx->log.loglevel < ll)
+ return;
- nfs3_stat_to_errstr (xid, "READLINK", stat, pstat, errstr);
+ nfs3_stat_to_errstr (xid, "READLINK", stat, pstat, errstr, sizeof (errstr));
gf_log (GF_NFS3, ll, "%s, target: %s",
- errstr, linkpath);
+ errstr, linkpath);
}
@@ -3409,12 +3421,12 @@ nfs3_log_read_res (uint32_t xid, nfsstat3 stat, int pstat, count3 count,
int is_eof, struct iovec *vec, int32_t veccount)
{
char errstr[1024];
- int ll = GF_LOG_DEBUG;
+ int ll = GF_LOG_DEBUG;
ll = nfs3_loglevel (NFS3_READ, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
- nfs3_stat_to_errstr (xid, "READ", stat, pstat, errstr);
+ if (THIS->ctx->log.loglevel < ll)
+ return;
+ nfs3_stat_to_errstr (xid, "READ", stat, pstat, errstr, sizeof (errstr));
if (vec)
gf_log (GF_NFS3, ll, "%s, count: %"PRIu32", is_eof:"
" %d, vector: count: %d, len: %zd", errstr, count,
@@ -3430,12 +3442,12 @@ nfs3_log_write_res (uint32_t xid, nfsstat3 stat, int pstat, count3 count,
int stable, uint64_t wverf)
{
char errstr[1024];
- int ll = nfs3_loglevel (NFS3_WRITE, stat);
+ int ll = nfs3_loglevel (NFS3_WRITE, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
+ if (THIS->ctx->log.loglevel < ll)
+ return;
- nfs3_stat_to_errstr (xid, "WRITE", stat, pstat, errstr);
+ nfs3_stat_to_errstr (xid, "WRITE", stat, pstat, errstr, sizeof (errstr));
gf_log (GF_NFS3, ll, "%s, count: %"PRIu32", %s,wverf: %"PRIu64
, errstr, count, (stable == UNSTABLE)?"UNSTABLE":"STABLE",
wverf);
@@ -3448,12 +3460,12 @@ nfs3_log_newfh_res (uint32_t xid, int op, nfsstat3 stat, int pstat,
{
char errstr[1024];
char fhstr[1024];
- int ll = nfs3_loglevel (op, stat);
+ int ll = nfs3_loglevel (op, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
- nfs3_stat_to_errstr (xid, nfs3op_strings[op].str, stat, pstat, errstr);
- nfs3_fh_to_str (newfh, fhstr);
+ if (THIS->ctx->log.loglevel < ll)
+ return;
+ nfs3_stat_to_errstr (xid, nfs3op_strings[op].str, stat, pstat, errstr, sizeof (errstr));
+ nfs3_fh_to_str (newfh, fhstr, sizeof (fhstr));
gf_log (GF_NFS3, nfs3_loglevel (op, stat), "%s, %s", errstr, fhstr);
}
@@ -3464,11 +3476,11 @@ nfs3_log_readdir_res (uint32_t xid, nfsstat3 stat, int pstat, uint64_t cverf,
count3 count, int is_eof)
{
char errstr[1024];
- int ll = nfs3_loglevel (NFS3_READDIR, stat);
+ int ll = nfs3_loglevel (NFS3_READDIR, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
- nfs3_stat_to_errstr (xid, "READDIR", stat, pstat, errstr);
+ if (THIS->ctx->log.loglevel < ll)
+ return;
+ nfs3_stat_to_errstr (xid, "READDIR", stat, pstat, errstr, sizeof (errstr));
gf_log (GF_NFS3, ll, "%s, count: %"PRIu32", cverf: %"PRIu64
", is_eof: %d", errstr, count, cverf, is_eof);
}
@@ -3481,9 +3493,9 @@ nfs3_log_readdirp_res (uint32_t xid, nfsstat3 stat, int pstat, uint64_t cverf,
char errstr[1024];
int ll = nfs3_loglevel (NFS3_READDIRP, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
- nfs3_stat_to_errstr (xid, "READDIRPLUS", stat, pstat, errstr);
+ if (THIS->ctx->log.loglevel < ll)
+ return;
+ nfs3_stat_to_errstr (xid, "READDIRPLUS", stat, pstat, errstr, sizeof (errstr));
gf_log (GF_NFS3, ll, "%s, dircount: %"PRIu32", maxcount: %"
PRIu32", cverf: %"PRIu64", is_eof: %d", errstr, dircount,
maxcount, cverf, is_eof);
@@ -3496,9 +3508,9 @@ nfs3_log_commit_res (uint32_t xid, nfsstat3 stat, int pstat, uint64_t wverf)
char errstr[1024];
int ll = nfs3_loglevel (NFS3_COMMIT, stat);
- if (THIS->ctx->log.loglevel < ll)
- return;
- nfs3_stat_to_errstr (xid, "COMMIT", stat, pstat, errstr);
+ if (THIS->ctx->log.loglevel < ll)
+ return;
+ nfs3_stat_to_errstr (xid, "COMMIT", stat, pstat, errstr, sizeof (errstr));
gf_log (GF_NFS3, ll, "%s, wverf: %"PRIu64, errstr, wverf);
}
@@ -3509,10 +3521,10 @@ nfs3_log_readdir_call (uint32_t xid, struct nfs3_fh *fh, count3 dircount,
{
char fhstr[1024];
- if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
- return;
+ if (THIS->ctx->log.loglevel < GF_LOG_DEBUG)
+ return;
- nfs3_fh_to_str (fh, fhstr);
+ nfs3_fh_to_str (fh, fhstr, sizeof (fhstr));
if (maxcount == 0)
gf_log (GF_NFS3, GF_LOG_DEBUG, "XID: %x, READDIR: args: %s,"
@@ -3575,7 +3587,8 @@ nfs3_fh_resolve_entry_lookup_cbk (call_frame_t *frame, void *cookie,
cs->resolvedloc.name, buf);
if (linked_inode) {
inode_lookup (linked_inode);
- inode_unref (linked_inode);
+ inode_unref (cs->resolvedloc.inode);
+ cs->resolvedloc.inode = linked_inode;
}
err:
nfs3_call_resume (cs);
diff --git a/xlators/nfs/server/src/nfs3-helpers.h b/xlators/nfs/server/src/nfs3-helpers.h
index 67935d143..eada24221 100644
--- a/xlators/nfs/server/src/nfs3-helpers.h
+++ b/xlators/nfs/server/src/nfs3-helpers.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NFS3_HELPER_H_
@@ -44,6 +35,9 @@ nfs3_extract_lookup_name (lookup3args *args);
extern nfsstat3
nfs3_errno_to_nfsstat3 (int errnum);
+extern nfsstat3
+nfs3_cbk_errno_status (int32_t, int32_t);
+
extern void
nfs3_fill_lookup3res (lookup3res *res, nfsstat3 stat, struct nfs3_fh *newfh,
struct iatt *stbuf, struct iatt *postparent,
@@ -99,7 +93,8 @@ extern void
nfs3_prep_access3args (access3args *args, struct nfs3_fh *fh);
extern void
-nfs3_fill_access3res (access3res *res, nfsstat3 status, int32_t accbits);
+nfs3_fill_access3res (access3res *res, nfsstat3 status, int32_t accbits,
+ int32_t reqaccbits);
extern char *
nfs3_fhcache_getpath (struct nfs3_state *nfs3, struct nfs3_fh *fh);
@@ -339,4 +334,7 @@ nfs3_is_parentdir_entry (char *entry);
uint32_t
nfs3_request_to_accessbits (int32_t accbits);
+void
+nfs3_map_deviceid_to_statdev (struct iatt *ia, uint64_t deviceid);
+
#endif
diff --git a/xlators/nfs/server/src/nfs3.c b/xlators/nfs/server/src/nfs3.c
index 1c3799972..6361f9e20 100644
--- a/xlators/nfs/server/src/nfs3.c
+++ b/xlators/nfs/server/src/nfs3.c
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -32,6 +23,7 @@
#include "nfs3.h"
#include "mem-pool.h"
#include "logging.h"
+#include "nfs-common.h"
#include "nfs-fops.h"
#include "nfs-inodes.h"
#include "nfs-generics.h"
@@ -225,32 +217,32 @@ out:
uuid_unparse (handle->exportid, exportid); \
uuid_unparse (handle->gfid, gfid); \
trans = rpcsvc_request_transport (req); \
- gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to map " \
- "FH to vol: client=%s, exportid=%s, gfid=%s",\
- trans->peerinfo.identifier, exportid, \
- gfid); \
- gf_log (GF_NFS3, GF_LOG_ERROR, \
- "Stale nfs client %s must be trying to "\
- "connect to a deleted volume, please " \
- "unmount it.", trans->peerinfo.identifier);\
+ GF_LOG_OCCASIONALLY (nfs3state->occ_logger, \
+ GF_NFS3, GF_LOG_ERROR, "Failed to map " \
+ "FH to vol: client=%s, exportid=%s, " \
+ "gfid=%s", trans->peerinfo.identifier, \
+ exportid, gfid); \
+ GF_LOG_OCCASIONALLY (nfs3state->occ_logger, \
+ GF_NFS3, GF_LOG_ERROR, "Stale nfs " \
+ "client %s must be trying to connect to"\
+ " a deleted volume, please unmount it.",\
+ trans->peerinfo.identifier); \
status = NFS3ERR_STALE; \
goto label; \
} else { \
- gf_log (GF_NFS3, GF_LOG_TRACE, "FH to Volume: %s"\
- ,volume->name); \
- rpcsvc_request_set_private (req, volume); \
+ gf_log (GF_NFS3, GF_LOG_TRACE, "FH to Volume:" \
+ "%s", volume->name); \
+ rpcsvc_request_set_private (req, volume); \
} \
} while (0); \
#define nfs3_validate_gluster_fh(handle, status, errlabel) \
do { \
- if ((handle)) { \
- if (!nfs3_fh_validate (handle)) { \
- gf_log (GF_NFS3, GF_LOG_ERROR, "Bad Handle");\
- status = NFS3ERR_BADHANDLE; \
- goto errlabel; \
- } \
+ if (!nfs3_fh_validate (handle)) { \
+ gf_log (GF_NFS3, GF_LOG_ERROR, "Bad Handle"); \
+ status = NFS3ERR_BADHANDLE; \
+ goto errlabel; \
} \
} while (0) \
@@ -264,10 +256,12 @@ out:
xlatorp = nfs3_fh_to_xlator (cst->nfs3state, \
&cst->resolvefh); \
uuid_unparse (cst->resolvefh.gfid, gfid); \
- sprintf (buf, "(%s) %s : %s", trans->peerinfo.identifier,\
- xlatorp ? xlatorp->name : "ERR", gfid); \
- gf_log (GF_NFS3, GF_LOG_ERROR, "Unable to resolve FH"\
- ": %s", buf); \
+ snprintf (buf, sizeof (buf), "(%s) %s : %s", \
+ trans->peerinfo.identifier, \
+ xlatorp ? xlatorp->name : "ERR", \
+ gfid ); \
+ gf_log (GF_NFS3, GF_LOG_ERROR, "%s: %s", \
+ strerror(cst->resolve_errno), buf); \
nfstat = nfs3_errno_to_nfsstat3 (cst->resolve_errno);\
goto erlabl; \
} \
@@ -284,10 +278,12 @@ out:
xlatorp = nfs3_fh_to_xlator (cst->nfs3state, \
&cst->resolvefh); \
uuid_unparse (cst->resolvefh.gfid, gfid); \
- sprintf (buf, "(%s) %s : %s", trans->peerinfo.identifier,\
- xlatorp ? xlatorp->name : "ERR", gfid); \
- gf_log (GF_NFS3, GF_LOG_ERROR, "Unable to resolve FH"\
- ": %s", buf); \
+ snprintf (buf, sizeof (buf), "(%s) %s : %s", \
+ trans->peerinfo.identifier, \
+ xlatorp ? xlatorp->name : "ERR", \
+ gfid); \
+ gf_log (GF_NFS3, GF_LOG_ERROR, "%s: %s", \
+ strerror(cst->resolve_errno), buf); \
nfstat = nfs3_errno_to_nfsstat3 (cs->resolve_errno);\
goto erlabl; \
} \
@@ -553,23 +549,18 @@ nfs3svc_submit_reply (rpcsvc_request_t *req, void *arg, nfs3_serializer sfunc)
iobref = iobref_new ();
if (!iobref) {
- iobuf_unref (iob);
gf_log (GF_NFS3, GF_LOG_ERROR, "failed on iobref_new()");
goto ret;
}
- iobref_add (iobref, iob);
+ ret = iobref_add (iobref, iob);
+ if (ret) {
+ gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to add iob to iobref");
+ goto ret;
+ }
/* Then, submit the message for transmission. */
ret = rpcsvc_submit_message (req, &outmsg, 1, NULL, 0, iobref);
-
- /* Now that we've done our job of handing the message to the RPC layer
- * we can safely unref the iob in the hope that RPC layer must have
- * ref'ed the iob on receiving into the txlist.
- */
- iobuf_unref (iob);
- iobref_unref (iobref);
-
if (ret == -1) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Reply submission failed");
goto ret;
@@ -577,6 +568,14 @@ nfs3svc_submit_reply (rpcsvc_request_t *req, void *arg, nfs3_serializer sfunc)
ret = 0;
ret:
+ /* Now that we've done our job of handing the message to the RPC layer
+ * we can safely unref the iob in the hope that RPC layer must have
+ * ref'ed the iob on receiving into the txlist.
+ */
+ if (NULL != iob)
+ iobuf_unref (iob);
+ if (NULL != iobref)
+ iobref_unref (iobref);
return ret;
}
@@ -608,19 +607,14 @@ nfs3svc_submit_vector_reply (rpcsvc_request_t *req, void *arg,
new_iobref = 1;
}
- iobref_add (iobref, iob);
+ ret = iobref_add (iobref, iob);
+ if (ret) {
+ gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to add iob to iobref");
+ goto ret;
+ }
/* Then, submit the message for transmission. */
ret = rpcsvc_submit_message (req, &outmsg, 1, payload, vcount, iobref);
-
- /* Now that we've done our job of handing the message to the RPC layer
- * we can safely unref the iob in the hope that RPC layer must have
- * ref'ed the iob on receiving into the txlist.
- */
- iobuf_unref (iob);
- if (new_iobref)
- iobref_unref (iobref);
-
if (ret == -1) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Reply submission failed");
goto ret;
@@ -628,6 +622,14 @@ nfs3svc_submit_vector_reply (rpcsvc_request_t *req, void *arg,
ret = 0;
ret:
+ /* Now that we've done our job of handing the message to the RPC layer
+ * we can safely unref the iob in the hope that RPC layer must have
+ * ref'ed the iob on receiving into the txlist.
+ */
+ if (NULL != iob)
+ iobuf_unref (iob);
+ if (new_iobref)
+ iobref_unref (iobref);
return ret;
}
@@ -692,11 +694,20 @@ nfs3svc_getattr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cs = frame->local;
+ /*
+ * Somewhat counter-intuitively, we don't need to look for sh-failed
+ * here. Failing this getattr will generate a new lookup from the
+ * client, and nfs_fop_lookup_cbk will detect any self-heal failures.
+ */
+
if (op_ret == -1) {
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- status = nfs3_errno_to_nfsstat3 (op_errno);
+ status = nfs3_cbk_errno_status (op_ret, op_errno);
+ }
+ else {
+ nfs_fix_generation(this,inode);
}
nfs3_log_common_res (rpcsvc_request_xid (cs->req), NFS3_GETATTR,
@@ -723,7 +734,7 @@ nfs3svc_getattr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- status = nfs3_errno_to_nfsstat3 (op_errno);
+ status = nfs3_cbk_errno_status (op_ret, op_errno);
}
nfs3_log_common_res (rpcsvc_request_xid (cs->req), NFS3_GETATTR,
@@ -743,6 +754,9 @@ nfs3_getattr_resume (void *carg)
int ret = -EFAULT;
nfs_user_t nfu = {0, };
nfs3_call_state_t *cs = NULL;
+ uint64_t raw_ctx = 0;
+ struct nfs_inode_ctx *ictx = NULL;
+ struct nfs_state *priv = NULL;
if (!carg)
return ret;
@@ -769,9 +783,28 @@ nfs3_getattr_resume (void *carg)
goto nfs3err;
}
+ /*
+ * If brick state changed, we need to force a proper lookup cycle (as
+ * would happen in native protocol) to do self-heal checks. We detect
+ * this by comparing the generation number for the last successful
+ * creation/lookup on the inode to the current number, so inodes that
+ * haven't been validated since the state change are affected.
+ */
+ if (inode_ctx_get(cs->resolvedloc.inode,cs->nfsx,&raw_ctx) == 0) {
+ ictx = (struct nfs_inode_ctx *)raw_ctx;
+ priv = cs->nfsx->private;
+ if (ictx->generation != priv->generation) {
+ ret = nfs_lookup (cs->nfsx, cs->vol, &nfu,
+ &cs->resolvedloc,
+ nfs3svc_getattr_lookup_cbk, cs);
+ goto check_err;
+ }
+ }
+
ret = nfs_stat (cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
nfs3svc_getattr_stat_cbk, cs);
+check_err:
if (ret < 0) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Stat fop failed: %s: %s",
cs->oploc.path, strerror (-ret));
@@ -885,7 +918,7 @@ nfs3svc_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -925,7 +958,7 @@ nfs3svc_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -939,7 +972,8 @@ nfs3svc_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
* truncation and also only if this is not a directory.
*/
if ((gf_attr_size_set (cs->setattr_valid)) &&
- (!IA_ISDIR (postop->ia_type))) {
+ (!IA_ISDIR (postop->ia_type)) &&
+ (preop->ia_size != cs->stbuf.ia_size)) {
nfs_request_user_init (&nfu, cs->req);
ret = nfs_truncate (cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
cs->stbuf.ia_size, nfs3svc_truncate_cbk,cs);
@@ -980,7 +1014,7 @@ nfs3svc_setattr_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -1194,7 +1228,7 @@ nfs3svc_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
(op_errno == ENOENT ? GF_LOG_TRACE : GF_LOG_WARNING),
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- status = nfs3_errno_to_nfsstat3 (op_errno);
+ status = nfs3_cbk_errno_status (op_ret, op_errno);
goto xmit_res;
}
@@ -1238,7 +1272,7 @@ nfs3svc_lookup_parentdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- status = nfs3_errno_to_nfsstat3 (op_errno);
+ status = nfs3_cbk_errno_status (op_ret, op_errno);
goto xmit_res;
}
@@ -1280,7 +1314,11 @@ nfs3_lookup_parentdir_resume (void *carg)
nfs3_call_state_t *cs = NULL;
inode_t *parent = NULL;
- GF_VALIDATE_OR_GOTO (GF_NFS3, carg, nfs3err);
+ if (!carg) {
+ gf_log (GF_NFS3, GF_LOG_ERROR, "Invalid argument,"
+ " carg value NULL");
+ return EINVAL;
+ }
cs = (nfs3_call_state_t *)carg;
nfs3_check_fh_resolve_status (cs, stat, nfs3err);
@@ -1351,7 +1389,11 @@ nfs3_lookup_resume (void *carg)
nfs3_call_state_t *cs = NULL;
struct nfs3_fh newfh = {{0},};
- GF_VALIDATE_OR_GOTO (GF_NFS3, carg, nfs3err);
+ if (!carg) {
+ gf_log (GF_NFS3, GF_LOG_ERROR, "Invalid argument,"
+ " carg value NULL");
+ return EINVAL;
+ }
cs = (nfs3_call_state_t *)carg;
nfs3_check_fh_resolve_status (cs, stat, nfs3err);
@@ -1463,11 +1505,12 @@ rpcerr:
int
-nfs3_access_reply (rpcsvc_request_t *req, nfsstat3 status, int32_t accbits)
+nfs3_access_reply (rpcsvc_request_t *req, nfsstat3 status, int32_t accbits,
+ int32_t reqaccbits)
{
access3res res;
- nfs3_fill_access3res (&res, status, accbits);
+ nfs3_fill_access3res (&res, status, accbits, reqaccbits);
nfs3svc_submit_reply (req, &res,
(nfs3_serializer)xdr_serialize_access3res);
return 0;
@@ -1487,11 +1530,11 @@ nfs3svc_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- status = nfs3_errno_to_nfsstat3 (op_errno);
+ status = nfs3_cbk_errno_status (op_ret, op_errno);
}
nfs3_log_common_res (rpcsvc_request_xid (cs->req), NFS3_ACCESS, status,
op_errno);
- nfs3_access_reply (cs->req, status, op_errno);
+ nfs3_access_reply (cs->req, status, op_errno, cs->accessbits);
nfs3_call_state_wipe (cs);
return 0;
@@ -1505,7 +1548,11 @@ nfs3_access_resume (void *carg)
nfs_user_t nfu = {0, };
nfs3_call_state_t *cs = NULL;
- GF_VALIDATE_OR_GOTO (GF_NFS3, carg, nfs3err);
+ if (!carg) {
+ gf_log (GF_NFS3, GF_LOG_ERROR, "Invalid argument,"
+ " carg value NULL");
+ return EINVAL;
+ }
cs = (nfs3_call_state_t *)carg;
nfs3_check_fh_resolve_status (cs, stat, nfs3err);
@@ -1520,7 +1567,7 @@ nfs3err:
if (ret < 0) {
nfs3_log_common_res (rpcsvc_request_xid (cs->req), NFS3_ACCESS,
stat, -ret);
- nfs3_access_reply (cs->req, stat, 0);
+ nfs3_access_reply (cs->req, stat, 0, 0);
nfs3_call_state_wipe (cs);
ret = 0;
}
@@ -1556,7 +1603,7 @@ nfs3err:
if (ret < 0) {
nfs3_log_common_res (rpcsvc_request_xid (req), NFS3_ACCESS,
stat, -ret);
- nfs3_access_reply (req, stat, 0);
+ nfs3_access_reply (req, stat, 0, 0);
nfs3_call_state_wipe (cs);
ret = 0;
}
@@ -1623,7 +1670,7 @@ nfs3svc_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -1790,7 +1837,7 @@ nfs3svc_read_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto err;
} else
stat = NFS3_OK;
@@ -1976,7 +2023,7 @@ nfs3svc_write_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
} else
stat = NFS3_OK;
@@ -1989,41 +2036,6 @@ nfs3svc_write_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
-/*
- * If this logic determines that the write should return a reply to the client
- * after this function, the return value is -1 and the writetype is reset to
- * the type of write we want to signify to the client.
- *
- * In case the write should continue to serve the request according to the type
- * of stable write, a 0 is returned and writetype is left as it is.
- */
-int
-nfs3_write_how (int *writetype, int write_trusted, int sync_trusted)
-{
- int ret = -1;
-
- if (*writetype == UNSTABLE) {
- /* On an UNSTABLE write, only return STABLE when trusted-write
- * is set. TW is also set when trusted-sync is set.
- */
- if (write_trusted)
- *writetype = FILE_SYNC;
-
- goto err;
- } else if ((*writetype == DATA_SYNC) || (*writetype == FILE_SYNC)) {
-
- /* On a STABLE write, if sync-trusted is on, only then, return
- * without syncing.
- */
- if (sync_trusted)
- goto err;
- }
-
- ret = 0;
-err:
- return ret;
-}
-
/*
* Before going into the write reply logic, here is a matrix that shows the
@@ -2062,12 +2074,8 @@ nfs3svc_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *postbuf, dict_t *xdata)
{
nfsstat3 stat = NFS3ERR_SERVERFAULT;
- int ret = -EFAULT;
- nfs_user_t nfu = {0, };
nfs3_call_state_t *cs = NULL;
struct nfs3_state *nfs3 = NULL;
- int write_trusted = 0;
- int sync_trusted = 0;
cs = frame->local;
nfs3 = rpcsvc_request_program_private (cs->req);
@@ -2075,42 +2083,21 @@ nfs3svc_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto err;
}
stat = NFS3_OK;
cs->maxcount = op_ret;
- write_trusted = nfs3_export_write_trusted (cs->nfs3state,
- cs->resolvefh.exportid);
- sync_trusted = nfs3_export_sync_trusted (cs->nfs3state,
- cs->resolvefh.exportid);
- ret = nfs3_write_how (&cs->writetype, write_trusted, sync_trusted);
- if (ret == -1)
- goto err;
-
- nfs_request_user_init (&nfu, cs->req);
- /* Store the current preattr so that this can be used as the pre attr
- * when fsync returns. We dont want to use the preattr in fsync because
- * the write fop happened before the fsync.
- */
- cs->stbuf = *prebuf;
- ret = nfs_fsync (cs->nfsx, cs->vol, &nfu, cs->fd, 0,
- nfs3svc_write_fsync_cbk, cs);
- if (ret < 0)
- stat = nfs3_errno_to_nfsstat3 (-ret);
-
err:
- if (ret < 0) {
- nfs3_log_write_res (rpcsvc_request_xid (cs->req), stat,
- op_errno, cs->maxcount, cs->writetype,
- nfs3->serverstart);
- nfs3_write_reply (cs->req, stat, cs->maxcount,
- cs->writetype, nfs3->serverstart, prebuf,
- postbuf);
- nfs3_call_state_wipe (cs);
- }
+ nfs3_log_write_res (rpcsvc_request_xid (cs->req), stat,
+ op_errno, cs->maxcount, cs->writetype,
+ nfs3->serverstart);
+ nfs3_write_reply (cs->req, stat, cs->maxcount,
+ cs->writetype, nfs3->serverstart, prebuf,
+ postbuf);
+ nfs3_call_state_wipe (cs);
return 0;
}
@@ -2166,6 +2153,25 @@ nfs3_write_resume (void *carg)
}
cs->fd = fd; /* Gets unrefd when the call state is wiped. */
+
+/*
+ enum stable_how {
+ UNSTABLE = 0,
+ DATA_SYNC = 1,
+ FILE_SYNC = 2,
+ };
+*/
+ switch (cs->writetype) {
+ case UNSTABLE:
+ break;
+ case DATA_SYNC:
+ fd->flags |= O_DSYNC;
+ break;
+ case FILE_SYNC:
+ fd->flags |= O_SYNC;
+ break;
+ }
+
ret = __nfs3_write_resume (cs);
if (ret < 0)
stat = nfs3_errno_to_nfsstat3 (-ret);
@@ -2329,7 +2335,7 @@ nfs3svc_create_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -2362,7 +2368,7 @@ nfs3svc_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -2467,7 +2473,7 @@ nfs3svc_create_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
ret = -op_errno;
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -2638,13 +2644,7 @@ nfs3svc_create (rpcsvc_request_t *req)
}
cval = (uint64_t *)args.how.createhow3_u.verf;
- if (cval)
- cverf = *cval;
- else {
- gf_log(GF_NFS3, GF_LOG_ERROR,
- "Error getting createverf3 from args");
- goto rpcerr;
- }
+ cverf = *cval;
ret = nfs3_create (req, &dirfh, name, args.how.mode,
&args.how.createhow3_u.obj_attributes, cverf);
@@ -2688,7 +2688,7 @@ nfs3svc_mkdir_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -2720,7 +2720,7 @@ nfs3svc_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -2898,7 +2898,7 @@ nfs3svc_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -3060,7 +3060,7 @@ nfs3svc_mknod_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -3092,7 +3092,7 @@ nfs3svc_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -3350,7 +3350,7 @@ nfs3svc_remove_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
}
if (op_ret == 0)
@@ -3516,7 +3516,7 @@ nfs3svc_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
} else {
stat = NFS3_OK;
}
@@ -3671,7 +3671,7 @@ nfs3svc_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"%x: rename %s -> %s => -1 (%s)",
rpcsvc_request_xid (cs->req), cs->oploc.path,
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -3876,7 +3876,7 @@ nfs3svc_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"%x: link %s <- %s => -1 (%s)",
rpcsvc_request_xid (cs->req), cs->oploc.path,
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
} else
stat = NFS3_OK;
@@ -4086,7 +4086,7 @@ nfs3svc_readdir_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto nfs3err;
}
@@ -4141,7 +4141,7 @@ nfs3svc_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto err;
}
@@ -4238,12 +4238,30 @@ nfs3err:
}
+int32_t
+nfs3svc_readdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata)
+{
+ /*
+ * We don't really need this, it's just an artifact of forcing the
+ * opendir to happen.
+ */
+ if (fd) {
+ fd_unref(fd);
+ }
+
+ return 0;
+}
+
+
int
nfs3_readdir_open_resume (void *carg)
{
nfsstat3 stat = NFS3ERR_SERVERFAULT;
int ret = -EFAULT;
nfs3_call_state_t *cs = NULL;
+ nfs_user_t nfu = {0, };
if (!carg)
return ret;
@@ -4256,6 +4274,22 @@ nfs3_readdir_open_resume (void *carg)
goto nfs3err;
}
+ /*
+ * NFS client will usually send us a readdirp without an opendir,
+ * which would cause us to skip our usual self-heal checks which occur
+ * in opendir for native protocol. To make sure those checks do happen,
+ * our most reliable option is to do our own opendir for any readdirp
+ * at the beginning of the directory.
+ */
+ if (cs->cookie == 0) {
+ nfs_request_user_init (&nfu, cs->req);
+ ret = nfs_opendir (cs->nfsx, cs->vol, &nfu, &cs->resolvedloc,
+ nfs3svc_readdir_opendir_cbk, cs);
+ if (ret < 0) {
+ gf_log (GF_NFS3, GF_LOG_ERROR, "auto-opendir failed");
+ }
+ }
+
ret = nfs3_readdir_read_resume (cs);
if (ret < 0)
stat = nfs3_errno_to_nfsstat3 (-ret);
@@ -4355,16 +4389,8 @@ nfs3svc_readdir (rpcsvc_request_t *req)
rpcsvc_request_seterr (req, GARBAGE_ARGS);
goto rpcerr;
}
-
cval = (uint64_t *) ra.cookieverf;
-
- if (cval)
- verf = *cval;
- else {
- gf_log(GF_NFS3, GF_LOG_ERROR,
- "Error getting cookieverf from readdir args");
- goto rpcerr;
- }
+ verf = *cval;
ret = nfs3_readdir (req, &fh, ra.cookie, verf, ra.count, 0);
if ((ret < 0) && (ret != RPCSVC_ACTOR_IGNORE)) {
@@ -4395,16 +4421,8 @@ nfs3svc_readdirp (rpcsvc_request_t *req)
rpcsvc_request_seterr (req, GARBAGE_ARGS);
goto rpcerr;
}
-
cval = (uint64_t *) ra.cookieverf;
-
- if (cval)
- cverf = *cval;
- else {
- gf_log (GF_NFS3, GF_LOG_ERROR,
- "Error getting cookieverf from readdirp args");
- goto rpcerr;
- }
+ cverf = *cval;
ret = nfs3_readdir (req, &fh, ra.cookie, cverf, ra.dircount,
ra.maxcount);
@@ -4447,7 +4465,7 @@ nfs3_fsstat_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
} else
stat = NFS3_OK;
@@ -4475,7 +4493,7 @@ nfs3_fsstat_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
ret = -op_errno;
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
goto err;
}
@@ -4634,7 +4652,7 @@ nfs3svc_fsinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- status = nfs3_errno_to_nfsstat3 (op_errno);
+ status = nfs3_cbk_errno_status (op_ret, op_errno);
}else
status = NFS3_OK;
@@ -4776,7 +4794,7 @@ nfs3svc_pathconf_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
} else {
/* If stat fop failed, we can still send the other components
* in a pathconf reply.
@@ -4920,7 +4938,7 @@ nfs3svc_commit_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_log (GF_NFS, GF_LOG_WARNING,
"%x: %s => -1 (%s)", rpcsvc_request_xid (cs->req),
cs->resolvedloc.path, strerror (op_errno));
- stat = nfs3_errno_to_nfsstat3 (op_errno);
+ stat = nfs3_cbk_errno_status (op_ret, op_errno);
} else
stat = NFS3_OK;
@@ -5081,28 +5099,28 @@ rpcerr:
rpcsvc_actor_t nfs3svc_actors[NFS3_PROC_COUNT] = {
- {"NULL", NFS3_NULL, nfs3svc_null, NULL, 0},
- {"GETATTR", NFS3_GETATTR, nfs3svc_getattr,NULL, 0},
- {"SETATTR", NFS3_SETATTR, nfs3svc_setattr,NULL, 0},
- {"LOOKUP", NFS3_LOOKUP, nfs3svc_lookup, NULL, 0},
- {"ACCESS", NFS3_ACCESS, nfs3svc_access, NULL, 0},
- {"READLINK", NFS3_READLINK, nfs3svc_readlink,NULL, 0},
- {"READ", NFS3_READ, nfs3svc_read, NULL, 0},
- {"WRITE", NFS3_WRITE, nfs3svc_write, nfs3svc_write_vecsizer, 0},
- {"CREATE", NFS3_CREATE, nfs3svc_create, NULL, 0},
- {"MKDIR", NFS3_MKDIR, nfs3svc_mkdir, NULL, 0},
- {"SYMLINK", NFS3_SYMLINK, nfs3svc_symlink,NULL, 0},
- {"MKNOD", NFS3_MKNOD, nfs3svc_mknod, NULL, 0},
- {"REMOVE", NFS3_REMOVE, nfs3svc_remove, NULL, 0},
- {"RMDIR", NFS3_RMDIR, nfs3svc_rmdir, NULL, 0},
- {"RENAME", NFS3_RENAME, nfs3svc_rename, NULL, 0},
- {"LINK", NFS3_LINK, nfs3svc_link, NULL, 0},
- {"READDIR", NFS3_READDIR, nfs3svc_readdir,NULL, 0},
- {"READDIRPLUS", NFS3_READDIRP, nfs3svc_readdirp,NULL, 0},
- {"FSSTAT", NFS3_FSSTAT, nfs3svc_fsstat, NULL, 0},
- {"FSINFO", NFS3_FSINFO, nfs3svc_fsinfo, NULL, 0},
- {"PATHCONF", NFS3_PATHCONF, nfs3svc_pathconf,NULL, 0},
- {"COMMIT", NFS3_COMMIT, nfs3svc_commit, NULL, 0}
+ {"NULL", NFS3_NULL, nfs3svc_null, NULL, 0, DRC_IDEMPOTENT},
+ {"GETATTR", NFS3_GETATTR, nfs3svc_getattr, NULL, 0, DRC_IDEMPOTENT},
+ {"SETATTR", NFS3_SETATTR, nfs3svc_setattr, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"LOOKUP", NFS3_LOOKUP, nfs3svc_lookup, NULL, 0, DRC_IDEMPOTENT},
+ {"ACCESS", NFS3_ACCESS, nfs3svc_access, NULL, 0, DRC_IDEMPOTENT},
+ {"READLINK", NFS3_READLINK, nfs3svc_readlink, NULL, 0, DRC_IDEMPOTENT},
+ {"READ", NFS3_READ, nfs3svc_read, NULL, 0, DRC_IDEMPOTENT},
+ {"WRITE", NFS3_WRITE, nfs3svc_write, nfs3svc_write_vecsizer, 0, DRC_NON_IDEMPOTENT},
+ {"CREATE", NFS3_CREATE, nfs3svc_create, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"MKDIR", NFS3_MKDIR, nfs3svc_mkdir, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"SYMLINK", NFS3_SYMLINK, nfs3svc_symlink, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"MKNOD", NFS3_MKNOD, nfs3svc_mknod, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"REMOVE", NFS3_REMOVE, nfs3svc_remove, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"RMDIR", NFS3_RMDIR, nfs3svc_rmdir, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"RENAME", NFS3_RENAME, nfs3svc_rename, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"LINK", NFS3_LINK, nfs3svc_link, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"READDIR", NFS3_READDIR, nfs3svc_readdir, NULL, 0, DRC_IDEMPOTENT},
+ {"READDIRPLUS", NFS3_READDIRP, nfs3svc_readdirp, NULL, 0, DRC_IDEMPOTENT},
+ {"FSSTAT", NFS3_FSSTAT, nfs3svc_fsstat, NULL, 0, DRC_IDEMPOTENT},
+ {"FSINFO", NFS3_FSINFO, nfs3svc_fsinfo, NULL, 0, DRC_IDEMPOTENT},
+ {"PATHCONF", NFS3_PATHCONF, nfs3svc_pathconf, NULL, 0, DRC_IDEMPOTENT},
+ {"COMMIT", NFS3_COMMIT, nfs3svc_commit, NULL, 0, DRC_IDEMPOTENT}
};
@@ -5119,21 +5137,48 @@ rpcsvc_program_t nfs3prog = {
.min_auth = AUTH_NULL,
};
+/*
+ * This function rounds up the input value to multiple of 4096. Min and Max
+ * supported I/O size limits are 4KB (GF_NFS3_FILE_IO_SIZE_MIN) and
+ * 1MB (GF_NFS3_FILE_IO_SIZE_MAX).
+ */
+void
+nfs3_iosize_roundup_4KB (uint64_t *ioszptr)
+{
+ uint64_t iosize;
+ uint64_t iopages;
+
+ if (!ioszptr)
+ return;
+
+ iosize = *ioszptr;
+ iopages = (iosize + GF_NFS3_IO_SIZE -1) >> GF_NFS3_IO_SHIFT;
+ iosize = (iopages * GF_NFS3_IO_SIZE);
+
+ /* Double check - boundary conditions */
+ if (iosize < GF_NFS3_FILE_IO_SIZE_MIN) {
+ iosize = GF_NFS3_FILE_IO_SIZE_MIN;
+ } else if (iosize > GF_NFS3_FILE_IO_SIZE_MAX) {
+ iosize = GF_NFS3_FILE_IO_SIZE_MAX;
+ }
+
+ *ioszptr = iosize;
+}
int
-nfs3_init_options (struct nfs3_state *nfs3, xlator_t *nfsx)
+nfs3_init_options (struct nfs3_state *nfs3, dict_t *options)
{
int ret = -1;
char *optstr = NULL;
uint64_t size64 = 0;
- if ((!nfs3) || (!nfsx))
+ if ((!nfs3) || (!options))
return -1;
/* nfs3.read-size */
nfs3->readsize = GF_NFS3_RTPREF;
- if (dict_get (nfsx->options, "nfs3.read-size")) {
- ret = dict_get_str (nfsx->options, "nfs3.read-size", &optstr);
+ if (dict_get (options, "nfs3.read-size")) {
+ ret = dict_get_str (options, "nfs3.read-size", &optstr);
if (ret < 0) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to read "
" option: nfs3.read-size");
@@ -5141,20 +5186,22 @@ nfs3_init_options (struct nfs3_state *nfs3, xlator_t *nfsx)
goto err;
}
- ret = gf_string2bytesize (optstr, &size64);
- nfs3->readsize = size64;
+ ret = gf_string2uint64 (optstr, &size64);
if (ret == -1) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to format"
" option: nfs3.read-size");
ret = -1;
goto err;
}
+
+ nfs3_iosize_roundup_4KB (&size64);
+ nfs3->readsize = size64;
}
/* nfs3.write-size */
nfs3->writesize = GF_NFS3_WTPREF;
- if (dict_get (nfsx->options, "nfs3.write-size")) {
- ret = dict_get_str (nfsx->options, "nfs3.write-size", &optstr);
+ if (dict_get (options, "nfs3.write-size")) {
+ ret = dict_get_str (options, "nfs3.write-size", &optstr);
if (ret < 0) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to read "
" option: nfs3.write-size");
@@ -5162,20 +5209,22 @@ nfs3_init_options (struct nfs3_state *nfs3, xlator_t *nfsx)
goto err;
}
- ret = gf_string2bytesize (optstr, &size64);
- nfs3->writesize = size64;
+ ret = gf_string2uint64 (optstr, &size64);
if (ret == -1) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to format"
" option: nfs3.write-size");
ret = -1;
goto err;
}
+
+ nfs3_iosize_roundup_4KB (&size64);
+ nfs3->writesize = size64;
}
/* nfs3.readdir.size */
nfs3->readdirsize = GF_NFS3_DTPREF;
- if (dict_get (nfsx->options, "nfs3.readdir-size")) {
- ret = dict_get_str (nfsx->options,"nfs3.readdir-size", &optstr);
+ if (dict_get (options, "nfs3.readdir-size")) {
+ ret = dict_get_str (options,"nfs3.readdir-size", &optstr);
if (ret < 0) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to read"
" option: nfs3.readdir-size");
@@ -5183,16 +5232,17 @@ nfs3_init_options (struct nfs3_state *nfs3, xlator_t *nfsx)
goto err;
}
- ret = gf_string2bytesize (optstr, &size64);
- nfs3->readdirsize = size64;
+ ret = gf_string2uint64 (optstr, &size64);
if (ret == -1) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to format"
" option: nfs3.readdir-size");
ret = -1;
goto err;
}
- }
+ nfs3_iosize_roundup_4KB (&size64);
+ nfs3->readdirsize = size64;
+ }
/* We want to use the size of the biggest param for the io buffer size.
*/
@@ -5206,16 +5256,15 @@ nfs3_init_options (struct nfs3_state *nfs3, xlator_t *nfsx)
* accommodate the NFS headers also in the same buffer. */
nfs3->iobsize = nfs3->iobsize * 2;
- /* mem-factor */
- nfs3->memfactor = GF_NFS3_DEFAULT_MEMFACTOR;
ret = 0;
err:
return ret;
}
-
int
-nfs3_init_subvolume_options (struct nfs3_state *nfs3, struct nfs3_export *exp)
+nfs3_init_subvolume_options (xlator_t *nfsx,
+ struct nfs3_export *exp,
+ dict_t *options)
{
int ret = -1;
char *optstr = NULL;
@@ -5223,14 +5272,20 @@ nfs3_init_subvolume_options (struct nfs3_state *nfs3, struct nfs3_export *exp)
char *name = NULL;
gf_boolean_t boolt = _gf_false;
uuid_t volumeid = {0, };
- dict_t *options = NULL;
- if ((!exp) || (!nfs3))
+ if ((!nfsx) || (!exp))
return -1;
- options = nfs3->nfsx->options;
+ /* For init, fetch options from xlator but for
+ * reconfigure, take the parameter */
+ if (!options)
+ options = nfsx->options;
+
+ if (!options)
+ return (-1);
+
uuid_clear (volumeid);
- if (gf_nfs_dvm_off (nfs_state (nfs3->nfsx)))
+ if (gf_nfs_dvm_off (nfs_state (nfsx)))
goto no_dvm;
ret = snprintf (searchkey, 1024, "nfs3.%s.volume-id",exp->subvol->name);
@@ -5396,7 +5451,7 @@ nfs3_init_subvolume (struct nfs3_state *nfs3, xlator_t *subvol)
INIT_LIST_HEAD (&exp->explist);
gf_log (GF_NFS3, GF_LOG_TRACE, "Initing state: %s", exp->subvol->name);
- ret = nfs3_init_subvolume_options (nfs3, exp);
+ ret = nfs3_init_subvolume_options (nfs3->nfsx, exp, NULL);
if (ret == -1) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to init subvol");
goto exp_free;
@@ -5450,7 +5505,7 @@ nfs3_init_state (xlator_t *nfsx)
unsigned int localpool = 0;
struct nfs_state *nfs = NULL;
- if (!nfsx)
+ if ((!nfsx) || (!nfsx->private))
return NULL;
nfs3 = (struct nfs3_state *)GF_CALLOC (1, sizeof (*nfs3),
@@ -5461,7 +5516,7 @@ nfs3_init_state (xlator_t *nfsx)
}
nfs = nfsx->private;
- ret = nfs3_init_options (nfs3, nfsx);
+ ret = nfs3_init_options (nfs3, nfsx->options);
if (ret == -1) {
gf_log (GF_NFS3, GF_LOG_ERROR, "Failed to init options");
goto ret;
@@ -5469,7 +5524,7 @@ nfs3_init_state (xlator_t *nfsx)
nfs3->iobpool = nfsx->ctx->iobuf_pool;
- localpool = nfs3->memfactor * GF_NFS_CONCURRENT_OPS_MULT;
+ localpool = nfs->memfactor * GF_NFS_CONCURRENT_OPS_MULT;
gf_log (GF_NFS3, GF_LOG_TRACE, "local pool: %d", localpool);
nfs3->localpool = mem_pool_new (nfs3_call_state_t, localpool);
if (!nfs3->localpool) {
@@ -5493,7 +5548,7 @@ nfs3_init_state (xlator_t *nfsx)
LOCK_INIT (&nfs3->fdlrulock);
nfs3->fdcount = 0;
- rpcsvc_create_listeners (nfs->rpcsvc, nfsx->options, nfsx->name);
+ ret = rpcsvc_create_listeners (nfs->rpcsvc, nfsx->options, nfsx->name);
if (ret == -1) {
gf_log (GF_NFS, GF_LOG_ERROR, "Unable to create listeners");
goto free_localpool;
@@ -5535,4 +5590,39 @@ nfs3svc_init (xlator_t *nfsx)
return &nfs3prog;
}
+int
+nfs3_reconfigure_state (xlator_t *nfsx, dict_t *options)
+{
+ int ret = -1;
+ struct nfs3_export *exp = NULL;
+ struct nfs_state *nfs = NULL;
+ struct nfs3_state *nfs3 = NULL;
+
+ if ((!nfsx) || (!nfsx->private) || (!options))
+ goto out;
+
+ nfs = (struct nfs_state *)nfsx->private;
+ nfs3 = nfs->nfs3state;
+ if (!nfs3)
+ goto out;
+
+ ret = nfs3_init_options (nfs3, options);
+ if (ret) {
+ gf_log (GF_NFS3, GF_LOG_ERROR,
+ "Failed to reconfigure options");
+ goto out;
+ }
+ list_for_each_entry (exp, &nfs3->exports, explist) {
+ ret = nfs3_init_subvolume_options (nfsx, exp, options);
+ if (ret) {
+ gf_log (GF_NFS3, GF_LOG_ERROR,
+ "Failed to reconfigure subvol options");
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
diff --git a/xlators/nfs/server/src/nfs3.h b/xlators/nfs/server/src/nfs3.h
index 78eb01b60..e64ef9d15 100644
--- a/xlators/nfs/server/src/nfs3.h
+++ b/xlators/nfs/server/src/nfs3.h
@@ -2,19 +2,10 @@
Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NFS3_H_
@@ -35,7 +26,8 @@
#include "xdr-nfs3.h"
#include "mem-pool.h"
#include "nlm4.h"
-
+#include "acl3-xdr.h"
+#include "acl3.h"
#include <sys/statvfs.h>
#define GF_NFS3 GF_NFS"-nfsv3"
@@ -47,16 +39,39 @@
/* Static values used for FSINFO
-FIXME: This should be configurable */
-#define GF_NFS3_RTMAX (64 * GF_UNIT_KB)
-#define GF_NFS3_RTPREF (64 * GF_UNIT_KB)
-#define GF_NFS3_RTMULT (4 * GF_UNIT_KB)
-#define GF_NFS3_WTMAX (64 * GF_UNIT_KB)
-#define GF_NFS3_WTPREF (64 * GF_UNIT_KB)
-#define GF_NFS3_WTMULT (4 * GF_UNIT_KB)
-#define GF_NFS3_DTMIN (4 * GF_UNIT_KB)
-#define GF_NFS3_DTPREF (64 * GF_UNIT_KB)
-#define GF_NFS3_MAXFILE (1 * GF_UNIT_PB)
+ * To change the maximum rsize and wsize supported by the NFS client, adjust
+ * GF_NFS3_FILE_IO_SIZE_MAX. The Gluster NFS server defaults to 1MB(1048576)
+ * (same as kernel NFS server). For slower network, rsize/wsize can be trimmed
+ * to 16/32/64-KB. rsize and wsize can be tuned through nfs.read-size and
+ * nfs.write-size respectively.
+ *
+ * NB: For Kernel-NFS, NFS_MAX_FILE_IO_SIZE is 1048576U (1MB).
+ */
+#define GF_NFS3_FILE_IO_SIZE_MAX (1 * GF_UNIT_MB) /* 1048576 */
+#define GF_NFS3_FILE_IO_SIZE_MIN (4 * GF_UNIT_KB) /* 4096 */
+
+#define GF_NFS3_FILE_IO_SIZE_DEF GF_NFS3_FILE_IO_SIZE_MAX
+
+#define GF_NFS3_RTMAX GF_NFS3_FILE_IO_SIZE_MAX
+#define GF_NFS3_RTMIN GF_NFS3_FILE_IO_SIZE_MIN
+#define GF_NFS3_RTPREF GF_NFS3_FILE_IO_SIZE_DEF
+#define GF_NFS3_RTMULT GF_NFS3_FILE_IO_SIZE_MIN
+
+#define GF_NFS3_WTMAX GF_NFS3_FILE_IO_SIZE_MAX
+#define GF_NFS3_WTMIN GF_NFS3_FILE_IO_SIZE_MIN
+#define GF_NFS3_WTPREF GF_NFS3_FILE_IO_SIZE_DEF
+#define GF_NFS3_WTMULT GF_NFS3_FILE_IO_SIZE_MIN
+
+/* This can be tuned through nfs.readdir-size */
+#define GF_NFS3_DTMAX GF_NFS3_FILE_IO_SIZE_MAX
+#define GF_NFS3_DTMIN GF_NFS3_FILE_IO_SIZE_MIN
+#define GF_NFS3_DTPREF GF_NFS3_FILE_IO_SIZE_DEF
+
+#define GF_NFS3_MAXFILESIZE (1 * GF_UNIT_PB)
+
+#define GF_NFS3_IO_SIZE 4096 /* 4-KB */
+#define GF_NFS3_IO_SHIFT 12 /* 2^12 = 4KB */
+
/* FIXME: Handle time resolutions */
#define GF_NFS3_TIMEDELTA_SECS {1,0}
#define GF_NFS3_TIMEDELTA_NSECS {0,1}
@@ -119,20 +134,19 @@ typedef struct nfs3_state {
uint64_t serverstart;
/* NFSv3 Protocol configurables */
- size_t readsize;
- size_t writesize;
- size_t readdirsize;
+ uint64_t readsize;
+ uint64_t writesize;
+ uint64_t readdirsize;
/* Size of the iobufs used, depends on the sizes of the three params
* above.
*/
- size_t iobsize;
-
- unsigned int memfactor;
+ uint64_t iobsize;
struct list_head fdlru;
gf_lock_t fdlrulock;
int fdcount;
+ uint32_t occ_logger;
} nfs3_state_t;
typedef enum nfs3_lookup_type {
@@ -155,6 +169,10 @@ typedef union args_ {
nlm4_shareargs nlm4_shareargs;
nlm4_shareres nlm4_shareres;
nlm4_freeallargs nlm4_freeallargs;
+ getaclargs getaclargs;
+ setaclargs setaclargs;
+ getaclreply getaclreply;
+ setaclreply setaclreply;
} args;
@@ -234,6 +252,14 @@ struct nfs3_local {
int monitor;
rpc_transport_t *trans;
call_frame_t *frame;
+
+ /* ACL */
+ aclentry aclentry[NFS_ACL_MAX_ENTRIES];
+ aclentry daclentry[NFS_ACL_MAX_ENTRIES];
+ int aclcount;
+ char aclxattr[NFS_ACL_MAX_ENTRIES*8 + 4];
+ int daclcount;
+ char daclxattr[NFS_ACL_MAX_ENTRIES*8 + 4];
};
#define nfs3_is_revalidate_lookup(cst) ((cst)->lookuptype == GF_NFS3_REVALIDATE)
@@ -249,9 +275,13 @@ struct inode_op_queue {
pthread_mutex_t qlock;
};
+extern rpcsvc_program_t *
+nfs3svc_init (xlator_t *nfsx);
+extern int
+nfs3_reconfigure_state (xlator_t *nfsx, dict_t *options);
+extern uint64_t
+nfs3_request_xlator_deviceid (rpcsvc_request_t *req);
-extern rpcsvc_program_t *
-nfs3svc_init (xlator_t *nfsx);
#endif
diff --git a/xlators/nfs/server/src/nlm4.c b/xlators/nfs/server/src/nlm4.c
index 05de66d56..4d0083fe2 100644
--- a/xlators/nfs/server/src/nlm4.c
+++ b/xlators/nfs/server/src/nlm4.c
@@ -2,19 +2,10 @@
Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -42,7 +33,6 @@
#include "nfs-generics.h"
#include "rpc-clnt.h"
#include "nsm-xdr.h"
-#include "nlmcbk-xdr.h"
#include "run.h"
#include <unistd.h>
#include <rpc/pmap_clnt.h>
@@ -149,8 +139,10 @@ nfs3_fh_to_xlator (struct nfs3_state *nfs3, struct nfs3_fh *fh);
xlatorp = nfs3_fh_to_xlator (cst->nfs3state, \
&cst->resolvefh); \
uuid_unparse (cst->resolvefh.gfid, gfid); \
- sprintf (buf, "(%s) %s : %s", trans->peerinfo.identifier,\
- xlatorp ? xlatorp->name : "ERR", gfid); \
+ snprintf (buf, sizeof (buf), "(%s) %s : %s", \
+ trans->peerinfo.identifier, \
+ xlatorp ? xlatorp->name : "ERR", \
+ gfid); \
gf_log (GF_NLM, GF_LOG_ERROR, "Unable to resolve FH"\
": %s", buf); \
nfstat = nlm4_errno_to_nlm4stat (cst->resolve_errno);\
@@ -164,9 +156,9 @@ nlm4_prep_nlm4_testargs (nlm4_testargs *args, struct nfs3_fh *fh,
nlm4_lkowner_t *oh, char *cookiebytes)
{
memset (args, 0, sizeof (*args));
- args->alock.fh.n_bytes = (void *)fh;
- args->alock.oh.n_bytes = (void *)oh;
- args->cookie.n_bytes = (void *)cookiebytes;
+ args->alock.fh.nlm4_netobj_val = (void *)fh;
+ args->alock.oh.nlm4_netobj_val = (void *)oh;
+ args->cookie.nlm4_netobj_val = (void *)cookiebytes;
}
void
@@ -174,9 +166,9 @@ nlm4_prep_nlm4_lockargs (nlm4_lockargs *args, struct nfs3_fh *fh,
nlm4_lkowner_t *oh, char *cookiebytes)
{
memset (args, 0, sizeof (*args));
- args->alock.fh.n_bytes = (void *)fh;
- args->alock.oh.n_bytes = (void *)oh;
- args->cookie.n_bytes = (void *)cookiebytes;
+ args->alock.fh.nlm4_netobj_val = (void *)fh;
+ args->alock.oh.nlm4_netobj_val = (void *)oh;
+ args->cookie.nlm4_netobj_val = (void *)cookiebytes;
}
void
@@ -184,9 +176,9 @@ nlm4_prep_nlm4_cancargs (nlm4_cancargs *args, struct nfs3_fh *fh,
nlm4_lkowner_t *oh, char *cookiebytes)
{
memset (args, 0, sizeof (*args));
- args->alock.fh.n_bytes = (void *)fh;
- args->alock.oh.n_bytes = (void *)oh;
- args->cookie.n_bytes = (void *)cookiebytes;
+ args->alock.fh.nlm4_netobj_val = (void *)fh;
+ args->alock.oh.nlm4_netobj_val = (void *)oh;
+ args->cookie.nlm4_netobj_val = (void *)cookiebytes;
}
void
@@ -194,9 +186,9 @@ nlm4_prep_nlm4_unlockargs (nlm4_unlockargs *args, struct nfs3_fh *fh,
nlm4_lkowner_t *oh, char *cookiebytes)
{
memset (args, 0, sizeof (*args));
- args->alock.fh.n_bytes = (void *)fh;
- args->alock.oh.n_bytes = (void *)oh;
- args->cookie.n_bytes = (void *)cookiebytes;
+ args->alock.fh.nlm4_netobj_val = (void *)fh;
+ args->alock.oh.nlm4_netobj_val = (void *)oh;
+ args->cookie.nlm4_netobj_val = (void *)cookiebytes;
}
void
@@ -204,9 +196,9 @@ nlm4_prep_shareargs (nlm4_shareargs *args, struct nfs3_fh *fh,
nlm4_lkowner_t *oh, char *cookiebytes)
{
memset (args, 0, sizeof (*args));
- args->share.fh.n_bytes = (void *)fh;
- args->share.oh.n_bytes = (void *)oh;
- args->cookie.n_bytes = (void *)cookiebytes;
+ args->share.fh.nlm4_netobj_val = (void *)fh;
+ args->share.oh.nlm4_netobj_val = (void *)oh;
+ args->cookie.nlm4_netobj_val = (void *)cookiebytes;
}
void
@@ -217,25 +209,25 @@ nlm4_prep_freeallargs (nlm4_freeallargs *args, nlm4_lkowner_t *oh)
}
void
-nlm_copy_lkowner (gf_lkowner_t *dst, netobj *src)
+nlm_copy_lkowner (gf_lkowner_t *dst, nlm4_netobj *src)
{
- dst->len = src->n_len;
- memcpy (dst->data, src->n_bytes, dst->len);
+ dst->len = src->nlm4_netobj_len;
+ memcpy (dst->data, src->nlm4_netobj_val, dst->len);
}
int
-nlm_is_oh_same_lkowner (gf_lkowner_t *a, netobj *b)
+nlm_is_oh_same_lkowner (gf_lkowner_t *a, nlm4_netobj *b)
{
if (!a || !b) {
gf_log (GF_NLM, GF_LOG_ERROR, "invalid args");
return -1;
}
- return (a->len == b->n_len &&
- !memcmp (a->data, b->n_bytes, a->len));
+ return (a->len == b->nlm4_netobj_len &&
+ !memcmp (a->data, b->nlm4_netobj_val, a->len));
}
-nfsstat3
+nlm4_stats
nlm4_errno_to_nlm4stat (int errnum)
{
nlm4_stats stat = nlm4_denied;
@@ -345,13 +337,12 @@ nlm_set_rpc_clnt (rpc_clnt_t *rpc_clnt, char *caller_name)
break;
}
}
+
if (!nlmclnt_found) {
nlmclnt = GF_CALLOC (1, sizeof(*nlmclnt),
gf_nfs_mt_nlm4_nlmclnt);
- if (nlmclnt == NULL) {
- gf_log (GF_NLM, GF_LOG_ERROR, "mem-alloc error");
+ if (nlmclnt == NULL)
goto ret;
- }
INIT_LIST_HEAD(&nlmclnt->fdes);
INIT_LIST_HEAD(&nlmclnt->nlm_clients);
@@ -360,6 +351,7 @@ nlm_set_rpc_clnt (rpc_clnt_t *rpc_clnt, char *caller_name)
list_add (&nlmclnt->nlm_clients, &nlm_client_list);
nlmclnt->caller_name = gf_strdup (caller_name);
}
+
if (nlmclnt->rpc_clnt == NULL) {
nlmclnt->rpc_clnt = rpc_clnt_ref (rpc_clnt);
}
@@ -434,10 +426,11 @@ ret:
int
nlm4svc_submit_reply (rpcsvc_request_t *req, void *arg, nlm4_serializer sfunc)
{
- struct iovec outmsg = {0, };
- struct iobuf *iob = NULL;
- struct nfs3_state *nfs3 = NULL;
- int ret = -1;
+ struct iovec outmsg = {0, };
+ struct iobuf *iob = NULL;
+ struct nfs3_state *nfs3 = NULL;
+ int ret = -1;
+ ssize_t msglen = 0;
struct iobref *iobref = NULL;
if (!req)
@@ -462,7 +455,12 @@ nlm4svc_submit_reply (rpcsvc_request_t *req, void *arg, nlm4_serializer sfunc)
/* Use the given serializer to translate the give C structure in arg
* to XDR format which will be written into the buffer in outmsg.
*/
- outmsg.iov_len = sfunc (outmsg, arg);
+ msglen = sfunc (outmsg, arg);
+ if (msglen < 0) {
+ gf_log (GF_NLM, GF_LOG_ERROR, "Failed to encode message");
+ goto ret;
+ }
+ outmsg.iov_len = msglen;
iobref = iobref_new ();
if (iobref == NULL) {
@@ -470,7 +468,11 @@ nlm4svc_submit_reply (rpcsvc_request_t *req, void *arg, nlm4_serializer sfunc)
goto ret;
}
- iobref_add (iobref, iob);
+ ret = iobref_add (iobref, iob);
+ if (ret) {
+ gf_log (GF_NLM, GF_LOG_ERROR, "Failed to add iob to iobref");
+ goto ret;
+ }
/* Then, submit the message for transmission. */
ret = rpcsvc_submit_message (req, &outmsg, 1, NULL, 0, iobref);
@@ -594,9 +596,15 @@ nlm4_file_open_and_resume(nfs3_call_state_t *cs, nlm4_resume_fn_t resume)
{
fd_t *fd = NULL;
int ret = -1;
+ int flags = 0;
nlm_client_t *nlmclnt = NULL;
call_frame_t *frame = NULL;
+ if (cs->args.nlm4_lockargs.exclusive == _gf_false)
+ flags = O_RDONLY;
+ else
+ flags = O_WRONLY;
+
nlmclnt = nlm_get_uniq (cs->args.nlm4_lockargs.alock.caller_name);
if (nlmclnt == NULL) {
gf_log (GF_NLM, GF_LOG_ERROR, "nlm_get_uniq() returned NULL");
@@ -630,16 +638,13 @@ nlm4_file_open_and_resume(nfs3_call_state_t *cs, nlm4_resume_fn_t resume)
}
frame->root->pid = NFS_PID;
- frame->root->uid = 0;
- frame->root->gid = 0;
+ frame->root->uid = rpcsvc_request_uid (cs->req);
+ frame->root->gid = rpcsvc_request_gid (cs->req);
frame->local = cs;
- /*
- * This is the only place that we call STACK_WIND without nfs_fix_groups,
- * because in this particular case the relevant identify is in lk_owner and
- * we don't care about the fields that nfs_fix_groups would set up.
- */
+ nfs_fix_groups (cs->nfsx, frame->root);
+
STACK_WIND_COOKIE (frame, nlm4_file_open_cbk, cs->vol, cs->vol,
- cs->vol->fops->open, &cs->resolvedloc, O_RDWR,
+ cs->vol->fops->open, &cs->resolvedloc, flags,
cs->fd, NULL);
ret = 0;
err:
@@ -647,7 +652,7 @@ err:
}
int
-nlm4_generic_reply (rpcsvc_request_t *req, netobj cookie, nlm4_stats stat)
+nlm4_generic_reply (rpcsvc_request_t *req, nlm4_netobj cookie, nlm4_stats stat)
{
nlm4_res res;
@@ -880,50 +885,66 @@ nlm4svc_send_granted_cbk (struct rpc_req *req, struct iovec *iov, int count,
return 0;
}
-int nlm_rpcclnt_notify (struct rpc_clnt *rpc, void *mydata,
- rpc_clnt_event_t fn, void *data)
+void
+nlm4svc_send_granted (nfs3_call_state_t *cs);
+
+int
+nlm_rpcclnt_notify (struct rpc_clnt *rpc_clnt, void *mydata,
+ rpc_clnt_event_t fn, void *data)
{
- nlm_condmutex_t *cm = NULL;
- int ret;
- cm = mydata;
+ int ret = 0;
+ char *caller_name = NULL;
+ nfs3_call_state_t *cs = NULL;
+
+ cs = mydata;
+ caller_name = cs->args.nlm4_lockargs.alock.caller_name;
+
switch (fn) {
case RPC_CLNT_CONNECT:
- ret = pthread_cond_broadcast (&cm->cond);
- if (ret!=0)
- gf_log (GF_NLM, GF_LOG_ERROR, "cond_broadcast error %s",
- strerror (errno));
+ ret = nlm_set_rpc_clnt (rpc_clnt, caller_name);
+ if (ret == -1) {
+ gf_log (GF_NLM, GF_LOG_ERROR, "Failed to set rpc clnt");
+ goto err;
+ }
+ rpc_clnt_unref (rpc_clnt);
+ nlm4svc_send_granted (cs);
+
break;
+
case RPC_CLNT_MSG:
break;
+
case RPC_CLNT_DISCONNECT:
- nlm_unset_rpc_clnt(rpc);
+ nlm_unset_rpc_clnt (rpc_clnt);
+ break;
+ default:
break;
}
+
+ err:
return 0;
}
-void
-nlm4svc_send_granted (nfs3_call_state_t *cs);
-
void *
nlm4_establish_callback (void *csarg)
{
- nfs3_call_state_t *cs = NULL;
- union gf_sock_union sock_union;
- dict_t *options = NULL;
- char peerip[INET6_ADDRSTRLEN+1], *portstr = NULL;
- rpc_clnt_t *rpc_clnt = NULL;
- int port = -1;
- int ret = -1;
- char *caller_name = NULL;
+ int ret = -1;
+ nfs3_call_state_t *cs = NULL;
+ union gf_sock_union sock_union;
+ dict_t *options = NULL;
+ char peerip[INET6_ADDRSTRLEN+1] = {0};
+ char *portstr = NULL;
+ char myip[INET6_ADDRSTRLEN+1] = {0};
+ rpc_clnt_t *rpc_clnt = NULL;
+ int port = -1;
+
cs = (nfs3_call_state_t *) csarg;
glusterfs_this_set (cs->nfsx);
- caller_name = cs->args.nlm4_lockargs.alock.caller_name;
-
rpc_transport_get_peeraddr (cs->trans, NULL, 0, &sock_union.storage,
sizeof (sock_union.storage));
+
switch (sock_union.sa.sa_family) {
case AF_INET6:
/* can not come here as NLM listens on IPv4 */
@@ -939,6 +960,9 @@ nlm4_establish_callback (void *csarg)
case AF_INET:
inet_ntop (AF_INET, &sock_union.sin.sin_addr, peerip,
INET6_ADDRSTRLEN+1);
+ inet_ntop (AF_INET, &(((struct sockaddr_in *)&cs->trans->myinfo.sockaddr)->sin_addr),
+ myip, INET6_ADDRSTRLEN + 1);
+
break;
default:
break;
@@ -950,8 +974,10 @@ nlm4_establish_callback (void *csarg)
NLM_V4, IPPROTO_TCP);
if (port == 0) {
- gf_log (GF_NLM, GF_LOG_ERROR, "Unable to get NLM port of the "
- "client. Is the firewall running on client?");
+ gf_log (GF_NLM, GF_LOG_ERROR,
+ "Unable to get NLM port of the client."
+ " Is the firewall running on client?"
+ " OR Are RPC services running (rpcinfo -p)?");
goto err;
}
@@ -978,6 +1004,13 @@ nlm4_establish_callback (void *csarg)
gf_log (GF_NLM, GF_LOG_ERROR, "dict_set_dynstr error");
goto err;
}
+
+ /* needed in case virtual IP is used */
+ ret = dict_set_dynstr (options, "transport.socket.source-addr",
+ gf_strdup (myip));
+ if (ret == -1)
+ goto err;
+
ret = dict_set_str (options, "auth-null", "on");
if (ret == -1) {
gf_log (GF_NLM, GF_LOG_ERROR, "dict_set_dynstr error");
@@ -990,32 +1023,25 @@ nlm4_establish_callback (void *csarg)
gf_log (GF_NLM, GF_LOG_ERROR, "rpc_clnt NULL");
goto err;
}
- nlm_condmutex_t *cm;
- cm = GF_CALLOC (1, sizeof(*cm), gf_nfs_mt_nlm4_cm);
- pthread_mutex_init (&cm->mutex, NULL);
- pthread_cond_init (&cm->cond, NULL);
- ret = rpc_clnt_register_notify (rpc_clnt, nlm_rpcclnt_notify,
- cm);
+
+ ret = rpc_clnt_register_notify (rpc_clnt, nlm_rpcclnt_notify, cs);
if (ret == -1) {
gf_log (GF_NLM, GF_LOG_ERROR,"rpc_clnt_register_connect error");
goto err;
}
+
+ /* After this connect succeeds, granted msg is sent in notify */
ret = rpc_transport_connect (rpc_clnt->conn.trans, port);
- pthread_cond_wait (&cm->cond, &cm->mutex);
- pthread_mutex_destroy (&cm->mutex);
- GF_FREE (cm);
- rpc_clnt_set_connected (&rpc_clnt->conn);
- ret = nlm_set_rpc_clnt (rpc_clnt, caller_name);
- if (ret == -1) {
- gf_log (GF_NLM, GF_LOG_ERROR, "dict_set_ptr error");
- goto err;
- }
- nlm4svc_send_granted (cs);
+
+ if (ret == -1 && EINPROGRESS == errno)
+ ret = 0;
+
err:
- if (rpc_clnt) {
+ if (ret == -1 && rpc_clnt) {
rpc_clnt_unref (rpc_clnt);
}
- return NULL;
+
+ return rpc_clnt;
}
void
@@ -1028,12 +1054,17 @@ nlm4svc_send_granted (nfs3_call_state_t *cs)
struct iobuf *iobuf = NULL;
struct iobref *iobref = NULL;
char peerip[INET6_ADDRSTRLEN+1];
- pthread_t thr;
union gf_sock_union sock_union;
+ rpc_clnt = nlm_get_rpc_clnt (cs->args.nlm4_lockargs.alock.caller_name);
+ if (rpc_clnt == NULL) {
+ nlm4_establish_callback ((void*)cs);
+ return;
+ }
rpc_transport_get_peeraddr (cs->trans, NULL, 0, &sock_union.storage,
sizeof (sock_union.storage));
+
switch (sock_union.sa.sa_family) {
case AF_INET6:
inet_ntop (AF_INET6, &sock_union.sin6.sin6_addr, peerip,
@@ -1042,15 +1073,9 @@ nlm4svc_send_granted (nfs3_call_state_t *cs)
case AF_INET:
inet_ntop (AF_INET, &sock_union.sin.sin_addr, peerip,
INET6_ADDRSTRLEN+1);
+ break;
default:
break;
- /* FIXME: handle the error */
- }
-
- rpc_clnt = nlm_get_rpc_clnt (cs->args.nlm4_lockargs.alock.caller_name);
- if (rpc_clnt == NULL) {
- pthread_create (&thr, NULL, nlm4_establish_callback, (void*)cs);
- return;
}
testargs.cookie = cs->args.nlm4_lockargs.cookie;
@@ -1075,13 +1100,15 @@ nlm4svc_send_granted (nfs3_call_state_t *cs)
goto ret;
}
- iobref_add (iobref, iobuf);
+ ret = iobref_add (iobref, iobuf);
+ if (ret) {
+ gf_log (GF_NLM, GF_LOG_ERROR, "Failed to add iob to iobref");
+ goto ret;
+ }
ret = rpc_clnt_submit (rpc_clnt, &nlm4clntprog, NLM4_GRANTED,
- nlm4svc_send_granted_cbk,
- &outmsg, 1,
- NULL,
- 0, iobref, cs->frame, NULL, 0,
+ nlm4svc_send_granted_cbk, &outmsg, 1,
+ NULL, 0, iobref, cs->frame, NULL, 0,
NULL, 0, NULL);
if (ret < 0) {
@@ -1276,12 +1303,11 @@ nlm4svc_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cs = frame->local;
caller_name = cs->args.nlm4_lockargs.alock.caller_name;
- transit_cnt = nlm_dec_transit_count (cs->fd,
- caller_name);
+ transit_cnt = nlm_dec_transit_count (cs->fd, caller_name);
+
if (op_ret == -1) {
if (transit_cnt == 0)
- nlm_search_and_delete (cs->fd,
- caller_name);
+ nlm_search_and_delete (cs->fd, caller_name);
stat = nlm4_errno_to_nlm4stat (op_errno);
goto err;
} else {
@@ -1295,6 +1321,7 @@ nlm4svc_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
err:
if (cs->args.nlm4_lockargs.block) {
cs->frame = copy_frame (frame);
+ frame->local = NULL;
nlm4svc_send_granted (cs);
} else {
nlm4_generic_reply (cs->req, cs->args.nlm4_lockargs.cookie,
@@ -1789,25 +1816,38 @@ nlm4_add_share_to_inode (nlm_share_t *share)
struct list_head *head = NULL;
xlator_t *this = NULL;
inode_t *inode = NULL;
+ struct nfs_inode_ctx *ictx = NULL;
+ struct nfs_state *priv = NULL;
this = THIS;
+ priv = this->private;
inode = share->inode;
ret = inode_ctx_get (inode, this, &ctx);
- head = (struct list_head *)ctx;
-
- if (ret || !head) {
- head = GF_CALLOC (1, sizeof (struct list_head),
- gf_common_mt_list_head);
- if (!head ) {
+ if (ret == -1) {
+ ictx = GF_CALLOC (1, sizeof (struct nfs_inode_ctx),
+ gf_nfs_mt_inode_ctx);
+ if (!ictx ) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not allocate nfs inode ctx");
ret = -1;
goto out;
}
+ ictx->generation = priv->generation;
+ head = &ictx->shares;
INIT_LIST_HEAD (head);
- ret = inode_ctx_put (inode, this, (uint64_t)head);
- if (ret)
+
+ ret = inode_ctx_put (inode, this, (uint64_t)ictx);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not store share list");
goto out;
+ }
+ }
+ else {
+ ictx = (struct nfs_inode_ctx *)ctx;
+ head = &ictx->shares;
}
list_add (&share->inode_list, head);
@@ -1829,6 +1869,7 @@ nlm4_approve_share_reservation (nfs3_call_state_t *cs)
inode_t *inode = NULL;
nlm_share_t *share = NULL;
struct list_head *head = NULL;
+ struct nfs_inode_ctx *ictx = NULL;
if (!cs)
goto out;
@@ -1840,8 +1881,9 @@ nlm4_approve_share_reservation (nfs3_call_state_t *cs)
ret = 0;
goto out;
}
+ ictx = (struct nfs_inode_ctx *)ctx;
- head = (struct list_head *)ctx;
+ head = &ictx->shares;
if (!head || list_empty (head))
goto out;
@@ -2014,18 +2056,19 @@ nlm4svc_share (rpcsvc_request_t *req)
int
nlm4_remove_share_reservation (nfs3_call_state_t *cs)
{
- int ret = -1;
- uint64_t ctx = 0;
- fsh_mode req_mode = 0;
- fsh_access req_access = 0;
- nlm_share_t *share = NULL;
- nlm_share_t *tmp = NULL;
- nlm_client_t *client = NULL;
- char *caller = NULL;
- inode_t *inode = NULL;
- xlator_t *this = NULL;
- struct list_head *head = NULL;
- nlm4_shareargs *args = NULL;
+ int ret = -1;
+ uint64_t ctx = 0;
+ fsh_mode req_mode = 0;
+ fsh_access req_access = 0;
+ nlm_share_t *share = NULL;
+ nlm_share_t *tmp = NULL;
+ nlm_client_t *client = NULL;
+ char *caller = NULL;
+ inode_t *inode = NULL;
+ xlator_t *this = NULL;
+ struct list_head *head = NULL;
+ nlm4_shareargs *args = NULL;
+ struct nfs_inode_ctx *ictx = NULL;
LOCK (&nlm_client_list_lk);
@@ -2055,8 +2098,9 @@ nlm4_remove_share_reservation (nfs3_call_state_t *cs)
inode->gfid, caller);
goto out;
}
+ ictx = (struct nfs_inode_ctx *)ctx;
- head = (struct list_head *)ctx;
+ head = &ictx->shares;
if (list_empty (head)) {
ret = -1;
goto out;
@@ -2252,34 +2296,34 @@ nlm4svc_sm_notify (struct nlm_sm_status *status)
rpcsvc_actor_t nlm4svc_actors[NLM4_PROC_COUNT] = {
/* 0 */
- {"NULL", NLM4_NULL, nlm4svc_null, NULL},
- {"TEST", NLM4_TEST, nlm4svc_test, NULL},
- {"LOCK", NLM4_LOCK, nlm4svc_lock, NULL},
- {"CANCEL", NLM4_CANCEL, nlm4svc_cancel, NULL},
- {"UNLOCK", NLM4_UNLOCK, nlm4svc_unlock, NULL},
+ {"NULL", NLM4_NULL, nlm4svc_null, NULL, 0, DRC_IDEMPOTENT},
+ {"TEST", NLM4_TEST, nlm4svc_test, NULL, 0, DRC_IDEMPOTENT},
+ {"LOCK", NLM4_LOCK, nlm4svc_lock, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"CANCEL", NLM4_CANCEL, nlm4svc_cancel, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"UNLOCK", NLM4_UNLOCK, nlm4svc_unlock, NULL, 0, DRC_NON_IDEMPOTENT},
/* 5 */
- {"GRANTED", NLM4_GRANTED, NULL, NULL},
- {"TEST", NLM4_TEST_MSG, NULL, NULL},
- {"LOCK", NLM4_LOCK_MSG, NULL, NULL},
- {"CANCEL", NLM4_CANCEL_MSG, NULL, NULL},
- {"UNLOCK", NLM4_UNLOCK_MSG, NULL, NULL},
+ {"GRANTED", NLM4_GRANTED, NULL, NULL, 0, DRC_NA},
+ {"TEST", NLM4_TEST_MSG, NULL, NULL, 0, DRC_NA},
+ {"LOCK", NLM4_LOCK_MSG, NULL, NULL, 0, DRC_NA},
+ {"CANCEL", NLM4_CANCEL_MSG, NULL, NULL, 0, DRC_NA},
+ {"UNLOCK", NLM4_UNLOCK_MSG, NULL, NULL, 0, DRC_NA},
/* 10 */
- {"GRANTED", NLM4_GRANTED_MSG, NULL, NULL},
- {"TEST", NLM4_TEST_RES, NULL, NULL},
- {"LOCK", NLM4_LOCK_RES, NULL, NULL},
- {"CANCEL", NLM4_CANCEL_RES, NULL, NULL},
- {"UNLOCK", NLM4_UNLOCK_RES, NULL, NULL},
+ {"GRANTED", NLM4_GRANTED_MSG, NULL, NULL, 0, DRC_NA},
+ {"TEST", NLM4_TEST_RES, NULL, NULL, 0, DRC_NA},
+ {"LOCK", NLM4_LOCK_RES, NULL, NULL, 0, DRC_NA},
+ {"CANCEL", NLM4_CANCEL_RES, NULL, NULL, 0, DRC_NA},
+ {"UNLOCK", NLM4_UNLOCK_RES, NULL, NULL, 0, DRC_NA},
/* 15 ; procedures 17,18,19 are not defined by nlm */
- {"GRANTED", NLM4_GRANTED_RES, NULL, NULL},
- {"SM_NOTIFY", NLM4_SM_NOTIFY, NULL, NULL},
- {"SEVENTEEN", NLM4_SEVENTEEN, NULL, NULL},
- {"EIGHTEEN", NLM4_EIGHTEEN, NULL, NULL},
- {"NINETEEN", NLM4_NINETEEN, NULL, NULL},
+ {"GRANTED", NLM4_GRANTED_RES, NULL, NULL, 0, DRC_NA},
+ {"SM_NOTIFY", NLM4_SM_NOTIFY, NULL, NULL, 0, DRC_NA},
+ {"SEVENTEEN", NLM4_SEVENTEEN, NULL, NULL, 0, DRC_NA},
+ {"EIGHTEEN", NLM4_EIGHTEEN, NULL, NULL, 0, DRC_NA},
+ {"NINETEEN", NLM4_NINETEEN, NULL, NULL, 0, DRC_NA},
/* 20 */
- {"SHARE", NLM4_SHARE, nlm4svc_share, NULL},
- {"UNSHARE", NLM4_UNSHARE, nlm4svc_unshare, NULL},
- {"NM_LOCK", NLM4_NM_LOCK, nlm4svc_nm_lock, NULL},
- {"FREE_ALL", NLM4_FREE_ALL, nlm4svc_free_all, NULL},
+ {"SHARE", NLM4_SHARE, nlm4svc_share, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"UNSHARE", NLM4_UNSHARE, nlm4svc_unshare, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"NM_LOCK", NLM4_NM_LOCK, nlm4svc_nm_lock, NULL, 0, DRC_NON_IDEMPOTENT},
+ {"FREE_ALL", NLM4_FREE_ALL, nlm4svc_free_all, NULL, 0, DRC_IDEMPOTENT},
};
rpcsvc_program_t nlm4prog = {
@@ -2315,9 +2359,14 @@ nlm4svc_init(xlator_t *nfsx)
int ret = -1;
char *portstr = NULL;
pthread_t thr;
- struct timeval timeout = {0,};
+ struct timespec timeout = {0,};
FILE *pidfile = NULL;
pid_t pid = -1;
+ static gf_boolean_t nlm4_inited = _gf_false;
+
+ /* Already inited */
+ if (nlm4_inited)
+ return &nlm4prog;
nfs = (struct nfs_state*)nfsx->private;
@@ -2363,7 +2412,7 @@ nlm4svc_init(xlator_t *nfsx)
goto err;
}
- rpcsvc_create_listeners (nfs->rpcsvc, options, "NLM");
+ ret = rpcsvc_create_listeners (nfs->rpcsvc, options, "NLM");
if (ret == -1) {
gf_log (GF_NLM, GF_LOG_ERROR, "Unable to create listeners");
dict_unref (options);
@@ -2375,9 +2424,21 @@ nlm4svc_init(xlator_t *nfsx)
/* unlink sm-notify.pid so that when we restart rpc.statd/sm-notify
* it thinks that the machine has restarted and sends NOTIFY to clients.
*/
- ret = unlink ("/var/run/sm-notify.pid");
+
+ /* TODO:
+ notify/rpc.statd is done differently on OSX
+
+ On OSX rpc.statd is controlled by rpc.lockd and are part for launchd
+ (unified service management framework)
+
+ A runcmd() should be invoking "launchctl start com.apple.lockd"
+ instead. This is still a theory but we need to thoroughly test it
+ out. Until then NLM support is non-existent on OSX.
+ */
+ ret = unlink (GF_SM_NOTIFY_PIDFILE);
if (ret == -1 && errno != ENOENT) {
- gf_log (GF_NLM, GF_LOG_ERROR, "unable to unlink sm-notify");
+ gf_log (GF_NLM, GF_LOG_ERROR, "unable to unlink %s: %d",
+ GF_SM_NOTIFY_PIDFILE, errno);
goto err;
}
/* temporary work around to restart statd, not distro/OS independant.
@@ -2385,41 +2446,50 @@ nlm4svc_init(xlator_t *nfsx)
* killall will cause problems on solaris.
*/
- pidfile = fopen ("/var/run/rpc.statd.pid", "r");
+ char *pid_file = GF_RPC_STATD_PIDFILE;
+ if (nfs->rpc_statd_pid_file)
+ pid_file = nfs->rpc_statd_pid_file;
+ pidfile = fopen (pid_file, "r");
if (pidfile) {
ret = fscanf (pidfile, "%d", &pid);
if (ret <= 0) {
gf_log (GF_NLM, GF_LOG_WARNING, "unable to get pid of "
- "rpc.statd");
+ "rpc.statd from %s ", GF_RPC_STATD_PIDFILE);
ret = runcmd ("killall", "-9", "rpc.statd", NULL);
} else
kill (pid, SIGKILL);
fclose (pidfile);
} else {
- gf_log (GF_NLM, GF_LOG_WARNING, "opening the pid file of "
- "rpc.statd failed (%s)", strerror (errno));
+ gf_log (GF_NLM, GF_LOG_WARNING, "opening %s of "
+ "rpc.statd failed (%s)", pid_file, strerror (errno));
/* if ret == -1, do nothing - case either statd was not
* running or was running in valgrind mode
*/
ret = runcmd ("killall", "-9", "rpc.statd", NULL);
}
- ret = unlink ("/var/run/rpc.statd.pid");
+ ret = unlink (GF_RPC_STATD_PIDFILE);
if (ret == -1 && errno != ENOENT) {
- gf_log (GF_NLM, GF_LOG_ERROR, "unable to unlink rpc.statd");
+ gf_log (GF_NLM, GF_LOG_ERROR, "unable to unlink %s", pid_file);
goto err;
}
- ret = runcmd ("/sbin/rpc.statd", NULL);
+ ret = runcmd (nfs->rpc_statd, NULL);
if (ret == -1) {
- gf_log (GF_NLM, GF_LOG_ERROR, "unable to start rpc.statd");
+ gf_log (GF_NLM, GF_LOG_ERROR, "unable to start %s",
+ nfs->rpc_statd);
goto err;
}
+
+
pthread_create (&thr, NULL, nsm_thread, (void*)NULL);
timeout.tv_sec = nlm_grace_period;
+ timeout.tv_nsec = 0;
+
gf_timer_call_after (nfsx->ctx, timeout, nlm_grace_period_over, NULL);
+ nlm4_inited = _gf_true;
return &nlm4prog;
err:
return NULL;
@@ -2438,7 +2508,8 @@ nlm_priv (xlator_t *this)
gf_proc_dump_add_section("nfs.nlm");
- LOCK (&nlm_client_list_lk);
+ if (TRY_LOCK (&nlm_client_list_lk))
+ goto out;
list_for_each_entry (client, &nlm_client_list, nlm_clients) {
@@ -2461,7 +2532,15 @@ nlm_priv (xlator_t *this)
gf_proc_dump_build_key (key, "nlm", "client-count");
gf_proc_dump_write (key, "%d", client_count);
-
+ ret = 0;
UNLOCK (&nlm_client_list_lk);
+
+ out:
+ if (ret) {
+ gf_proc_dump_build_key (key, "nlm", "statedump_error");
+ gf_proc_dump_write (key, "Unable to dump nlm state because "
+ "nlm_client_list_lk lock couldn't be acquired");
+ }
+
return ret;
}
diff --git a/xlators/nfs/server/src/nlm4.h b/xlators/nfs/server/src/nlm4.h
index 0cc82f162..e234b6944 100644
--- a/xlators/nfs/server/src/nlm4.h
+++ b/xlators/nfs/server/src/nlm4.h
@@ -2,19 +2,10 @@
Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _NLM4_H_
@@ -40,9 +31,44 @@
#include "nlm4-xdr.h"
#include "lkowner.h"
+#define NLM4_NULL 0
+#define NLM4_TEST 1
+#define NLM4_LOCK 2
+#define NLM4_CANCEL 3
+#define NLM4_UNLOCK 4
+#define NLM4_GRANTED 5
+#define NLM4_TEST_MSG 6
+#define NLM4_LOCK_MSG 7
+#define NLM4_CANCEL_MSG 8
+#define NLM4_UNLOCK_MSG 9
+#define NLM4_GRANTED_MSG 10
+#define NLM4_TEST_RES 11
+#define NLM4_LOCK_RES 12
+#define NLM4_CANCEL_RES 13
+#define NLM4_UNLOCK_RES 14
+#define NLM4_GRANTED_RES 15
+#define NLM4_SM_NOTIFY 16
+#define NLM4_SEVENTEEN 17
+#define NLM4_EIGHTEEN 18
+#define NLM4_NINETEEN 19
+#define NLM4_SHARE 20
+#define NLM4_UNSHARE 21
+#define NLM4_NM_LOCK 22
+#define NLM4_FREE_ALL 23
+#define NLM4_PROC_COUNT 24
+
/* Registered with portmap */
#define GF_NLM4_PORT 38468
#define GF_NLM GF_NFS"-NLM"
+#ifdef GF_DARWIN_HOST_OS
+#define GF_RPC_STATD_PROG "/usr/sbin/rpc.statd"
+#define GF_RPC_STATD_PIDFILE "/var/run/statd.pid"
+#define GF_SM_NOTIFY_PIDFILE "/var/run/statd.notify.pid"
+#else
+#define GF_RPC_STATD_PROG "/sbin/rpc.stat"
+#define GF_RPC_STATD_PIDFILE "/var/run/rpc.statd.pid"
+#define GF_SM_NOTIFY_PIDFILE "/var/run/sm-notify.pid"
+#endif
extern rpcsvc_program_t *
nlm4svc_init (xlator_t *nfsx);
@@ -83,9 +109,4 @@ typedef struct nlm_fde {
int transit_cnt;
} nlm_fde_t;
-typedef struct {
- pthread_cond_t cond;
- pthread_mutex_t mutex;
-} nlm_condmutex_t;
-
#endif
diff --git a/xlators/nfs/server/src/nlmcbk_svc.c b/xlators/nfs/server/src/nlmcbk_svc.c
index 5401dc39b..20d3728d0 100644
--- a/xlators/nfs/server/src/nlmcbk_svc.c
+++ b/xlators/nfs/server/src/nlmcbk_svc.c
@@ -2,19 +2,10 @@
Copyright (c) 2012 Gluster, Inc. <http://www.gluster.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
/*
@@ -22,7 +13,6 @@
* It was generated using rpcgen.
*/
-#include "nlmcbk-xdr.h"
#include "nlm4.h"
#include "logging.h"
#include <stdio.h>
@@ -92,10 +82,14 @@ void *
nsm_thread (void *argv)
{
register SVCXPRT *transp;
+ int ret = 0;
- pmap_unset (NLMCBK_PROGRAM, NLMCBK_V1);
-
- transp = svcudp_create(RPC_ANYSOCK);
+ ret = pmap_unset (NLMCBK_PROGRAM, NLMCBK_V1);
+ if (ret == 0) {
+ gf_log (GF_NLM, GF_LOG_ERROR, "pmap_unset failed");
+ return NULL;
+ }
+ transp = svcudp_create(RPC_ANYSOCK);
if (transp == NULL) {
gf_log (GF_NLM, GF_LOG_ERROR, "cannot create udp service.");
return NULL;
diff --git a/xlators/performance/Makefile.am b/xlators/performance/Makefile.am
index eb94d8d6a..a494190ba 100644
--- a/xlators/performance/Makefile.am
+++ b/xlators/performance/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = write-behind read-ahead io-threads io-cache symlink-cache quick-read md-cache
+SUBDIRS = write-behind read-ahead readdir-ahead io-threads io-cache symlink-cache quick-read md-cache open-behind
CLEANFILES =
diff --git a/xlators/performance/io-cache/src/Makefile.am b/xlators/performance/io-cache/src/Makefile.am
index 838e5f597..155be9988 100644
--- a/xlators/performance/io-cache/src/Makefile.am
+++ b/xlators/performance/io-cache/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = io-cache.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-io_cache_la_LDFLAGS = -module -avoidversion
+io_cache_la_LDFLAGS = -module -avoid-version
io_cache_la_SOURCES = io-cache.c page.c ioc-inode.c
io_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/performance/io-cache/src/io-cache.c b/xlators/performance/io-cache/src/io-cache.c
index 0793e6043..facff5038 100644
--- a/xlators/performance/io-cache/src/io-cache.c
+++ b/xlators/performance/io-cache/src/io-cache.c
@@ -31,7 +31,7 @@ ioc_get_priority (ioc_table_t *table, const char *path);
struct volume_options options[];
-inline uint32_t
+static inline uint32_t
ioc_hashfn (void *data, int len)
{
off_t offset;
@@ -41,7 +41,10 @@ ioc_hashfn (void *data, int len)
return (offset >> ioc_log2_page_size);
}
-inline ioc_inode_t *
+/* TODO: This function is not used, uncomment when we find a
+ usage for this function.
+
+static inline ioc_inode_t *
ioc_inode_reupdate (ioc_inode_t *ioc_inode)
{
ioc_table_t *table = NULL;
@@ -54,7 +57,8 @@ ioc_inode_reupdate (ioc_inode_t *ioc_inode)
return ioc_inode;
}
-inline ioc_inode_t *
+
+static inline ioc_inode_t *
ioc_get_inode (dict_t *dict, char *name)
{
ioc_inode_t *ioc_inode = NULL;
@@ -77,6 +81,7 @@ ioc_get_inode (dict_t *dict, char *name)
return ioc_inode;
}
+*/
int32_t
ioc_inode_need_revalidate (ioc_inode_t *ioc_inode)
@@ -316,9 +321,11 @@ ioc_forget (xlator_t *this, inode_t *inode)
static int32_t
ioc_invalidate(xlator_t *this, inode_t *inode)
{
+ uint64_t ioc_addr = 0;
ioc_inode_t *ioc_inode = NULL;
- inode_ctx_get(inode, this, (uint64_t *) &ioc_inode);
+ inode_ctx_get(inode, this, (uint64_t *) &ioc_addr);
+ ioc_inode = (void *) ioc_addr;
if (ioc_inode)
ioc_inode_flush(ioc_inode);
@@ -489,7 +496,7 @@ out:
return ret;
}
-inline uint32_t
+static inline uint32_t
is_match (const char *path, const char *pattern)
{
int32_t ret = 0;
@@ -551,6 +558,13 @@ ioc_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
inode_ctx_get (fd->inode, this, &tmp_ioc_inode);
ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode;
+ //TODO: see why inode context is NULL and handle it.
+ if (!ioc_inode) {
+ gf_log (this->name, GF_LOG_ERROR, "inode context is "
+ "NULL (%s)", uuid_utoa (fd->inode->gfid));
+ goto out;
+ }
+
ioc_table_lock (ioc_inode->table);
{
list_move_tail (&ioc_inode->inode_lru,
@@ -982,6 +996,7 @@ ioc_dispatch_requests (call_frame_t *frame, ioc_inode_t *ioc_inode, fd_t *fd,
"out of memory");
local->op_ret = -1;
local->op_errno = ENOMEM;
+ ioc_inode_unlock (ioc_inode);
goto out;
}
}
@@ -1415,6 +1430,58 @@ ioc_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
return 0;
}
+static int32_t
+ioc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ STACK_UNWIND_STRICT(discard, frame, op_ret, op_errno, pre, post, xdata);
+ return 0;
+}
+
+static int32_t
+ioc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ uint64_t ioc_inode = 0;
+
+ inode_ctx_get (fd->inode, this, &ioc_inode);
+
+ if (ioc_inode)
+ ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode);
+
+ STACK_WIND(frame, ioc_discard_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->discard, fd, offset, len, xdata);
+ return 0;
+}
+
+static int32_t
+ioc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ STACK_UNWIND_STRICT(zerofill, frame, op_ret,
+ op_errno, pre, post, xdata);
+ return 0;
+}
+
+static int32_t
+ioc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ uint64_t ioc_inode = 0;
+
+ inode_ctx_get (fd->inode, this, &ioc_inode);
+
+ if (ioc_inode)
+ ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode);
+
+ STACK_WIND(frame, ioc_zerofill_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->zerofill, fd, offset, len, xdata);
+ return 0;
+}
+
+
int32_t
ioc_get_priority_list (const char *opt_str, struct list_head *first)
{
@@ -1601,12 +1668,12 @@ reconfigure (xlator_t *this, dict_t *options)
}
GF_OPTION_RECONF ("max-file-size", table->max_file_size,
- options, size, unlock);
+ options, size_uint64, unlock);
GF_OPTION_RECONF ("min-file-size", table->min_file_size,
- options, size, unlock);
+ options, size_uint64, unlock);
- if ((table->max_file_size >= 0) &&
+ if ((table->max_file_size <= UINT64_MAX) &&
(table->min_file_size > table->max_file_size)) {
gf_log (this->name, GF_LOG_ERROR, "minimum size (%"
PRIu64") of a file that can be cached is "
@@ -1617,7 +1684,7 @@ reconfigure (xlator_t *this, dict_t *options)
}
GF_OPTION_RECONF ("cache-size", cache_size_new,
- options, size, unlock);
+ options, size_uint64, unlock);
if (!check_cache_size_ok (this, cache_size_new)) {
ret = -1;
gf_log (this->name, GF_LOG_ERROR,
@@ -1674,13 +1741,13 @@ init (xlator_t *this)
table->xl = this;
table->page_size = this->ctx->page_size;
- GF_OPTION_INIT ("cache-size", table->cache_size, size, out);
+ GF_OPTION_INIT ("cache-size", table->cache_size, size_uint64, out);
GF_OPTION_INIT ("cache-timeout", table->cache_timeout, int32, out);
- GF_OPTION_INIT ("min-file-size", table->min_file_size, size, out);
+ GF_OPTION_INIT ("min-file-size", table->min_file_size, size_uint64, out);
- GF_OPTION_INIT ("max-file-size", table->max_file_size, size, out);
+ GF_OPTION_INIT ("max-file-size", table->max_file_size, size_uint64, out);
if (!check_cache_size_ok (this, table->cache_size)) {
ret = -1;
@@ -1706,7 +1773,7 @@ init (xlator_t *this)
INIT_LIST_HEAD (&table->inodes);
- if ((table->max_file_size >= 0)
+ if ((table->max_file_size <= UINT64_MAX)
&& (table->min_file_size > table->max_file_size)) {
gf_log ("io-cache", GF_LOG_ERROR, "minimum size (%"
PRIu64") of a file that can be cached is "
@@ -1878,11 +1945,11 @@ int
ioc_inode_dump (xlator_t *this, inode_t *inode)
{
- char *path = NULL;
+ char *path = NULL;
int ret = -1;
char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, };
uint64_t tmp_ioc_inode = 0;
- ioc_inode_t *ioc_inode = NULL;
+ ioc_inode_t *ioc_inode = NULL;
gf_boolean_t section_added = _gf_false;
char uuid_str[64] = {0,};
@@ -1896,9 +1963,6 @@ ioc_inode_dump (xlator_t *this, inode_t *inode)
if (ioc_inode == NULL)
goto out;
- gf_proc_dump_add_section (key_prefix);
- section_added = _gf_true;
-
/* Similar to ioc_page_dump function its better to use
* pthread_mutex_trylock and not to use gf_log in statedump
* to avoid deadlocks.
@@ -1906,24 +1970,30 @@ ioc_inode_dump (xlator_t *this, inode_t *inode)
ret = pthread_mutex_trylock (&ioc_inode->inode_lock);
if (ret)
goto out;
- else
+
{
- gf_proc_dump_write ("inode.weight", "%d", ioc_inode->weight);
+ if (uuid_is_null (ioc_inode->inode->gfid))
+ goto unlock;
+
+ gf_proc_dump_add_section (key_prefix);
+ section_added = _gf_true;
- //inode_path takes blocking lock on the itable.
__inode_path (ioc_inode->inode, NULL, &path);
+ gf_proc_dump_write ("inode.weight", "%d", ioc_inode->weight);
+
if (path) {
gf_proc_dump_write ("path", "%s", path);
GF_FREE (path);
}
+
gf_proc_dump_write ("uuid", "%s", uuid_utoa_r
(ioc_inode->inode->gfid, uuid_str));
__ioc_cache_dump (ioc_inode, key_prefix);
__ioc_inode_waitq_dump (ioc_inode, key_prefix);
-
- pthread_mutex_unlock (&ioc_inode->inode_lock);
}
+unlock:
+ pthread_mutex_unlock (&ioc_inode->inode_lock);
out:
if (ret && ioc_inode) {
@@ -2037,6 +2107,8 @@ struct xlator_fops fops = {
.mknod = ioc_mknod,
.readdirp = ioc_readdirp,
+ .discard = ioc_discard,
+ .zerofill = ioc_zerofill,
};
diff --git a/xlators/performance/io-cache/src/io-cache.h b/xlators/performance/io-cache/src/io-cache.h
index 41bbeea8b..46d758a66 100644
--- a/xlators/performance/io-cache/src/io-cache.h
+++ b/xlators/performance/io-cache/src/io-cache.h
@@ -329,6 +329,4 @@ ioc_prune (ioc_table_t *table);
int32_t
ioc_need_prune (ioc_table_t *table);
-inline uint32_t
-ioc_hashfn (void *data, int len);
#endif /* __IO_CACHE_H */
diff --git a/xlators/performance/io-cache/src/page.c b/xlators/performance/io-cache/src/page.c
index c18c04a0b..416cd5fe4 100644
--- a/xlators/performance/io-cache/src/page.c
+++ b/xlators/performance/io-cache/src/page.c
@@ -136,6 +136,7 @@ int64_t
ioc_page_destroy (ioc_page_t *page)
{
int64_t ret = 0;
+ struct ioc_inode *inode = NULL;
if (page == NULL) {
goto out;
@@ -143,9 +144,10 @@ ioc_page_destroy (ioc_page_t *page)
ioc_inode_lock (page->inode);
{
+ inode = page->inode;
ret = __ioc_page_destroy (page);
}
- ioc_inode_unlock (page->inode);
+ ioc_inode_unlock (inode);
out:
return ret;
@@ -315,6 +317,7 @@ __ioc_wait_on_page (ioc_page_t *page, call_frame_t *frame, off_t offset,
local->op_errno = ENOMEM;
gf_log (frame->this->name, GF_LOG_WARNING,
"asked to wait on a NULL page");
+ goto out;
}
waitq = GF_CALLOC (1, sizeof (*waitq), gf_ioc_mt_ioc_waitq_t);
@@ -476,6 +479,7 @@ ioc_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
iobref_unref (page->iobref);
GF_FREE (page->vector);
page->vector = NULL;
+ page->iobref = NULL;
}
/* keep a copy of the page for our cache */
@@ -804,7 +808,7 @@ ioc_frame_unwind (call_frame_t *frame)
int32_t copied = 0;
struct iobref *iobref = NULL;
struct iatt stbuf = {0,};
- int32_t op_ret = 0;
+ int32_t op_ret = 0, op_errno = 0;
GF_ASSERT (frame);
@@ -813,16 +817,21 @@ ioc_frame_unwind (call_frame_t *frame)
gf_log (frame->this->name, GF_LOG_WARNING,
"local is NULL");
op_ret = -1;
- local->op_errno = ENOMEM;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ if (local->op_ret < 0) {
+ op_ret = local->op_ret;
+ op_errno = local->op_errno;
goto unwind;
}
// ioc_local_lock (local);
- frame->local = NULL;
iobref = iobref_new ();
if (iobref == NULL) {
op_ret = -1;
- local->op_errno = ENOMEM;
+ op_errno = ENOMEM;
}
if (list_empty (&local->fill_list)) {
@@ -839,7 +848,7 @@ ioc_frame_unwind (call_frame_t *frame)
vector = GF_CALLOC (count, sizeof (*vector), gf_ioc_mt_iovec);
if (vector == NULL) {
op_ret = -1;
- local->op_errno = ENOMEM;
+ op_errno = ENOMEM;
}
list_for_each_entry_safe (fill, next, &local->fill_list, list) {
@@ -850,7 +859,10 @@ ioc_frame_unwind (call_frame_t *frame)
copied += (fill->count * sizeof (*vector));
- iobref_merge (iobref, fill->iobref);
+ if (iobref_merge (iobref, fill->iobref)) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ }
}
list_del (&fill->list);
@@ -869,7 +881,8 @@ unwind:
// ioc_local_unlock (local);
- STACK_UNWIND_STRICT (readv, frame, op_ret, local->op_errno, vector,
+ frame->local = NULL;
+ STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector,
count, &stbuf, iobref, NULL);
if (iobref != NULL) {
@@ -882,7 +895,8 @@ unwind:
}
pthread_mutex_destroy (&local->local_lock);
- mem_put (local);
+ if (local)
+ mem_put (local);
return;
}
@@ -980,7 +994,7 @@ __ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno)
waitq = page->waitq;
page->waitq = NULL;
- gf_log (page->inode->table->xl->name, GF_LOG_WARNING,
+ gf_log (page->inode->table->xl->name, GF_LOG_DEBUG,
"page error for page = %p & waitq = %p", page, waitq);
for (trav = waitq; trav; trav = trav->next) {
@@ -1020,6 +1034,7 @@ ioc_waitq_t *
ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno)
{
ioc_waitq_t *waitq = NULL;
+ struct ioc_inode *inode = NULL;
if (page == NULL) {
goto out;
@@ -1027,9 +1042,10 @@ ioc_page_error (ioc_page_t *page, int32_t op_ret, int32_t op_errno)
ioc_inode_lock (page->inode);
{
+ inode = page->inode;
waitq = __ioc_page_error (page, op_ret, op_errno);
}
- ioc_inode_unlock (page->inode);
+ ioc_inode_unlock (inode);
out:
return waitq;
diff --git a/xlators/performance/io-threads/src/Makefile.am b/xlators/performance/io-threads/src/Makefile.am
index 0f5a3b181..d63042e7c 100644
--- a/xlators/performance/io-threads/src/Makefile.am
+++ b/xlators/performance/io-threads/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = io-threads.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-io_threads_la_LDFLAGS = -module -avoidversion
+io_threads_la_LDFLAGS = -module -avoid-version
io_threads_la_SOURCES = io-threads.c
io_threads_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c
index 2a5cec7ed..55bcfd16e 100644
--- a/xlators/performance/io-threads/src/io-threads.c
+++ b/xlators/performance/io-threads/src/io-threads.c
@@ -14,6 +14,7 @@
#endif
#include "call-stub.h"
+#include "defaults.h"
#include "glusterfs.h"
#include "logging.h"
#include "dict.h"
@@ -29,17 +30,92 @@ int iot_workers_scale (iot_conf_t *conf);
int __iot_workers_scale (iot_conf_t *conf);
struct volume_options options[];
+#define IOT_FOP(name, frame, this, args ...) \
+ do { \
+ call_stub_t *__stub = NULL; \
+ int __ret = -1; \
+ \
+ __stub = fop_##name##_stub(frame, default_##name##_resume, args); \
+ if (!__stub) { \
+ __ret = -ENOMEM; \
+ goto out; \
+ } \
+ \
+ __ret = iot_schedule (frame, this, __stub); \
+ \
+ out: \
+ if (__ret < 0) { \
+ default_##name##_failure_cbk (frame, -__ret); \
+ if (__stub != NULL) { \
+ call_stub_destroy (__stub); \
+ } \
+ } \
+ } while (0)
+
call_stub_t *
-__iot_dequeue (iot_conf_t *conf, int *pri)
+__iot_dequeue (iot_conf_t *conf, int *pri, struct timespec *sleep)
{
call_stub_t *stub = NULL;
int i = 0;
+ struct timeval curtv = {0,}, difftv = {0,};
*pri = -1;
+ sleep->tv_sec = 0;
+ sleep->tv_nsec = 0;
for (i = 0; i < IOT_PRI_MAX; i++) {
if (list_empty (&conf->reqs[i]) ||
(conf->ac_iot_count[i] >= conf->ac_iot_limit[i]))
continue;
+
+ if (i == IOT_PRI_LEAST) {
+ pthread_mutex_lock(&conf->throttle.lock);
+ if (!conf->throttle.sample_time.tv_sec) {
+ /* initialize */
+ gettimeofday(&conf->throttle.sample_time, NULL);
+ } else {
+ /*
+ * Maintain a running count of least priority
+ * operations that are handled over a particular
+ * time interval. The count is provided via
+ * state dump and is used as a measure against
+ * least priority op throttling.
+ */
+ gettimeofday(&curtv, NULL);
+ timersub(&curtv, &conf->throttle.sample_time,
+ &difftv);
+ if (difftv.tv_sec >= IOT_LEAST_THROTTLE_DELAY) {
+ conf->throttle.cached_rate =
+ conf->throttle.sample_cnt;
+ conf->throttle.sample_cnt = 0;
+ conf->throttle.sample_time = curtv;
+ }
+
+ /*
+ * If we're over the configured rate limit,
+ * provide an absolute time to the caller that
+ * represents the soonest we're allowed to
+ * return another least priority request.
+ */
+ if (conf->throttle.rate_limit &&
+ conf->throttle.sample_cnt >=
+ conf->throttle.rate_limit) {
+ struct timeval delay;
+ delay.tv_sec = IOT_LEAST_THROTTLE_DELAY;
+ delay.tv_usec = 0;
+
+ timeradd(&conf->throttle.sample_time,
+ &delay, &curtv);
+ TIMEVAL_TO_TIMESPEC(&curtv, sleep);
+
+ pthread_mutex_unlock(
+ &conf->throttle.lock);
+ break;
+ }
+ }
+ conf->throttle.sample_cnt++;
+ pthread_mutex_unlock(&conf->throttle.lock);
+ }
+
stub = list_entry (conf->reqs[i].next, call_stub_t, list);
conf->ac_iot_count[i]++;
*pri = i;
@@ -83,6 +159,7 @@ iot_worker (void *data)
int pri = -1;
char timeout = 0;
char bye = 0;
+ struct timespec sleep = {0,};
conf = data;
this = conf->this;
@@ -123,7 +200,13 @@ iot_worker (void *data)
}
}
- stub = __iot_dequeue (conf, &pri);
+ stub = __iot_dequeue (conf, &pri, &sleep);
+ if (!stub && (sleep.tv_sec || sleep.tv_nsec)) {
+ pthread_cond_timedwait(&conf->cond,
+ &conf->mutex, &sleep);
+ pthread_mutex_unlock(&conf->mutex);
+ continue;
+ }
}
pthread_mutex_unlock (&conf->mutex);
@@ -247,6 +330,9 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub)
case GF_FOP_XATTROP:
case GF_FOP_FXATTROP:
case GF_FOP_RCHECKSUM:
+ case GF_FOP_FALLOCATE:
+ case GF_FOP_DISCARD:
+ case GF_FOP_ZEROFILL:
pri = IOT_PRI_LO;
break;
@@ -255,90 +341,23 @@ iot_schedule (call_frame_t *frame, xlator_t *this, call_stub_t *stub)
case GF_FOP_RELEASE:
case GF_FOP_RELEASEDIR:
case GF_FOP_GETSPEC:
+ case GF_FOP_IPC:
case GF_FOP_MAXVALUE:
//fail compilation on missing fop
//new fop must choose priority.
break;
}
out:
- ret = do_iot_schedule (this->private, stub, pri);
gf_log (this->name, GF_LOG_DEBUG, "%s scheduled as %s fop",
gf_fop_list[stub->fop], iot_get_pri_meaning (pri));
+ ret = do_iot_schedule (this->private, stub, pri);
return ret;
}
int
-iot_lookup_cbk (call_frame_t *frame, void * cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xdata,
- struct iatt *postparent)
-{
- STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, xdata,
- postparent);
- return 0;
-}
-
-
-int
-iot_lookup_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- dict_t *xdata)
-{
- STACK_WIND (frame, iot_lookup_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->lookup,
- loc, xdata);
- return 0;
-}
-
-
-int
iot_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_lookup_stub (frame, iot_lookup_wrapper, loc, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create lookup stub (out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- STACK_UNWIND_STRICT (lookup, frame, -1, -ret, NULL, NULL, NULL,
- NULL);
- }
-
- return 0;
-}
-
-
-int
-iot_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (setattr, frame, op_ret, op_errno, preop, postop,
- xdata);
- return 0;
-}
-
-
-int
-iot_setattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *stbuf, int32_t valid, dict_t *xdata)
-{
- STACK_WIND (frame, iot_setattr_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->setattr,
- loc, stbuf, valid, xdata);
+ IOT_FOP (lookup, frame, this, loc, xdata);
return 0;
}
@@ -347,51 +366,7 @@ int
iot_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_setattr_stub (frame, iot_setattr_wrapper, loc, stbuf, valid,
- xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "Cannot create setattr stub"
- "(Out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
-
- STACK_UNWIND_STRICT (setattr, frame, -1, -ret, NULL, NULL, NULL);
- }
-
- return 0;
-}
-
-
-int
-iot_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fsetattr, frame, op_ret, op_errno, preop, postop,
- xdata);
- return 0;
-}
-
-
-int
-iot_fsetattr_wrapper (call_frame_t *frame, xlator_t *this,
- fd_t *fd, struct iatt *stbuf, int32_t valid, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fsetattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsetattr, fd, stbuf, valid,
- xdata);
+ IOT_FOP (setattr, frame, this, loc, stbuf, valid, xdata);
return 0;
}
@@ -400,47 +375,7 @@ int
iot_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iatt *stbuf, int32_t valid, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fsetattr_stub (frame, iot_fsetattr_wrapper, fd, stbuf,
- valid, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create fsetattr stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fsetattr, frame, -1, -ret, NULL, NULL,
- NULL);
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (access, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_access_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t mask, dict_t *xdata)
-{
- STACK_WIND (frame, iot_access_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->access, loc, mask, xdata);
+ IOT_FOP (fsetattr, frame, this, fd, stbuf, valid, xdata);
return 0;
}
@@ -449,49 +384,7 @@ int
iot_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_access_stub (frame, iot_access_wrapper, loc, mask, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create access stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (access, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, const char *path,
- struct iatt *stbuf, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (readlink, frame, op_ret, op_errno, path, stbuf,
- xdata);
- return 0;
-}
-
-
-int
-iot_readlink_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- size_t size, dict_t *xdata)
-{
- STACK_WIND (frame, iot_readlink_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->readlink,
- loc, size, xdata);
+ IOT_FOP (access, frame, this, loc, mask, xdata);
return 0;
}
@@ -499,51 +392,7 @@ iot_readlink_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
int
iot_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_readlink_stub (frame, iot_readlink_wrapper, loc, size, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create readlink stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (readlink, frame, -1, -ret, NULL, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
- return 0;
-}
-
-
-int
-iot_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (mknod, frame, op_ret, op_errno, inode, buf,
- preparent, postparent, xdata);
- return 0;
-}
-
-
-int
-iot_mknod_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dev_t rdev, mode_t umask, dict_t *xdata)
-{
- STACK_WIND (frame, iot_mknod_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->mknod, loc, mode, rdev, umask,
- xdata);
+ IOT_FOP (readlink, frame, this, loc, size, xdata);
return 0;
}
@@ -552,51 +401,7 @@ int
iot_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
dev_t rdev, mode_t umask, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_mknod_stub (frame, iot_mknod_wrapper, loc, mode, rdev,
- umask, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create mknod stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (mknod, frame, -1, -ret, NULL, NULL, NULL,
- NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_mkdir_cbk (call_frame_t *frame, void * cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (mkdir, frame, op_ret, op_errno, inode, buf,
- preparent, postparent, xdata);
- return 0;
-}
-
-
-int
-iot_mkdir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- mode_t umask, dict_t *xdata)
-{
- STACK_WIND (frame, iot_mkdir_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->mkdir, loc, mode, umask, xdata);
+ IOT_FOP (mknod, frame, this, loc, mode, rdev, umask, xdata);
return 0;
}
@@ -605,49 +410,7 @@ int
iot_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
mode_t umask, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_mkdir_stub (frame, iot_mkdir_wrapper, loc, mode, umask,
- xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create mkdir stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (mkdir, frame, -1, -ret, NULL, NULL, NULL,
- NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (rmdir, frame, op_ret, op_errno, preparent,
- postparent, xdata);
- return 0;
-}
-
-
-int
-iot_rmdir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata)
-{
- STACK_WIND (frame, iot_rmdir_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->rmdir, loc, flags, xdata);
+ IOT_FOP (mkdir, frame, this, loc, mode, umask, xdata);
return 0;
}
@@ -655,49 +418,7 @@ iot_rmdir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, d
int
iot_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_rmdir_stub (frame, iot_rmdir_wrapper, loc, flags, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create rmdir stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (rmdir, frame, -1, -ret, NULL, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_symlink_cbk (call_frame_t *frame, void * cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (symlink, frame, op_ret, op_errno, inode, buf,
- preparent, postparent, xdata);
- return 0;
-}
-
-
-int
-iot_symlink_wrapper (call_frame_t *frame, xlator_t *this, const char *linkname,
- loc_t *loc, mode_t umask, dict_t *xdata)
-{
- STACK_WIND (frame, iot_symlink_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->symlink, linkname, loc, umask,
- xdata);
+ IOT_FOP (rmdir, frame, this, loc, flags, xdata);
return 0;
}
@@ -706,52 +427,7 @@ int
iot_symlink (call_frame_t *frame, xlator_t *this, const char *linkname,
loc_t *loc, mode_t umask, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_symlink_stub (frame, iot_symlink_wrapper, linkname, loc,
- umask, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create symlink stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (symlink, frame, -1, -ret, NULL, NULL, NULL,
- NULL, NULL);
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
- return 0;
-}
-
-
-int
-iot_rename_cbk (call_frame_t *frame, void * cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (rename, frame, op_ret, op_errno, buf, preoldparent,
- postoldparent, prenewparent, postnewparent, xdata);
- return 0;
-}
-
-
-int
-iot_rename_wrapper (call_frame_t *frame, xlator_t *this, loc_t *oldloc,
- loc_t *newloc, dict_t *xdata)
-{
- STACK_WIND (frame, iot_rename_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->rename, oldloc, newloc, xdata);
+ IOT_FOP (symlink, frame, this, linkname, loc, umask, xdata);
return 0;
}
@@ -760,106 +436,17 @@ int
iot_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_rename_stub (frame, iot_rename_wrapper, oldloc, newloc, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_DEBUG, "cannot create rename stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (rename, frame, -1, -ret, NULL, NULL, NULL,
- NULL, NULL, NULL);
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
+ IOT_FOP (rename, frame, this, oldloc, newloc, xdata);
return 0;
}
int
-iot_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, fd_t *fd, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (open, frame, op_ret, op_errno, fd, xdata);
- return 0;
-}
-
-
-int
-iot_open_wrapper (call_frame_t * frame, xlator_t * this, loc_t *loc,
- int32_t flags, fd_t * fd, dict_t *xdata)
-{
- STACK_WIND (frame, iot_open_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->open, loc, flags, fd,
- xdata);
- return 0;
-}
-
-
-int
iot_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
fd_t *fd, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_open_stub (frame, iot_open_wrapper, loc, flags, fd,
- xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create open call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (open, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
- return 0;
-}
-
-
-int
-iot_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
- struct iatt *stbuf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (create, frame, op_ret, op_errno, fd, inode, stbuf,
- preparent, postparent, xdata);
- return 0;
-}
-
-
-int
-iot_create_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t flags, mode_t mode, mode_t umask, fd_t *fd,
- dict_t *xdata)
-{
- STACK_WIND (frame, iot_create_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->create,
- loc, flags, mode, umask, fd, xdata);
- return 0;
+ IOT_FOP (open, frame, this, loc, flags, fd, xdata);
+ return 0;
}
@@ -867,159 +454,25 @@ int
iot_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_create_stub (frame, iot_create_wrapper, loc, flags, mode,
- umask, fd, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create \"create\" call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (create, frame, -1, -ret, NULL, NULL, NULL,
- NULL, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
+ IOT_FOP (create, frame, this, loc, flags, mode, umask, fd, xdata);
return 0;
}
int
-iot_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iovec *vector,
- int32_t count, struct iatt *stbuf, struct iobref *iobref,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, vector, count,
- stbuf, iobref, xdata);
-
- return 0;
-}
-
-
-int
-iot_readv_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset, uint32_t flags, dict_t *xdata)
-{
- STACK_WIND (frame, iot_readv_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readv,
- fd, size, offset, flags, xdata);
- return 0;
-}
-
-
-int
iot_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, uint32_t flags, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_readv_stub (frame, iot_readv_wrapper, fd, size, offset,
- flags, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create readv call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (readv, frame, -1, -ret, NULL, -1, NULL,
- NULL, NULL);
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (flush, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
-{
- STACK_WIND (frame, iot_flush_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush,
- fd, xdata);
- return 0;
+ IOT_FOP (readv, frame, this, fd, size, offset, flags, xdata);
+ return 0;
}
int
iot_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_flush_stub (frame, iot_flush_wrapper, fd, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create flush_cbk call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (flush, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fsync, frame, op_ret, op_errno, prebuf, postbuf,
- xdata);
- return 0;
-}
-
-
-int
-iot_fsync_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t datasync, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fsync_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsync,
- fd, datasync, xdata);
- return 0;
+ IOT_FOP (flush, frame, this, fd, xdata);
+ return 0;
}
@@ -1027,54 +480,8 @@ int
iot_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fsync_stub (frame, iot_fsync_wrapper, fd, datasync, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create fsync_cbk call stub"
- "(out of memory)");
- ret = -1;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fsync, frame, -1, -ret, NULL, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf,
- xdata);
- return 0;
-}
-
-
-int
-iot_writev_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count,
- off_t offset, uint32_t flags, struct iobref *iobref,
- dict_t *xdata)
-{
- STACK_WIND (frame, iot_writev_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->writev,
- fd, vector, count, offset, flags, iobref, xdata);
- return 0;
+ IOT_FOP (fsync, frame, this, fd, datasync, xdata);
+ return 0;
}
@@ -1083,53 +490,9 @@ iot_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t offset,
uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_writev_stub (frame, iot_writev_wrapper, fd, vector,
- count, offset, flags, iobref, xdata);
-
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create writev call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (writev, frame, -1, -ret, NULL, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
- return 0;
-}
-
-
-int32_t
-iot_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *flock,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (lk, frame, op_ret, op_errno, flock, xdata);
- return 0;
-}
-
-
-int
-iot_lk_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t cmd, struct gf_flock *flock, dict_t *xdata)
-{
- STACK_WIND (frame, iot_lk_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lk,
- fd, cmd, flock, xdata);
- return 0;
+ IOT_FOP (writev, frame, this, fd, vector, count, offset,
+ flags, iobref, xdata);
+ return 0;
}
@@ -1137,149 +500,24 @@ int
iot_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
struct gf_flock *flock, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_lk_stub (frame, iot_lk_wrapper, fd, cmd, flock, xdata);
-
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create fop_lk call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (lk, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (stat, frame, op_ret, op_errno, buf, xdata);
- return 0;
-}
-
-
-int
-iot_stat_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
-{
- STACK_WIND (frame, iot_stat_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->stat,
- loc, xdata);
- return 0;
+ IOT_FOP (lk, frame, this, fd, cmd, flock, xdata);
+ return 0;
}
int
iot_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_stat_stub (frame, iot_stat_wrapper, loc, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create fop_stat call stub"
- "(out of memory)");
- ret = -1;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (stat, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fstat, frame, op_ret, op_errno, buf, xdata);
- return 0;
-}
-
-
-int
-iot_fstat_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fstat_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fstat,
- fd, xdata);
- return 0;
+ IOT_FOP (stat, frame, this, loc, xdata);
+ return 0;
}
int
iot_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fstat_stub (frame, iot_fstat_wrapper, fd, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create fop_fstat call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fstat, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf,
- postbuf, xdata);
- return 0;
-}
-
-
-int
-iot_truncate_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- off_t offset, dict_t *xdata)
-{
- STACK_WIND (frame, iot_truncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->truncate,
- loc, offset, xdata);
- return 0;
+ IOT_FOP (fstat, frame, this, fd, xdata);
+ return 0;
}
@@ -1287,55 +525,8 @@ int
iot_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
dict_t *xdata)
{
- call_stub_t *stub;
- int ret = -1;
-
- stub = fop_truncate_stub (frame, iot_truncate_wrapper, loc, offset,
- xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create fop_stat call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (truncate, frame, -1, -ret, NULL, NULL,
- NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
- return 0;
-}
-
-
-int
-iot_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf,
- postbuf, xdata);
- return 0;
-}
-
-
-int
-iot_ftruncate_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- off_t offset, dict_t *xdata)
-{
- STACK_WIND (frame, iot_ftruncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate,
- fd, offset, xdata);
- return 0;
+ IOT_FOP (truncate, frame, this, loc, offset, xdata);
+ return 0;
}
@@ -1343,106 +534,17 @@ int
iot_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_ftruncate_stub (frame, iot_ftruncate_wrapper, fd, offset,
- xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create fop_ftruncate call stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (ftruncate, frame, -1, -ret, NULL, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-
-int
-iot_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (unlink, frame, op_ret, op_errno, preparent,
- postparent, xdata);
- return 0;
+ IOT_FOP (ftruncate, frame, this, fd, offset, xdata);
+ return 0;
}
-int
-iot_unlink_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int32_t xflag, dict_t *xdata)
-{
- STACK_WIND (frame, iot_unlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink,
- loc, xflag, xdata);
- return 0;
-}
-
int
iot_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t xflag,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_unlink_stub (frame, iot_unlink_wrapper, loc, xflag, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,
- "cannot create fop_unlink call stub"
- "(out of memory)");
- ret = -1;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (unlink, frame, -1, -ret, NULL, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
- return 0;
-}
-
-
-int
-iot_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (link, frame, op_ret, op_errno, inode, buf,
- preparent, postparent, xdata);
- return 0;
-}
-
-
-int
-iot_link_wrapper (call_frame_t *frame, xlator_t *this, loc_t *old, loc_t *new,
- dict_t *xdata)
-{
- STACK_WIND (frame, iot_link_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->link, old, new, xdata);
-
+ IOT_FOP (unlink, frame, this, loc, xflag, xdata);
return 0;
}
@@ -1451,46 +553,7 @@ int
iot_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_link_stub (frame, iot_link_wrapper, oldloc, newloc, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create link stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (link, frame, -1, -ret, NULL, NULL, NULL,
- NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (opendir, frame, op_ret, op_errno, fd, xdata);
- return 0;
-}
-
-
-int
-iot_opendir_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
- dict_t *xdata)
-{
- STACK_WIND (frame, iot_opendir_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->opendir, loc, fd, xdata);
+ IOT_FOP (link, frame, this, oldloc, newloc, xdata);
return 0;
}
@@ -1499,45 +562,7 @@ int
iot_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_opendir_stub (frame, iot_opendir_wrapper, loc, fd, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create opendir stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (opendir, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fsyncdir, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_fsyncdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int datasync, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fsyncdir_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsyncdir, fd, datasync, xdata);
+ IOT_FOP (opendir, frame, this, loc, fd, xdata);
return 0;
}
@@ -1546,47 +571,7 @@ int
iot_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fsyncdir_stub (frame, iot_fsyncdir_wrapper, fd, datasync,
- xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create fsyncdir stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fsyncdir, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct statvfs *buf,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (statfs, frame, op_ret, op_errno, buf, xdata);
- return 0;
-}
-
-
-int
-iot_statfs_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- dict_t *xdata)
-{
- STACK_WIND (frame, iot_statfs_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->statfs, loc, xdata);
+ IOT_FOP (fsyncdir, frame, this, fd, datasync, xdata);
return 0;
}
@@ -1594,45 +579,7 @@ iot_statfs_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
int
iot_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_statfs_stub (frame, iot_statfs_wrapper, loc, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create statfs stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (statfs, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (setxattr, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_setxattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- dict_t *dict, int32_t flags, dict_t *xdata)
-{
- STACK_WIND (frame, iot_setxattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->setxattr, loc, dict, flags, xdata);
+ IOT_FOP (statfs, frame, this, loc, xdata);
return 0;
}
@@ -1641,47 +588,7 @@ int
iot_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
int32_t flags, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_setxattr_stub (frame, iot_setxattr_wrapper, loc, dict,
- flags, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create setxattr stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (setxattr, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (getxattr, frame, op_ret, op_errno, dict, xdata);
- return 0;
-}
-
-
-int
-iot_getxattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- const char *name, dict_t *xdata)
-{
- STACK_WIND (frame, iot_getxattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->getxattr, loc, name, xdata);
+ IOT_FOP (setxattr, frame, this, loc, dict, flags, xdata);
return 0;
}
@@ -1690,47 +597,7 @@ int
iot_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
const char *name, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_getxattr_stub (frame, iot_getxattr_wrapper, loc, name, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create getxattr stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (getxattr, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fgetxattr, frame, op_ret, op_errno, dict, xdata);
- return 0;
-}
-
-
-int
-iot_fgetxattr_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- const char *name, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fgetxattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fgetxattr, fd, name, xdata);
+ IOT_FOP (getxattr, frame, this, loc, name, xdata);
return 0;
}
@@ -1739,46 +606,7 @@ int
iot_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
const char *name, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fgetxattr_stub (frame, iot_fgetxattr_wrapper, fd, name, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create fgetxattr stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fgetxattr, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fsetxattr, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_fsetxattr_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- dict_t *dict, int32_t flags, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fsetxattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsetxattr, fd, dict, flags,
- xdata);
+ IOT_FOP (fgetxattr, frame, this, fd, name, xdata);
return 0;
}
@@ -1787,46 +615,7 @@ int
iot_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
int32_t flags, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fsetxattr_stub (frame, iot_fsetxattr_wrapper, fd, dict,
- flags, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create fsetxattr stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fsetxattr, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (removexattr, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_removexattr_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- const char *name, dict_t *xdata)
-{
- STACK_WIND (frame, iot_removexattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->removexattr, loc, name, xdata);
+ IOT_FOP (fsetxattr, frame, this, fd, dict, flags, xdata);
return 0;
}
@@ -1835,94 +624,15 @@ int
iot_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
const char *name, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_removexattr_stub (frame, iot_removexattr_wrapper, loc,
- name, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,"cannot get removexattr fop"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (removexattr, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-int
-iot_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fremovexattr, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_fremovexattr_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- const char *name, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fremovexattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fremovexattr, fd, name, xdata);
+ IOT_FOP (removexattr, frame, this, loc, name, xdata);
return 0;
}
-
int
iot_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
const char *name, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fremovexattr_stub (frame, iot_fremovexattr_wrapper, fd,
- name, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR,"cannot get fremovexattr fop"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fremovexattr, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata);
- return 0;
-}
-
-
-int
-iot_readdirp_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- size_t size, off_t offset, dict_t *xdata)
-{
- STACK_WIND (frame, iot_readdirp_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->readdirp, fd, size, offset, xdata);
+ IOT_FOP (fremovexattr, frame, this, fd, name, xdata);
return 0;
}
@@ -1931,47 +641,7 @@ int
iot_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_readdirp_stub (frame, iot_readdirp_wrapper, fd, size,
- offset, xdata);
- if (!stub) {
- gf_log (this->private, GF_LOG_ERROR,"cannot get readdir stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (readdirp, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int
-iot_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
- dict_t *xdata)
-{
- STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, entries, xdata);
- return 0;
-}
-
-
-int
-iot_readdir_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- size_t size, off_t offset, dict_t *xdata)
-{
- STACK_WIND (frame, iot_readdir_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->readdir, fd, size, offset, xdata);
+ IOT_FOP (readdirp, frame, this, fd, size, offset, xdata);
return 0;
}
@@ -1980,369 +650,94 @@ int
iot_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
off_t offset, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_readdir_stub (frame, iot_readdir_wrapper, fd, size, offset,
- xdata);
- if (!stub) {
- gf_log (this->private, GF_LOG_ERROR,"cannot get readdir stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (readdir, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-int
-iot_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (inodelk, frame, op_ret, op_errno, xdata);
+ IOT_FOP (readdir, frame, this, fd, size, offset, xdata);
return 0;
}
-
-int
-iot_inodelk_wrapper (call_frame_t *frame, xlator_t *this, const char *volume,
- loc_t *loc, int32_t cmd, struct gf_flock *lock,
- dict_t *xdata)
-{
- STACK_WIND (frame, iot_inodelk_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->inodelk, volume, loc, cmd, lock,
- xdata);
- return 0;
-}
-
-
int
iot_inodelk (call_frame_t *frame, xlator_t *this,
const char *volume, loc_t *loc, int32_t cmd, struct gf_flock *lock,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_inodelk_stub (frame, iot_inodelk_wrapper,
- volume, loc, cmd, lock, xdata);
- if (!stub) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (inodelk, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-int
-iot_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (finodelk, frame, op_ret, op_errno, xdata);
+ IOT_FOP (inodelk, frame, this, volume, loc, cmd, lock, xdata);
return 0;
}
-
-int
-iot_finodelk_wrapper (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd,
- struct gf_flock *lock, dict_t *xdata)
-{
- STACK_WIND (frame, iot_finodelk_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->finodelk, volume, fd, cmd, lock,
- xdata);
- return 0;
-}
-
-
int
iot_finodelk (call_frame_t *frame, xlator_t *this,
const char *volume, fd_t *fd, int32_t cmd, struct gf_flock *lock,
dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_finodelk_stub (frame, iot_finodelk_wrapper,
- volume, fd, cmd, lock, xdata);
- if (!stub) {
- gf_log (this->private, GF_LOG_ERROR,"cannot get finodelk stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (finodelk, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
+ IOT_FOP (finodelk, frame, this, volume, fd, cmd, lock, xdata);
return 0;
}
int
-iot_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (entrylk, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_entrylk_wrapper (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
-{
- STACK_WIND (frame, iot_entrylk_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->entrylk,
- volume, loc, basename, cmd, type, xdata);
- return 0;
-}
-
-
-int
iot_entrylk (call_frame_t *frame, xlator_t *this,
const char *volume, loc_t *loc, const char *basename,
entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_entrylk_stub (frame, iot_entrylk_wrapper,
- volume, loc, basename, cmd, type, xdata);
- if (!stub) {
- gf_log (this->private, GF_LOG_ERROR,"cannot get entrylk stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (entrylk, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-int
-iot_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (fentrylk, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int
-iot_fentrylk_wrapper (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, const char *basename,
- entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
-{
- STACK_WIND (frame, iot_fentrylk_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fentrylk,
- volume, fd, basename, cmd, type, xdata);
+ IOT_FOP (entrylk, frame, this, volume, loc, basename, cmd, type, xdata);
return 0;
}
-
int
iot_fentrylk (call_frame_t *frame, xlator_t *this,
const char *volume, fd_t *fd, const char *basename,
entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fentrylk_stub (frame, iot_fentrylk_wrapper,
- volume, fd, basename, cmd, type, xdata);
- if (!stub) {
- gf_log (this->private, GF_LOG_ERROR,"cannot get fentrylk stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fentrylk, frame, -1, -ret, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
+ IOT_FOP (fentrylk, frame, this, volume, fd, basename, cmd, type, xdata);
return 0;
}
int
-iot_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata)
+iot_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
{
- STACK_UNWIND_STRICT (xattrop, frame, op_ret, op_errno, xattr, xdata);
+ IOT_FOP (xattrop, frame, this, loc, optype, xattr, xdata);
return 0;
}
int
-iot_xattrop_wrapper (call_frame_t *frame, xlator_t *this, loc_t *loc,
- gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+iot_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
{
- STACK_WIND (frame, iot_xattrop_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->xattrop, loc, optype, xattr, xdata);
+ IOT_FOP (fxattrop, frame, this, fd, optype, xattr, xdata);
return 0;
}
-int
-iot_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
- gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+int32_t
+iot_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ int32_t len, dict_t *xdata)
{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_xattrop_stub (frame, iot_xattrop_wrapper, loc, optype,
- xattr, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create xattrop stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (xattrop, frame, -1, -ret, NULL, NULL);
-
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
+ IOT_FOP (rchecksum, frame, this, fd, offset, len, xdata);
return 0;
}
-
int
-iot_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xattr, dict_t *xdata)
+iot_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
{
- STACK_UNWIND_STRICT (fxattrop, frame, op_ret, op_errno, xattr, xdata);
+ IOT_FOP (fallocate, frame, this, fd, mode, offset, len, xdata);
return 0;
}
int
-iot_fxattrop_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+iot_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
{
- STACK_WIND (frame, iot_fxattrop_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fxattrop, fd, optype, xattr, xdata);
+ IOT_FOP (discard, frame, this, fd, offset, len, xdata);
return 0;
}
-
int
-iot_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
- gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
-{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_fxattrop_stub (frame, iot_fxattrop_wrapper, fd, optype,
- xattr, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create fxattrop stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (fxattrop, frame, -1, -ret, NULL, NULL);
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
- return 0;
-}
-
-
-int32_t
-iot_rchecksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, uint32_t weak_checksum,
- uint8_t *strong_checksum, dict_t *xdata)
-{
- STACK_UNWIND_STRICT (rchecksum, frame, op_ret, op_errno, weak_checksum,
- strong_checksum, xdata);
- return 0;
-}
-
-
-int32_t
-iot_rchecksum_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- off_t offset, int32_t len, dict_t *xdata)
+iot_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
{
- STACK_WIND (frame, iot_rchecksum_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rchecksum, fd, offset, len, xdata);
- return 0;
-}
-
-
-int32_t
-iot_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- int32_t len, dict_t *xdata)
-{
- call_stub_t *stub = NULL;
- int ret = -1;
-
- stub = fop_rchecksum_stub (frame, iot_rchecksum_wrapper, fd, offset,
- len, xdata);
- if (!stub) {
- gf_log (this->name, GF_LOG_ERROR, "cannot create rchecksum stub"
- "(out of memory)");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = iot_schedule (frame, this, stub);
-out:
- if (ret < 0) {
- STACK_UNWIND_STRICT (rchecksum, frame, -1, -ret, -1, NULL, NULL);
- if (stub != NULL) {
- call_stub_destroy (stub);
- }
- }
-
+ IOT_FOP (zerofill, frame, this, fd, offset, len, xdata);
return 0;
}
@@ -2372,7 +767,7 @@ __iot_workers_scale (iot_conf_t *conf)
while (diff) {
diff --;
- ret = pthread_create (&thread, &conf->w_attr, iot_worker, conf);
+ ret = gf_thread_create (&thread, &conf->w_attr, iot_worker, conf);
if (ret == 0) {
conf->curr_count++;
gf_log (conf->this->name, GF_LOG_DEBUG,
@@ -2485,6 +880,10 @@ iot_priv_dump (xlator_t *this)
gf_proc_dump_write("least_priority_threads", "%d",
conf->ac_iot_limit[IOT_PRI_LEAST]);
+ gf_proc_dump_write("cached least rate", "%u",
+ conf->throttle.cached_rate);
+ gf_proc_dump_write("least rate limit", "%u", conf->throttle.rate_limit);
+
return 0;
}
@@ -2516,6 +915,9 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("enable-least-priority", conf->least_priority,
options, bool, out);
+ GF_OPTION_RECONF("least-rate-limit", conf->throttle.rate_limit, options,
+ int32, out);
+
ret = 0;
out:
return ret;
@@ -2580,6 +982,14 @@ init (xlator_t *this)
GF_OPTION_INIT ("enable-least-priority", conf->least_priority,
bool, out);
+ GF_OPTION_INIT("least-rate-limit", conf->throttle.rate_limit, int32,
+ out);
+ if ((ret = pthread_mutex_init(&conf->throttle.lock, NULL)) != 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "pthread_mutex_init failed (%d)", ret);
+ goto out;
+ }
+
conf->this = this;
for (i = 0; i < IOT_PRI_MAX; i++) {
@@ -2661,10 +1071,12 @@ struct xlator_fops fops = {
.xattrop = iot_xattrop,
.fxattrop = iot_fxattrop,
.rchecksum = iot_rchecksum,
+ .fallocate = iot_fallocate,
+ .discard = iot_discard,
+ .zerofill = iot_zerofill,
};
-struct xlator_cbks cbks = {
-};
+struct xlator_cbks cbks;
struct volume_options options[] = {
{ .key = {"thread-count"},
@@ -2722,6 +1134,14 @@ struct volume_options options[] = {
.max = 0x7fffffff,
.default_value = "120",
},
+ {.key = {"least-rate-limit"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = INT_MAX,
+ .default_value = "0",
+ .description = "Max number of least priority operations to handle "
+ "per-second"
+ },
{ .key = {NULL},
},
};
diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h
index a6b640884..1a9dee9ae 100644
--- a/xlators/performance/io-threads/src/io-threads.h
+++ b/xlators/performance/io-threads/src/io-threads.h
@@ -53,6 +53,14 @@ typedef enum {
IOT_PRI_MAX,
} iot_pri_t;
+#define IOT_LEAST_THROTTLE_DELAY 1 /* sample interval in seconds */
+struct iot_least_throttle {
+ struct timeval sample_time; /* timestamp of current sample */
+ uint32_t sample_cnt; /* sample count for active interval */
+ uint32_t cached_rate; /* the most recently measured rate */
+ int32_t rate_limit; /* user-specified rate limit */
+ pthread_mutex_t lock;
+};
struct iot_conf {
pthread_mutex_t mutex;
@@ -75,6 +83,8 @@ struct iot_conf {
xlator_t *this;
size_t stack_size;
+
+ struct iot_least_throttle throttle;
};
typedef struct iot_conf iot_conf_t;
diff --git a/xlators/performance/md-cache/src/Makefile.am b/xlators/performance/md-cache/src/Makefile.am
index bd09c15c2..8c9f5a858 100644
--- a/xlators/performance/md-cache/src/Makefile.am
+++ b/xlators/performance/md-cache/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = md-cache.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-md_cache_la_LDFLAGS = -module -avoidversion
+md_cache_la_LDFLAGS = -module -avoid-version
md_cache_la_SOURCES = md-cache.c
md_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/performance/md-cache/src/md-cache.c b/xlators/performance/md-cache/src/md-cache.c
index 237acab9f..ef156e309 100644
--- a/xlators/performance/md-cache/src/md-cache.c
+++ b/xlators/performance/md-cache/src/md-cache.c
@@ -18,6 +18,7 @@
#include "dict.h"
#include "xlator.h"
#include "md-cache-mem-types.h"
+#include "glusterfs-acl.h"
#include <assert.h>
#include <sys/time.h>
@@ -32,6 +33,7 @@ struct mdc_conf {
int timeout;
gf_boolean_t cache_posix_acl;
gf_boolean_t cache_selinux;
+ gf_boolean_t force_readdirp;
};
@@ -41,17 +43,17 @@ static struct mdc_key {
int check;
} mdc_keys[] = {
{
- .name = "system.posix_acl_access",
+ .name = POSIX_ACL_ACCESS_XATTR,
.load = 0,
.check = 1,
},
{
- .name = "system.posix_acl_default",
+ .name = POSIX_ACL_DEFAULT_XATTR,
.load = 0,
.check = 1,
},
{
- .name = "security.selinux",
+ .name = GF_SELINUX_XATTR_KEY,
.load = 0,
.check = 1,
},
@@ -65,7 +67,11 @@ static struct mdc_key {
.load = 0,
.check = 1,
},
- {},
+ {
+ .name = NULL,
+ .load = 0,
+ .check = 0,
+ }
};
@@ -127,6 +133,7 @@ struct mdc_local {
loc_t loc2;
fd_t *fd;
char *linkname;
+ char *key;
dict_t *xattr;
};
@@ -169,7 +176,7 @@ __mdc_inode_ctx_set (xlator_t *this, inode_t *inode, struct md_cache *mdc)
uint64_t mdc_int = 0;
mdc_int = (long) mdc;
- ret = __inode_ctx_set2 (inode, this, &mdc_int, 0);
+ ret = __inode_ctx_set (inode, this, &mdc_int);
return ret;
}
@@ -224,6 +231,8 @@ mdc_local_wipe (xlator_t *this, mdc_local_t *local)
GF_FREE (local->linkname);
+ GF_FREE (local->key);
+
if (local->xattr)
dict_unref (local->xattr);
@@ -580,6 +589,31 @@ out:
int
+mdc_inode_xatt_unset (xlator_t *this, inode_t *inode, char *name)
+{
+ int ret = -1;
+ struct md_cache *mdc = NULL;
+
+ mdc = mdc_inode_prep (this, inode);
+ if (!mdc)
+ goto out;
+
+ if (!name)
+ goto out;
+
+ LOCK (&mdc->lock);
+ {
+ dict_del (mdc->xattr, name);
+ }
+ UNLOCK (&mdc->lock);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+
+int
mdc_inode_xatt_get (xlator_t *this, inode_t *inode, dict_t **dict)
{
int ret = -1;
@@ -593,13 +627,15 @@ mdc_inode_xatt_get (xlator_t *this, inode_t *inode, dict_t **dict)
LOCK (&mdc->lock);
{
+ ret = 0;
+ /* Missing xattr only means no keys were there, i.e
+ a negative cache for the "loaded" keys
+ */
if (!mdc->xattr)
goto unlock;
if (dict)
*dict = dict_ref (mdc->xattr);
-
- ret = 0;
}
unlock:
UNLOCK (&mdc->lock);
@@ -609,6 +645,46 @@ out:
}
+int
+mdc_inode_iatt_invalidate (xlator_t *this, inode_t *inode)
+{
+ int ret = -1;
+ struct md_cache *mdc = NULL;
+
+ if (mdc_inode_ctx_get (this, inode, &mdc) != 0)
+ goto out;
+
+ LOCK (&mdc->lock);
+ {
+ mdc->ia_time = 0;
+ }
+ UNLOCK (&mdc->lock);
+
+out:
+ return ret;
+}
+
+
+int
+mdc_inode_xatt_invalidate (xlator_t *this, inode_t *inode)
+{
+ int ret = -1;
+ struct md_cache *mdc = NULL;
+
+ if (mdc_inode_ctx_get (this, inode, &mdc) != 0)
+ goto out;
+
+ LOCK (&mdc->lock);
+ {
+ mdc->xa_time = 0;
+ }
+ UNLOCK (&mdc->lock);
+
+out:
+ return ret;
+}
+
+
void
mdc_load_reqs (xlator_t *this, dict_t *dict)
{
@@ -642,7 +718,7 @@ is_mdc_key_satisfied (const char *key)
return 0;
for (mdc_key = mdc_keys[i].name; (mdc_key = mdc_keys[i].name); i++) {
- if (!mdc_keys[i].check)
+ if (!mdc_keys[i].load)
continue;
if (strcmp (mdc_key, key) == 0)
return 1;
@@ -716,6 +792,7 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
struct iatt stbuf = {0, };
struct iatt postparent = {0, };
dict_t *xattr_rsp = NULL;
+ dict_t *xattr_alloc = NULL;
mdc_local_t *local = NULL;
@@ -723,6 +800,13 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
if (!local)
goto uncached;
+ if (!loc->name)
+ /* A nameless discovery is dangerous to cache. We
+ perform nameless lookup with the intention of
+ re-establishing an inode "properly"
+ */
+ goto uncached;
+
loc_copy (&local->loc, loc);
ret = mdc_inode_iatt_get (this, loc->inode, &stbuf);
@@ -747,6 +831,8 @@ mdc_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
return 0;
uncached:
+ if (!xdata)
+ xdata = xattr_alloc = dict_new ();
if (xdata)
mdc_load_reqs (this, xdata);
@@ -755,7 +841,8 @@ uncached:
if (xattr_rsp)
dict_unref (xattr_rsp);
-
+ if (xattr_alloc)
+ dict_unref (xattr_alloc);
return 0;
}
@@ -1568,6 +1655,8 @@ mdc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
mdc_inode_xatt_update (this, local->loc.inode, local->xattr);
+ mdc_inode_iatt_invalidate (this, local->loc.inode);
+
out:
MDC_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
@@ -1609,6 +1698,7 @@ mdc_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
mdc_inode_xatt_update (this, local->fd->inode, local->xattr);
+ mdc_inode_iatt_invalidate (this, local->fd->inode);
out:
MDC_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata);
@@ -1661,6 +1751,7 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
dict_t *xdata)
{
int ret;
+ int op_errno = ENODATA;
mdc_local_t *local = NULL;
dict_t *xattr = NULL;
@@ -1677,10 +1768,12 @@ mdc_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
if (ret != 0)
goto uncached;
- if (!dict_get (xattr, (char *)key))
- goto uncached;
+ if (!xattr || !dict_get (xattr, (char *)key)) {
+ ret = -1;
+ op_errno = ENODATA;
+ }
- MDC_STACK_UNWIND (getxattr, frame, 0, 0, xattr, xdata);
+ MDC_STACK_UNWIND (getxattr, frame, ret, op_errno, xattr, xdata);
return 0;
@@ -1722,6 +1815,7 @@ mdc_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
int ret;
mdc_local_t *local = NULL;
dict_t *xattr = NULL;
+ int op_errno = ENODATA;
local = mdc_local_get (frame);
if (!local)
@@ -1736,10 +1830,12 @@ mdc_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
if (ret != 0)
goto uncached;
- if (!dict_get (xattr, (char *)key))
- goto uncached;
+ if (!xattr || !dict_get (xattr, (char *)key)) {
+ ret = -1;
+ op_errno = ENODATA;
+ }
- MDC_STACK_UNWIND (fgetxattr, frame, 0, 0, xattr, xdata);
+ MDC_STACK_UNWIND (fgetxattr, frame, ret, op_errno, xattr, xdata);
return 0;
@@ -1750,6 +1846,97 @@ uncached:
return 0;
}
+int
+mdc_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ mdc_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret != 0)
+ goto out;
+
+ if (!local)
+ goto out;
+
+ if (local->key)
+ mdc_inode_xatt_unset (this, local->loc.inode, local->key);
+ else
+ mdc_inode_xatt_invalidate (this, local->loc.inode);
+
+ mdc_inode_iatt_invalidate (this, local->loc.inode);
+out:
+ MDC_STACK_UNWIND (removexattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+
+int
+mdc_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
+{
+ mdc_local_t *local = NULL;
+
+ local = mdc_local_get (frame);
+
+ loc_copy (&local->loc, loc);
+
+ local->key = gf_strdup (name);
+
+ STACK_WIND (frame, mdc_removexattr_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->removexattr,
+ loc, name, xdata);
+ return 0;
+}
+
+
+int
+mdc_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ mdc_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret != 0)
+ goto out;
+
+ if (!local)
+ goto out;
+
+ if (local->key)
+ mdc_inode_xatt_unset (this, local->fd->inode, local->key);
+ else
+ mdc_inode_xatt_invalidate (this, local->fd->inode);
+
+ mdc_inode_iatt_invalidate (this, local->fd->inode);
+out:
+ MDC_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, xdata);
+
+ return 0;
+}
+
+
+int
+mdc_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ mdc_local_t *local = NULL;
+
+ local = mdc_local_get (frame);
+
+ local->fd = fd_ref (fd);
+
+ local->key = gf_strdup (name);
+
+ STACK_WIND (frame, mdc_fremovexattr_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fremovexattr,
+ fd, name, xdata);
+ return 0;
+}
+
int
mdc_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -1777,18 +1964,42 @@ int
mdc_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd,
size_t size, off_t offset, dict_t *xdata)
{
+ dict_t *xattr_alloc = NULL;
+
+ if (!xdata)
+ xdata = xattr_alloc = dict_new ();
+ if (xdata)
+ mdc_load_reqs (this, xdata);
+
STACK_WIND (frame, mdc_readdirp_cbk,
FIRST_CHILD (this), FIRST_CHILD (this)->fops->readdirp,
fd, size, offset, xdata);
+ if (xattr_alloc)
+ dict_unref (xattr_alloc);
return 0;
}
+int
+mdc_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, gf_dirent_t *entries, dict_t *xdata)
+{
+ STACK_UNWIND_STRICT (readdir, frame, op_ret, op_errno, entries, xdata);
+ return 0;
+}
int
mdc_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
size_t size, off_t offset, dict_t *xdata)
{
int need_unref = 0;
+ struct mdc_conf *conf = this->private;
+
+ if (!conf->force_readdirp) {
+ STACK_WIND(frame, mdc_readdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdir, fd, size, offset,
+ xdata);
+ return 0;
+ }
if (!xdata) {
xdata = dict_new ();
@@ -1798,9 +2009,9 @@ mdc_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
if (xdata)
mdc_load_reqs (this, xdata);
- STACK_WIND (frame, mdc_readdirp_cbk,
- FIRST_CHILD (this), FIRST_CHILD (this)->fops->readdirp,
- fd, size, offset, xdata);
+ STACK_WIND(frame, mdc_readdirp_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd, size, offset,
+ xdata);
if (need_unref && xdata)
dict_unref (xdata);
@@ -1808,6 +2019,123 @@ mdc_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
return 0;
}
+int
+mdc_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+ mdc_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret != 0)
+ goto out;
+
+ if (!local)
+ goto out;
+
+ mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf);
+
+out:
+ MDC_STACK_UNWIND (fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+
+ return 0;
+}
+
+int mdc_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ mdc_local_t *local;
+
+ local = mdc_local_get(frame);
+ local->fd = fd_ref(fd);
+
+ STACK_WIND(frame, mdc_fallocate_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fallocate, fd, mode, offset, len,
+ xdata);
+
+ return 0;
+}
+
+int
+mdc_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+ mdc_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret != 0)
+ goto out;
+
+ if (!local)
+ goto out;
+
+ mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf);
+
+out:
+ MDC_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+
+ return 0;
+}
+
+int mdc_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ mdc_local_t *local;
+
+ local = mdc_local_get(frame);
+ local->fd = fd_ref(fd);
+
+ STACK_WIND(frame, mdc_discard_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->discard, fd, offset, len,
+ xdata);
+
+ return 0;
+}
+
+int
+mdc_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+ mdc_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (op_ret != 0)
+ goto out;
+
+ if (!local)
+ goto out;
+
+ mdc_inode_iatt_set_validate(this, local->fd->inode, prebuf, postbuf);
+
+out:
+ MDC_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+
+ return 0;
+}
+
+int mdc_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ mdc_local_t *local;
+
+ local = mdc_local_get(frame);
+ local->fd = fd_ref(fd);
+
+ STACK_WIND(frame, mdc_zerofill_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->zerofill, fd, offset, len,
+ xdata);
+
+ return 0;
+}
+
int
mdc_forget (xlator_t *this, inode_t *inode)
@@ -1862,6 +2190,8 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("cache-posix-acl", conf->cache_posix_acl, options, bool, out);
mdc_key_load_set (mdc_keys, "system.posix_acl_", conf->cache_posix_acl);
+ GF_OPTION_RECONF("force-readdirp", conf->force_readdirp, options, bool, out);
+
out:
return 0;
}
@@ -1894,6 +2224,8 @@ init (xlator_t *this)
GF_OPTION_INIT ("cache-posix-acl", conf->cache_posix_acl, bool, out);
mdc_key_load_set (mdc_keys, "system.posix_acl_", conf->cache_posix_acl);
+
+ GF_OPTION_INIT("force-readdirp", conf->force_readdirp, bool, out);
out:
this->private = conf;
@@ -1931,8 +2263,13 @@ struct xlator_fops fops = {
.fsetxattr = mdc_fsetxattr,
.getxattr = mdc_getxattr,
.fgetxattr = mdc_fgetxattr,
+ .removexattr = mdc_removexattr,
+ .fremovexattr= mdc_fremovexattr,
.readdirp = mdc_readdirp,
- .readdir = mdc_readdir
+ .readdir = mdc_readdir,
+ .fallocate = mdc_fallocate,
+ .discard = mdc_discard,
+ .zerofill = mdc_zerofill,
};
@@ -1956,4 +2293,11 @@ struct volume_options options[] = {
.default_value = "1",
.description = "Time period after which cache has to be refreshed",
},
+ { .key = {"force-readdirp"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "true",
+ .description = "Convert all readdir requests to readdirplus to "
+ "collect stat info on each entry.",
+ },
+ { .key = {NULL} },
};
diff --git a/xlators/performance/open-behind/Makefile.am b/xlators/performance/open-behind/Makefile.am
new file mode 100644
index 000000000..af437a64d
--- /dev/null
+++ b/xlators/performance/open-behind/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src
diff --git a/xlators/performance/open-behind/src/Makefile.am b/xlators/performance/open-behind/src/Makefile.am
new file mode 100644
index 000000000..125285707
--- /dev/null
+++ b/xlators/performance/open-behind/src/Makefile.am
@@ -0,0 +1,15 @@
+xlator_LTLIBRARIES = open-behind.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
+
+open_behind_la_LDFLAGS = -module -avoid-version
+
+open_behind_la_SOURCES = open-behind.c
+open_behind_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = open-behind-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/performance/open-behind/src/open-behind-mem-types.h b/xlators/performance/open-behind/src/open-behind-mem-types.h
new file mode 100644
index 000000000..1e94296f4
--- /dev/null
+++ b/xlators/performance/open-behind/src/open-behind-mem-types.h
@@ -0,0 +1,21 @@
+/*
+ Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __OB_MEM_TYPES_H__
+#define __OB_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+enum gf_ob_mem_types_ {
+ gf_ob_mt_fd_t = gf_common_mt_end + 1,
+ gf_ob_mt_conf_t,
+ gf_ob_mt_end
+};
+#endif
diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c
new file mode 100644
index 000000000..742e4df3f
--- /dev/null
+++ b/xlators/performance/open-behind/src/open-behind.c
@@ -0,0 +1,1020 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "open-behind-mem-types.h"
+#include "xlator.h"
+#include "statedump.h"
+#include "call-stub.h"
+#include "defaults.h"
+
+typedef struct ob_conf {
+ gf_boolean_t use_anonymous_fd; /* use anonymous FDs wherever safe
+ e.g - fstat() readv()
+
+ whereas for fops like writev(), lk(),
+ the fd is important for side effects
+ like mandatory locks
+ */
+ gf_boolean_t lazy_open; /* delay backend open as much as possible */
+ gf_boolean_t read_after_open; /* instead of sending readvs on
+ anonymous fds, open the file
+ first and then send readv i.e
+ similar to what writev does
+ */
+} ob_conf_t;
+
+
+typedef struct ob_fd {
+ call_frame_t *open_frame;
+ loc_t loc;
+ dict_t *xdata;
+ int flags;
+ int op_errno;
+ struct list_head list;
+} ob_fd_t;
+
+
+ob_fd_t *
+__ob_fd_ctx_get (xlator_t *this, fd_t *fd)
+{
+ uint64_t value = 0;
+ int ret = -1;
+ ob_fd_t *ob_fd = NULL;
+
+ ret = __fd_ctx_get (fd, this, &value);
+ if (ret)
+ return NULL;
+
+ ob_fd = (void *) ((long) value);
+
+ return ob_fd;
+}
+
+
+ob_fd_t *
+ob_fd_ctx_get (xlator_t *this, fd_t *fd)
+{
+ ob_fd_t *ob_fd = NULL;
+
+ LOCK (&fd->lock);
+ {
+ ob_fd = __ob_fd_ctx_get (this, fd);
+ }
+ UNLOCK (&fd->lock);
+
+ return ob_fd;
+}
+
+
+int
+__ob_fd_ctx_set (xlator_t *this, fd_t *fd, ob_fd_t *ob_fd)
+{
+ uint64_t value = 0;
+ int ret = -1;
+
+ value = (long) ((void *) ob_fd);
+
+ ret = __fd_ctx_set (fd, this, value);
+
+ return ret;
+}
+
+
+int
+ob_fd_ctx_set (xlator_t *this, fd_t *fd, ob_fd_t *ob_fd)
+{
+ int ret = -1;
+
+ LOCK (&fd->lock);
+ {
+ ret = __ob_fd_ctx_set (this, fd, ob_fd);
+ }
+ UNLOCK (&fd->lock);
+
+ return ret;
+}
+
+
+ob_fd_t *
+ob_fd_new (void)
+{
+ ob_fd_t *ob_fd = NULL;
+
+ ob_fd = GF_CALLOC (1, sizeof (*ob_fd), gf_ob_mt_fd_t);
+
+ INIT_LIST_HEAD (&ob_fd->list);
+
+ return ob_fd;
+}
+
+
+void
+ob_fd_free (ob_fd_t *ob_fd)
+{
+ loc_wipe (&ob_fd->loc);
+
+ if (ob_fd->xdata)
+ dict_unref (ob_fd->xdata);
+
+ if (ob_fd->open_frame)
+ STACK_DESTROY (ob_fd->open_frame->root);
+
+ GF_FREE (ob_fd);
+}
+
+
+int
+ob_wake_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, fd_t *fd_ret, dict_t *xdata)
+{
+ fd_t *fd = NULL;
+ struct list_head list;
+ ob_fd_t *ob_fd = NULL;
+ call_stub_t *stub = NULL, *tmp = NULL;
+
+ fd = frame->local;
+ frame->local = NULL;
+
+ INIT_LIST_HEAD (&list);
+
+ LOCK (&fd->lock);
+ {
+ ob_fd = __ob_fd_ctx_get (this, fd);
+
+ list_splice_init (&ob_fd->list, &list);
+
+ if (op_ret < 0) {
+ /* mark fd BAD for ever */
+ ob_fd->op_errno = op_errno;
+ } else {
+ __fd_ctx_del (fd, this, NULL);
+ ob_fd_free (ob_fd);
+ }
+ }
+ UNLOCK (&fd->lock);
+
+ list_for_each_entry_safe (stub, tmp, &list, list) {
+ list_del_init (&stub->list);
+
+ if (op_ret < 0)
+ call_unwind_error (stub, -1, op_errno);
+ else
+ call_resume (stub);
+ }
+
+ fd_unref (fd);
+
+ STACK_DESTROY (frame->root);
+
+ return 0;
+}
+
+
+int
+ob_fd_wake (xlator_t *this, fd_t *fd)
+{
+ call_frame_t *frame = NULL;
+ ob_fd_t *ob_fd = NULL;
+
+ LOCK (&fd->lock);
+ {
+ ob_fd = __ob_fd_ctx_get (this, fd);
+ if (!ob_fd)
+ goto unlock;
+
+ frame = ob_fd->open_frame;
+ ob_fd->open_frame = NULL;
+ }
+unlock:
+ UNLOCK (&fd->lock);
+
+ if (frame) {
+ frame->local = fd_ref (fd);
+
+ STACK_WIND (frame, ob_wake_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->open,
+ &ob_fd->loc, ob_fd->flags, fd, ob_fd->xdata);
+ }
+
+ return 0;
+}
+
+
+int
+open_and_resume (xlator_t *this, fd_t *fd, call_stub_t *stub)
+{
+ ob_fd_t *ob_fd = NULL;
+ int op_errno = 0;
+
+ if (!fd)
+ goto nofd;
+
+ LOCK (&fd->lock);
+ {
+ ob_fd = __ob_fd_ctx_get (this, fd);
+ if (!ob_fd)
+ goto unlock;
+
+ if (ob_fd->op_errno) {
+ op_errno = ob_fd->op_errno;
+ goto unlock;
+ }
+
+ list_add_tail (&stub->list, &ob_fd->list);
+ }
+unlock:
+ UNLOCK (&fd->lock);
+
+nofd:
+ if (op_errno)
+ call_unwind_error (stub, -1, op_errno);
+ else if (ob_fd)
+ ob_fd_wake (this, fd);
+ else
+ call_resume (stub);
+
+ return 0;
+}
+
+
+int
+ob_open_behind (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ fd_t *fd, dict_t *xdata)
+{
+ ob_fd_t *ob_fd = NULL;
+ int ret = -1;
+ ob_conf_t *conf = NULL;
+
+
+ conf = this->private;
+
+ if (flags & O_TRUNC) {
+ STACK_WIND (frame, default_open_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->open,
+ loc, flags, fd, xdata);
+ return 0;
+ }
+
+ ob_fd = ob_fd_new ();
+ if (!ob_fd)
+ goto enomem;
+
+ ob_fd->open_frame = copy_frame (frame);
+ if (!ob_fd->open_frame)
+ goto enomem;
+ ret = loc_copy (&ob_fd->loc, loc);
+ if (ret)
+ goto enomem;
+
+ ob_fd->flags = flags;
+ if (xdata)
+ ob_fd->xdata = dict_ref (xdata);
+
+ ret = ob_fd_ctx_set (this, fd, ob_fd);
+ if (ret)
+ goto enomem;
+
+ fd_ref (fd);
+
+ STACK_UNWIND_STRICT (open, frame, 0, 0, fd, xdata);
+
+ if (!conf->lazy_open)
+ ob_fd_wake (this, fd);
+
+ fd_unref (fd);
+
+ return 0;
+enomem:
+ if (ob_fd) {
+ if (ob_fd->open_frame)
+ STACK_DESTROY (ob_fd->open_frame->root);
+ loc_wipe (&ob_fd->loc);
+ if (ob_fd->xdata)
+ dict_unref (ob_fd->xdata);
+ GF_FREE (ob_fd);
+ }
+
+ return -1;
+}
+
+
+int
+ob_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ fd_t *fd, dict_t *xdata)
+{
+ fd_t *old_fd = NULL;
+ int ret = -1;
+ int op_errno = 0;
+ call_stub_t *stub = NULL;
+
+ old_fd = fd_lookup (fd->inode, 0);
+ if (old_fd) {
+ /* open-behind only when this is the first FD */
+ stub = fop_open_stub (frame, default_open_resume,
+ loc, flags, fd, xdata);
+ if (!stub) {
+ op_errno = ENOMEM;
+ fd_unref (old_fd);
+ goto err;
+ }
+
+ open_and_resume (this, old_fd, stub);
+
+ fd_unref (old_fd);
+
+ return 0;
+ }
+
+ ret = ob_open_behind (frame, this, loc, flags, fd, xdata);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ return 0;
+err:
+ gf_log (this->name, GF_LOG_ERROR, "%s: %s", loc->path,
+ strerror (op_errno));
+
+ STACK_UNWIND_STRICT (open, frame, -1, op_errno, 0, 0);
+
+ return 0;
+}
+
+
+fd_t *
+ob_get_wind_fd (xlator_t *this, fd_t *fd)
+{
+ ob_conf_t *conf = NULL;
+ ob_fd_t *ob_fd = NULL;
+
+ conf = this->private;
+
+ ob_fd = ob_fd_ctx_get (this, fd);
+
+ if (ob_fd && conf->use_anonymous_fd)
+ return fd_anonymous (fd->inode);
+
+ return fd_ref (fd);
+}
+
+
+int
+ob_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+ fd_t *wind_fd = NULL;
+ ob_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ if (!conf->read_after_open)
+ wind_fd = ob_get_wind_fd (this, fd);
+ else
+ wind_fd = fd_ref (fd);
+
+ stub = fop_readv_stub (frame, default_readv_resume, wind_fd,
+ size, offset, flags, xdata);
+ fd_unref (wind_fd);
+
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, wind_fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (readv, frame, -1, ENOMEM, 0, 0, 0, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov,
+ int count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_writev_stub (frame, default_writev_resume, fd, iov, count,
+ offset, flags, iobref, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (writev, frame, -1, ENOMEM, 0, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+ fd_t *wind_fd = NULL;
+
+ wind_fd = ob_get_wind_fd (this, fd);
+
+ stub = fop_fstat_stub (frame, default_fstat_resume, wind_fd, xdata);
+
+ fd_unref (wind_fd);
+
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, wind_fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fstat, frame, -1, ENOMEM, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+ ob_fd_t *ob_fd = NULL;
+ gf_boolean_t unwind = _gf_false;
+
+ LOCK (&fd->lock);
+ {
+ ob_fd = __ob_fd_ctx_get (this, fd);
+ if (ob_fd && ob_fd->open_frame)
+ /* if open() was never wound to backend,
+ no need to wind flush() either.
+ */
+ unwind = _gf_true;
+ }
+ UNLOCK (&fd->lock);
+
+ if (unwind)
+ goto unwind;
+
+ stub = fop_flush_stub (frame, default_flush_resume, fd, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (flush, frame, -1, ENOMEM, 0);
+
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (flush, frame, 0, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int flag,
+ dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_fsync_stub (frame, default_fsync_resume, fd, flag, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd,
+ struct gf_flock *flock, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_lk_stub (frame, default_lk_resume, fd, cmd, flock, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (lk, frame, -1, ENOMEM, 0, 0);
+
+ return 0;
+}
+
+int
+ob_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_ftruncate_stub (frame, default_ftruncate_resume, fd, offset,
+ xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOMEM, 0, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr,
+ int flags, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_fsetxattr_stub (frame, default_fsetxattr_resume, fd, xattr,
+ flags, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fsetxattr, frame, -1, ENOMEM, 0);
+
+ return 0;
+}
+
+
+int
+ob_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+ dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_fgetxattr_stub (frame, default_fgetxattr_resume, fd, name,
+ xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fgetxattr, frame, -1, ENOMEM, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_fremovexattr_stub (frame, default_fremovexattr_resume, fd,
+ name, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fremovexattr, frame, -1, ENOMEM, 0);
+
+ return 0;
+}
+
+
+int
+ob_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ int cmd, struct gf_flock *flock, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_finodelk_stub (frame, default_finodelk_resume, volume, fd,
+ cmd, flock, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (finodelk, frame, -1, ENOMEM, 0);
+
+ return 0;
+}
+
+
+int
+ob_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_fentrylk_stub (frame, default_fentrylk_resume, volume, fd,
+ basename, cmd, type, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fentrylk, frame, -1, ENOMEM, 0);
+
+ return 0;
+}
+
+
+int
+ob_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_fxattrop_stub (frame, default_fxattrop_resume, fd, optype,
+ xattr, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fxattrop, frame, -1, ENOMEM, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iatt *iatt, int valid, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+
+ stub = fop_fsetattr_stub (frame, default_fsetattr_resume, fd,
+ iatt, valid, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume (this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (fsetattr, frame, -1, ENOMEM, 0, 0, 0);
+
+ return 0;
+}
+
+int
+ob_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ call_stub_t *stub;
+
+ stub = fop_fallocate_stub(frame, default_fallocate_resume, fd, mode,
+ offset, len, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume(this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+ob_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ call_stub_t *stub;
+
+ stub = fop_discard_stub(frame, default_discard_resume, fd, offset, len,
+ xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume(this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT(discard, frame, -1, ENOMEM, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+ob_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ call_stub_t *stub;
+
+ stub = fop_zerofill_stub(frame, default_zerofill_resume, fd,
+ offset, len, xdata);
+ if (!stub)
+ goto err;
+
+ open_and_resume(this, fd, stub);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL);
+ return 0;
+}
+
+
+int
+ob_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+ dict_t *xdata)
+{
+ fd_t *fd = NULL;
+ call_stub_t *stub = NULL;
+
+ stub = fop_unlink_stub (frame, default_unlink_resume, loc,
+ xflags, xdata);
+ if (!stub)
+ goto err;
+
+ fd = fd_lookup (loc->inode, 0);
+
+ open_and_resume (this, fd, stub);
+ if (fd)
+ fd_unref (fd);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (unlink, frame, -1, ENOMEM, 0, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_rename (call_frame_t *frame, xlator_t *this, loc_t *src, loc_t *dst,
+ dict_t *xdata)
+{
+ fd_t *fd = NULL;
+ call_stub_t *stub = NULL;
+
+ stub = fop_rename_stub (frame, default_rename_resume, src, dst, xdata);
+ if (!stub)
+ goto err;
+
+ if (dst->inode)
+ fd = fd_lookup (dst->inode, 0);
+
+ open_and_resume (this, fd, stub);
+ if (fd)
+ fd_unref (fd);
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (rename, frame, -1, ENOMEM, 0, 0, 0, 0, 0, 0);
+
+ return 0;
+}
+
+
+int
+ob_release (xlator_t *this, fd_t *fd)
+{
+ ob_fd_t *ob_fd = NULL;
+
+ ob_fd = ob_fd_ctx_get (this, fd);
+
+ ob_fd_free (ob_fd);
+
+ return 0;
+}
+
+
+int
+ob_priv_dump (xlator_t *this)
+{
+ ob_conf_t *conf = NULL;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN];
+
+ conf = this->private;
+
+ if (!conf)
+ return -1;
+
+ gf_proc_dump_build_key (key_prefix, "xlator.performance.open-behind",
+ "priv");
+
+ gf_proc_dump_add_section (key_prefix);
+
+ gf_proc_dump_write ("use_anonymous_fd", "%d", conf->use_anonymous_fd);
+
+ gf_proc_dump_write ("lazy_open", "%d", conf->lazy_open);
+
+ return 0;
+}
+
+
+int
+ob_fdctx_dump (xlator_t *this, fd_t *fd)
+{
+ ob_fd_t *ob_fd = NULL;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, };
+ int ret = 0;
+
+ ret = TRY_LOCK (&fd->lock);
+ if (ret)
+ return 0;
+
+ ob_fd = __ob_fd_ctx_get (this, fd);
+ if (!ob_fd) {
+ UNLOCK (&fd->lock);
+ return 0;
+ }
+
+ gf_proc_dump_build_key (key_prefix, "xlator.performance.open-behind",
+ "file");
+ gf_proc_dump_add_section (key_prefix);
+
+ gf_proc_dump_write ("fd", "%p", fd);
+
+ gf_proc_dump_write ("open_frame", "%p", ob_fd->open_frame);
+
+ gf_proc_dump_write ("open_frame.root.unique", "%p",
+ ob_fd->open_frame->root->unique);
+
+ gf_proc_dump_write ("loc.path", "%s", ob_fd->loc.path);
+
+ gf_proc_dump_write ("loc.ino", "%s", uuid_utoa (ob_fd->loc.gfid));
+
+ gf_proc_dump_write ("flags", "%p", ob_fd->open_frame);
+
+ UNLOCK (&fd->lock);
+
+ return 0;
+}
+
+
+int
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ ret = xlator_mem_acct_init (this, gf_ob_mt_end + 1);
+
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR, "Memory accounting failed");
+
+ return ret;
+}
+
+
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+ ob_conf_t *conf = NULL;
+ int ret = -1;
+
+ conf = this->private;
+
+ GF_OPTION_RECONF ("use-anonymous-fd", conf->use_anonymous_fd, options,
+ bool, out);
+
+ GF_OPTION_RECONF ("lazy-open", conf->lazy_open, options, bool, out);
+ GF_OPTION_RECONF ("read-after-open", conf->read_after_open, options,
+ bool, out);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+
+int
+init (xlator_t *this)
+{
+ ob_conf_t *conf = NULL;
+
+ if (!this->children || this->children->next) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "FATAL: volume (%s) not configured with exactly one "
+ "child", this->name);
+ return -1;
+ }
+
+ if (!this->parents)
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
+
+ conf = GF_CALLOC (1, sizeof (*conf), gf_ob_mt_conf_t);
+ if (!conf)
+ goto err;
+
+ GF_OPTION_INIT ("use-anonymous-fd", conf->use_anonymous_fd, bool, err);
+
+ GF_OPTION_INIT ("lazy-open", conf->lazy_open, bool, err);
+ GF_OPTION_INIT ("read-after-open", conf->read_after_open, bool, err);
+ this->private = conf;
+
+ return 0;
+err:
+ if (conf)
+ GF_FREE (conf);
+
+ return -1;
+}
+
+
+void
+fini (xlator_t *this)
+{
+ ob_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ GF_FREE (conf);
+
+ return;
+}
+
+
+struct xlator_fops fops = {
+ .open = ob_open,
+ .readv = ob_readv,
+ .writev = ob_writev,
+ .flush = ob_flush,
+ .fsync = ob_fsync,
+ .fstat = ob_fstat,
+ .ftruncate = ob_ftruncate,
+ .fsetxattr = ob_fsetxattr,
+ .fgetxattr = ob_fgetxattr,
+ .fremovexattr = ob_fremovexattr,
+ .finodelk = ob_finodelk,
+ .fentrylk = ob_fentrylk,
+ .fxattrop = ob_fxattrop,
+ .fsetattr = ob_fsetattr,
+ .fallocate = ob_fallocate,
+ .discard = ob_discard,
+ .zerofill = ob_zerofill,
+ .unlink = ob_unlink,
+ .rename = ob_rename,
+ .lk = ob_lk,
+};
+
+struct xlator_cbks cbks = {
+ .release = ob_release,
+};
+
+struct xlator_dumpops dumpops = {
+ .priv = ob_priv_dump,
+ .fdctx = ob_fdctx_dump,
+};
+
+
+struct volume_options options[] = {
+ { .key = {"use-anonymous-fd"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "yes",
+ .description = "For read operations, use anonymous FD when "
+ "original FD is open-behind and not yet opened in the backend.",
+ },
+ { .key = {"lazy-open"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "yes",
+ .description = "Perform open in the backend only when a necessary "
+ "FOP arrives (e.g writev on the FD, unlink of the file). When option "
+ "is disabled, perform backend open right after unwinding open().",
+ },
+ { .key = {"read-after-open"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .description = "read is sent only after actual open happens and real "
+ "fd is obtained, instead of doing on anonymous fd (similar to write)",
+ },
+ { .key = {NULL} }
+
+};
diff --git a/xlators/performance/quick-read/src/Makefile.am b/xlators/performance/quick-read/src/Makefile.am
index 790f1e943..4906f408a 100644
--- a/xlators/performance/quick-read/src/Makefile.am
+++ b/xlators/performance/quick-read/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = quick-read.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-quick_read_la_LDFLAGS = -module -avoidversion
+quick_read_la_LDFLAGS = -module -avoid-version
quick_read_la_SOURCES = quick-read.c
quick_read_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/performance/quick-read/src/quick-read-mem-types.h b/xlators/performance/quick-read/src/quick-read-mem-types.h
index 73c87c819..78547f641 100644
--- a/xlators/performance/quick-read/src/quick-read-mem-types.h
+++ b/xlators/performance/quick-read/src/quick-read-mem-types.h
@@ -15,6 +15,7 @@
enum gf_qr_mem_types_ {
gf_qr_mt_qr_inode_t = gf_common_mt_end + 1,
+ gf_qr_mt_content_t,
gf_qr_mt_qr_fd_ctx_t,
gf_qr_mt_iovec,
gf_qr_mt_qr_conf_t,
diff --git a/xlators/performance/quick-read/src/quick-read.c b/xlators/performance/quick-read/src/quick-read.c
index 867900b90..b8b4c5326 100644
--- a/xlators/performance/quick-read/src/quick-read.c
+++ b/xlators/performance/quick-read/src/quick-read.c
@@ -11,182 +11,103 @@
#include "quick-read.h"
#include "statedump.h"
-#define QR_DEFAULT_CACHE_SIZE 134217728
+qr_inode_t *qr_inode_ctx_get (xlator_t *this, inode_t *inode);
+void __qr_inode_prune (qr_inode_table_t *table, qr_inode_t *qr_inode);
-struct volume_options options[];
-void
-_fd_unref (fd_t *fd);
-
-void
-qr_local_free (qr_local_t *local)
+int
+__qr_inode_ctx_set (xlator_t *this, inode_t *inode, qr_inode_t *qr_inode)
{
- if (local == NULL) {
- goto out;
- }
-
- if (local->stub != NULL) {
- call_stub_destroy (local->stub);
- }
+ uint64_t value = 0;
+ int ret = -1;
- GF_FREE (local->path);
+ value = (long) qr_inode;
- mem_put (local);
+ ret = __inode_ctx_set (inode, this, &value);
-out:
- return;
+ return ret;
}
-qr_local_t *
-qr_local_new (xlator_t *this)
-{
- qr_local_t *local = NULL;
-
- local = mem_get0 (this->local_pool);
- if (local == NULL) {
- goto out;
- }
-
- LOCK_INIT (&local->lock);
- INIT_LIST_HEAD (&local->list);
-out:
- return local;
-}
-
-
-int32_t
-qr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset, uint32_t flags, dict_t *xdata);
-
-
-static void
-qr_loc_wipe (loc_t *loc)
+qr_inode_t *
+__qr_inode_ctx_get (xlator_t *this, inode_t *inode)
{
- if (loc == NULL) {
- goto out;
- }
+ qr_inode_t *qr_inode = NULL;
+ uint64_t value = 0;
+ int ret = -1;
- if (loc->path) {
- GF_FREE ((char *)loc->path);
- loc->path = NULL;
- }
+ ret = __inode_ctx_get (inode, this, &value);
+ if (ret)
+ return NULL;
- if (loc->inode) {
- inode_unref (loc->inode);
- loc->inode = NULL;
- }
-
- if (loc->parent) {
- inode_unref (loc->parent);
- loc->parent = NULL;
- }
+ qr_inode = (void *) ((long) value);
-out:
- return;
+ return qr_inode;
}
-static int32_t
-qr_loc_fill (loc_t *loc, inode_t *inode, char *path)
+qr_inode_t *
+qr_inode_ctx_get (xlator_t *this, inode_t *inode)
{
- int32_t ret = -1;
+ qr_inode_t *qr_inode = NULL;
- GF_VALIDATE_OR_GOTO_WITH_ERROR ("quick-read", loc, out, errno, EINVAL);
- GF_VALIDATE_OR_GOTO_WITH_ERROR ("quick-read", inode, out, errno,
- EINVAL);
- GF_VALIDATE_OR_GOTO_WITH_ERROR ("quick-read", path, out, errno, EINVAL);
+ LOCK (&inode->lock);
+ {
+ qr_inode = __qr_inode_ctx_get (this, inode);
+ }
+ UNLOCK (&inode->lock);
- loc->inode = inode_ref (inode);
- uuid_copy (loc->gfid, inode->gfid);
-
- loc->path = gf_strdup (path);
- if (!loc->path)
- goto out;
-
- ret = 0;
-out:
- if (ret == -1) {
- qr_loc_wipe (loc);
- }
-
- return ret;
+ return qr_inode;
}
-void
-qr_resume_pending_ops (qr_fd_ctx_t *qr_fd_ctx, int32_t op_ret, int32_t op_errno)
+qr_inode_t *
+qr_inode_new (xlator_t *this, inode_t *inode)
{
- call_stub_t *stub = NULL, *tmp = NULL;
- struct list_head waiting_ops = {0, };
-
- GF_VALIDATE_OR_GOTO ("quick-read", qr_fd_ctx, out);
-
- INIT_LIST_HEAD (&waiting_ops);
-
- LOCK (&qr_fd_ctx->lock);
- {
- qr_fd_ctx->open_in_transit = 0;
- list_splice_init (&qr_fd_ctx->waiting_ops,
- &waiting_ops);
- }
- UNLOCK (&qr_fd_ctx->lock);
+ qr_inode_t *qr_inode = NULL;
- if (!list_empty (&waiting_ops)) {
- list_for_each_entry_safe (stub, tmp, &waiting_ops, list) {
- list_del_init (&stub->list);
- if (op_ret < 0) {
- qr_local_t *local = NULL;
+ qr_inode = GF_CALLOC (1, sizeof (*qr_inode), gf_qr_mt_qr_inode_t);
+ if (!qr_inode)
+ return NULL;
- local = stub->frame->local;
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- }
+ INIT_LIST_HEAD (&qr_inode->lru);
- call_resume (stub);
- }
- }
+ qr_inode->priority = 0; /* initial priority */
-out:
- return;
+ return qr_inode;
}
-static void
-qr_fd_ctx_free (qr_fd_ctx_t *qr_fd_ctx)
+qr_inode_t *
+qr_inode_ctx_get_or_new (xlator_t *this, inode_t *inode)
{
- GF_VALIDATE_OR_GOTO ("quick-read", qr_fd_ctx, out);
+ qr_inode_t *qr_inode = NULL;
+ int ret = -1;
+ qr_private_t *priv = NULL;
- GF_ASSERT (list_empty (&qr_fd_ctx->waiting_ops));
+ priv = this->private;
- LOCK (&qr_fd_ctx->fd->inode->lock);
- {
- list_del_init (&qr_fd_ctx->inode_list);
- }
- UNLOCK (&qr_fd_ctx->fd->inode->lock);
-
- GF_FREE (qr_fd_ctx->path);
- GF_FREE (qr_fd_ctx);
-
-out:
- return;
-}
-
-
-static inline uint32_t
-is_match (const char *path, const char *pattern)
-{
- int32_t ret = 0;
- uint32_t match = 0;
+ LOCK (&inode->lock);
+ {
+ qr_inode = __qr_inode_ctx_get (this, inode);
+ if (qr_inode)
+ goto unlock;
- GF_VALIDATE_OR_GOTO ("quick-read", path, out);
- GF_VALIDATE_OR_GOTO ("quick-read", pattern, out);
+ qr_inode = qr_inode_new (this, inode);
+ if (!qr_inode)
+ goto unlock;
- ret = fnmatch (pattern, path, FNM_NOESCAPE);
- match = (ret == 0);
+ ret = __qr_inode_ctx_set (this, inode, qr_inode);
+ if (ret) {
+ __qr_inode_prune (&priv->table, qr_inode);
+ GF_FREE (qr_inode);
+ qr_inode = NULL;
+ }
+ }
+unlock:
+ UNLOCK (&inode->lock);
-out:
- return match;
+ return qr_inode;
}
@@ -196,3230 +117,578 @@ qr_get_priority (qr_conf_t *conf, const char *path)
uint32_t priority = 0;
struct qr_priority *curr = NULL;
- GF_VALIDATE_OR_GOTO ("quick-read", conf, out);
- GF_VALIDATE_OR_GOTO ("quick-read", path, out);
-
list_for_each_entry (curr, &conf->priority_list, list) {
- if (is_match (path, curr->pattern))
+ if (fnmatch (curr->pattern, path, FNM_NOESCAPE) == 0)
priority = curr->priority;
}
-out:
return priority;
}
-/* To be called with this-priv->table.lock held */
-qr_inode_t *
-__qr_inode_alloc (xlator_t *this, char *path, inode_t *inode)
+void
+__qr_inode_register (qr_inode_table_t *table, qr_inode_t *qr_inode)
{
- qr_inode_t *qr_inode = NULL;
- qr_private_t *priv = NULL;
- int priority = 0;
-
- GF_VALIDATE_OR_GOTO ("quick-read", this, out);
- GF_VALIDATE_OR_GOTO (this->name, path, out);
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
-
- priv = this->private;
- GF_VALIDATE_OR_GOTO (this->name, priv, out);
-
- qr_inode = GF_CALLOC (1, sizeof (*qr_inode), gf_qr_mt_qr_inode_t);
- if (qr_inode == NULL) {
- goto out;
- }
-
- INIT_LIST_HEAD (&qr_inode->lru);
- INIT_LIST_HEAD (&qr_inode->fd_list);
-
- priority = qr_get_priority (&priv->conf, path);
+ if (!qr_inode->data)
+ return;
- list_add_tail (&qr_inode->lru, &priv->table.lru[priority]);
+ if (list_empty (&qr_inode->lru))
+ /* first time addition of this qr_inode into table */
+ table->cache_used += qr_inode->size;
+ else
+ list_del_init (&qr_inode->lru);
- qr_inode->inode = inode;
- qr_inode->priority = priority;
-out:
- return qr_inode;
+ list_add_tail (&qr_inode->lru, &table->lru[qr_inode->priority]);
}
-/* To be called with qr_inode->table->lock held */
void
-__qr_inode_free (qr_inode_t *qr_inode)
+qr_inode_set_priority (xlator_t *this, inode_t *inode, const char *path)
{
- qr_fd_ctx_t *fdctx = NULL, *tmp_fdctx = NULL;
+ uint32_t priority = 0;
+ qr_inode_table_t *table = NULL;
+ qr_inode_t *qr_inode = NULL;
+ qr_private_t *priv = NULL;
+ qr_conf_t *conf = NULL;
- GF_VALIDATE_OR_GOTO ("quick-read", qr_inode, out);
+ qr_inode = qr_inode_ctx_get (this, inode);
+ if (!qr_inode)
+ return;
- if (qr_inode->xattr) {
- dict_unref (qr_inode->xattr);
- }
+ priv = this->private;
+ table = &priv->table;
+ conf = &priv->conf;
- list_del (&qr_inode->lru);
+ if (path)
+ priority = qr_get_priority (conf, path);
+ else
+ /* retain existing priority, just bump LRU */
+ priority = qr_inode->priority;
- LOCK (&qr_inode->inode->lock);
- {
- list_for_each_entry_safe (fdctx, tmp_fdctx, &qr_inode->fd_list,
- inode_list) {
- list_del_init (&fdctx->inode_list);
- }
- }
- UNLOCK (&qr_inode->inode->lock);
+ LOCK (&table->lock);
+ {
+ qr_inode->priority = priority;
- GF_FREE (qr_inode);
-out:
- return;
+ __qr_inode_register (table, qr_inode);
+ }
+ UNLOCK (&table->lock);
}
+
/* To be called with priv->table.lock held */
void
-__qr_cache_prune (xlator_t *this)
+__qr_inode_prune (qr_inode_table_t *table, qr_inode_t *qr_inode)
{
- qr_private_t *priv = NULL;
- qr_conf_t *conf = NULL;
- qr_inode_table_t *table = NULL;
- qr_inode_t *curr = NULL, *next = NULL;
- int32_t index = 0;
- uint64_t size_to_prune = 0;
- uint64_t size_pruned = 0;
-
- GF_VALIDATE_OR_GOTO ("quick-read", this, out);
- priv = this->private;
- GF_VALIDATE_OR_GOTO (this->name, priv, out);
-
- table = &priv->table;
- conf = &priv->conf;
-
- size_to_prune = table->cache_used - conf->cache_size;
-
- for (index=0; index < conf->max_pri; index++) {
- list_for_each_entry_safe (curr, next, &table->lru[index], lru) {
- size_pruned += curr->stbuf.ia_size;
- inode_ctx_del (curr->inode, this, NULL);
- __qr_inode_free (curr);
- if (size_pruned >= size_to_prune)
- goto done;
- }
- }
-
-done:
- table->cache_used -= size_pruned;
-
-out:
- return;
-}
-
-/* To be called with table->lock held */
-inline char
-__qr_need_cache_prune (qr_conf_t *conf, qr_inode_table_t *table)
-{
- char need_prune = 0;
-
- GF_VALIDATE_OR_GOTO ("quick-read", conf, out);
- GF_VALIDATE_OR_GOTO ("quick-read", table, out);
-
- need_prune = (table->cache_used > conf->cache_size);
-
-out:
- return need_prune;
-}
-
-
-int32_t
-qr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xdata, struct iatt *postparent)
-{
- data_t *content = NULL;
- qr_inode_t *qr_inode = NULL;
- uint64_t value = 0;
- int ret = -1;
- qr_conf_t *conf = NULL;
- qr_inode_table_t *table = NULL;
- qr_private_t *priv = NULL;
- qr_local_t *local = NULL;
-
- GF_ASSERT (frame);
-
- if ((op_ret == -1) || (xdata == NULL)) {
- goto out;
- }
-
- if ((this == NULL) || (this->private == NULL)) {
- gf_log (frame->this->name, GF_LOG_WARNING,
- (this == NULL) ? "xlator object (this) is NULL"
- : "quick-read configuration is not found");
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
-
- priv = this->private;
- conf = &priv->conf;
- table = &priv->table;
+ GF_FREE (qr_inode->data);
+ qr_inode->data = NULL;
- local = frame->local;
+ if (!list_empty (&qr_inode->lru)) {
+ table->cache_used -= qr_inode->size;
+ qr_inode->size = 0;
- if (buf->ia_size > conf->max_file_size) {
- goto out;
- }
-
- if (IA_ISDIR (buf->ia_type)) {
- goto out;
- }
-
- if (inode == NULL) {
- op_ret = -1;
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_WARNING,
- "lookup returned a NULL inode");
- goto out;
- }
-
- content = dict_get (xdata, GF_CONTENT_KEY);
- if (content == NULL) {
- goto out;
- }
-
- LOCK (&table->lock);
- {
- ret = inode_ctx_get (inode, this, &value);
- if (ret == -1) {
- qr_inode = __qr_inode_alloc (this, local->path, inode);
- if (qr_inode == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto unlock;
- }
-
- ret = inode_ctx_put (inode, this,
- (uint64_t)(long)qr_inode);
- if (ret == -1) {
- __qr_inode_free (qr_inode);
- qr_inode = NULL;
- op_ret = -1;
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_WARNING,
- "cannot set quick-read context in "
- "inode (gfid:%s)",
- uuid_utoa (inode->gfid));
- goto unlock;
- }
- } else {
- qr_inode = (qr_inode_t *)(long)value;
- if (qr_inode == NULL) {
- op_ret = -1;
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_WARNING,
- "cannot find quick-read context in "
- "inode (gfid:%s)",
- uuid_utoa (inode->gfid));
- goto unlock;
- }
- }
-
- /*
- * Create our own internal dict and migrate the file content
- * over to it so it isn't floating around in other translator
- * caches.
- */
- if (qr_inode->xattr) {
- dict_unref (qr_inode->xattr);
- qr_inode->xattr = NULL;
- table->cache_used -= qr_inode->stbuf.ia_size;
- }
-
- qr_inode->xattr = dict_new();
- if (!qr_inode->xattr) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto unlock;
- }
-
- if (dict_set(qr_inode->xattr, GF_CONTENT_KEY, content) < 0) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto unlock;
- }
-
- dict_del(xdata, GF_CONTENT_KEY);
-
- qr_inode->stbuf = *buf;
- table->cache_used += buf->ia_size;
-
- gettimeofday (&qr_inode->tv, NULL);
- if (__qr_need_cache_prune (conf, table)) {
- __qr_cache_prune (this);
- }
- }
-unlock:
- UNLOCK (&table->lock);
+ list_del_init (&qr_inode->lru);
+ }
-out:
- /*
- * FIXME: content size in dict can be greater than the size application
- * requested for. Applications need to be careful till this is fixed.
- */
- QR_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, xdata,
- postparent);
-
- return 0;
+ memset (&qr_inode->buf, 0, sizeof (qr_inode->buf));
}
-int32_t
-qr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
- dict_t *xdata)
+void
+qr_inode_prune (xlator_t *this, inode_t *inode)
{
- qr_conf_t *conf = NULL;
- dict_t *new_req_dict = NULL;
- int32_t op_ret = -1, op_errno = EINVAL;
- data_t *content = NULL;
- uint64_t requested_size = 0, size = 0, value = 0;
- char cached = 0;
- qr_inode_t *qr_inode = NULL;
- qr_private_t *priv = NULL;
- qr_inode_table_t *table = NULL;
- qr_local_t *local = NULL;
-
- GF_ASSERT (frame);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, loc, unwind);
-
- priv = this->private;
- GF_VALIDATE_OR_GOTO (frame->this->name, priv, unwind);
-
- conf = &priv->conf;
- if (conf == NULL) {
- op_ret = -1;
- op_errno = EINVAL;
- goto unwind;
- }
-
- table = &priv->table;
-
- local = qr_local_new (this);
- GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, local, unwind, op_errno,
- ENOMEM);
-
- frame->local = local;
-
- local->path = gf_strdup (loc->path);
- GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, local, unwind, op_errno,
- ENOMEM);
- LOCK (&table->lock);
- {
- op_ret = inode_ctx_get (loc->inode, this, &value);
- if (op_ret == 0) {
- qr_inode = (qr_inode_t *)(long)value;
- if (qr_inode != NULL) {
- if (qr_inode->xattr) {
- cached = 1;
- }
- }
- }
- }
- UNLOCK (&table->lock);
-
- if ((xdata == NULL) && (conf->max_file_size > 0)) {
- new_req_dict = xdata = dict_new ();
- if (xdata == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto unwind;
- }
- }
-
- if (!cached) {
- if (xdata) {
- content = dict_get (xdata, GF_CONTENT_KEY);
- if (content) {
- requested_size = data_to_uint64 (content);
- }
- }
-
- if ((conf->max_file_size > 0)
- && (conf->max_file_size != requested_size)) {
- size = (conf->max_file_size > requested_size) ?
- conf->max_file_size : requested_size;
-
- op_ret = dict_set (xdata, GF_CONTENT_KEY,
- data_from_uint64 (size));
- if (op_ret < 0) {
- op_ret = -1;
- op_errno = ENOMEM;
- gf_log (this->name, GF_LOG_WARNING,
- "cannot set key in request dict to "
- "request file "
- "content during lookup cbk");
- goto unwind;
- }
- }
- }
-
- STACK_WIND (frame, qr_lookup_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lookup, loc, xdata);
-
- if (new_req_dict) {
- dict_unref (new_req_dict);
- }
+ qr_private_t *priv = NULL;
+ qr_inode_table_t *table = NULL;
+ qr_inode_t *qr_inode = NULL;
- return 0;
+ qr_inode = qr_inode_ctx_get (this, inode);
+ if (!qr_inode)
+ return;
-unwind:
- QR_STACK_UNWIND (lookup, frame, op_ret, op_errno, NULL, NULL,
- NULL, NULL);
+ priv = this->private;
+ table = &priv->table;
- if (new_req_dict) {
- dict_unref (new_req_dict);
- }
-
- return 0;
+ LOCK (&table->lock);
+ {
+ __qr_inode_prune (table, qr_inode);
+ }
+ UNLOCK (&table->lock);
}
-int32_t
-qr_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, fd_t *fd, dict_t *xdata)
+/* To be called with priv->table.lock held */
+void
+__qr_cache_prune (qr_inode_table_t *table, qr_conf_t *conf)
{
- uint64_t value = 0;
- int32_t ret = -1;
- qr_local_t *local = NULL;
- qr_inode_t *qr_inode = NULL;
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- call_stub_t *stub = NULL, *tmp = NULL;
- char is_open = 0;
- qr_private_t *priv = NULL;
- qr_inode_table_t *table = NULL;
- struct list_head waiting_ops;
-
- GF_ASSERT (frame);
-
- priv = this->private;
- table = &priv->table;
+ qr_inode_t *curr = NULL;
+ qr_inode_t *next = NULL;
+ int index = 0;
+ size_t size_pruned = 0;
- local = frame->local;
- if (local != NULL) {
- is_open = local->is_open;
- }
-
- INIT_LIST_HEAD (&waiting_ops);
-
- ret = fd_ctx_get (fd, this, &value);
- if ((ret == -1) && (op_ret != -1)) {
- op_ret = -1;
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_WARNING,
- "cannot find quick-read context in fd (%p) opened on "
- "inode (gfid: %s)", fd, uuid_utoa (fd->inode->gfid));
- goto out;
- }
-
- if (value) {
- qr_fd_ctx = (qr_fd_ctx_t *) (long)value;
- }
-
- if (qr_fd_ctx) {
- LOCK (&qr_fd_ctx->lock);
- {
- qr_fd_ctx->open_in_transit = 0;
-
- if (op_ret == 0) {
- qr_fd_ctx->opened = 1;
- }
- list_splice_init (&qr_fd_ctx->waiting_ops,
- &waiting_ops);
- }
- UNLOCK (&qr_fd_ctx->lock);
-
- if (local && local->is_open
- && ((local->open_flags & O_TRUNC) == O_TRUNC)) {
- LOCK (&table->lock);
- {
- ret = inode_ctx_del (fd->inode, this, &value);
- if (ret == 0) {
- qr_inode = (qr_inode_t *)(long) value;
-
- if (qr_inode != NULL) {
- __qr_inode_free (qr_inode);
- }
- }
- }
- UNLOCK (&table->lock);
- }
+ for (index = 0; index < conf->max_pri; index++) {
+ list_for_each_entry_safe (curr, next, &table->lru[index], lru) {
- if (!list_empty (&waiting_ops)) {
- list_for_each_entry_safe (stub, tmp, &waiting_ops,
- list) {
- list_del_init (&stub->list);
- if (op_ret < 0) {
- qr_local_t *local = NULL;
+ size_pruned += curr->size;
- local = stub->frame->local;
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- }
+ __qr_inode_prune (table, curr);
- call_resume (stub);
- }
+ if (table->cache_used < conf->cache_size)
+ return;
}
}
-out:
- if (is_open) {
- QR_STACK_UNWIND (open, frame, op_ret, op_errno, fd, xdata);
- } else {
- STACK_DESTROY (frame->root);
- }
- return 0;
+ return;
}
-int32_t
-qr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, dict_t *xdata)
+void
+qr_cache_prune (xlator_t *this)
{
- qr_inode_t *qr_inode = NULL;
- int32_t ret = -1;
- uint64_t filep = 0;
- char content_cached = 0;
- qr_fd_ctx_t *qr_fd_ctx = NULL, *tmp_fd_ctx = NULL;
- int32_t op_ret = -1, op_errno = EINVAL;
- qr_local_t *local = NULL;
- qr_private_t *priv = NULL;
- qr_inode_table_t *table = NULL;
-
- GF_ASSERT (frame);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, this->private, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
+ qr_private_t *priv = NULL;
+ qr_conf_t *conf = NULL;
+ qr_inode_table_t *table = NULL;
priv = this->private;
table = &priv->table;
+ conf = &priv->conf;
- tmp_fd_ctx = qr_fd_ctx = GF_CALLOC (1, sizeof (*qr_fd_ctx),
- gf_qr_mt_qr_fd_ctx_t);
- if (qr_fd_ctx == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto unwind;
- }
-
- LOCK_INIT (&qr_fd_ctx->lock);
- INIT_LIST_HEAD (&qr_fd_ctx->waiting_ops);
- INIT_LIST_HEAD (&qr_fd_ctx->inode_list);
- qr_fd_ctx->fd = fd;
-
- qr_fd_ctx->path = gf_strdup (loc->path);
- if (qr_fd_ctx->path == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto unwind;
- }
-
- qr_fd_ctx->flags = flags;
-
- ret = fd_ctx_set (fd, this, (uint64_t)(long)qr_fd_ctx);
- if (ret == -1) {
- op_ret = -1;
- op_errno = EINVAL;
- gf_log (this->name, GF_LOG_WARNING,
- "cannot set quick-read context in "
- "fd (%p) opened on inode (gfid:%s)", fd,
- uuid_utoa (fd->inode->gfid));
- goto unwind;
- }
-
- tmp_fd_ctx = NULL;
-
- local = qr_local_new (this);
- if (local == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto unwind;
- }
-
- local->is_open = 1;
- local->open_flags = flags;
- frame->local = local;
- LOCK (&table->lock);
- {
- ret = inode_ctx_get (fd->inode, this, &filep);
- if (ret == 0) {
- qr_inode = (qr_inode_t *)(long) filep;
- if (qr_inode) {
- if (qr_inode->xattr) {
- content_cached = 1;
- }
- }
- }
- }
- UNLOCK (&table->lock);
-
- if (content_cached && (flags & O_DIRECTORY)) {
- op_ret = -1;
- op_errno = ENOTDIR;
- gf_log (this->name, GF_LOG_WARNING,
- "open with O_DIRECTORY flag received on non-directory");
- goto unwind;
- }
-
- if (!content_cached || ((flags & O_ACCMODE) == O_WRONLY)
- || ((flags & O_TRUNC) == O_TRUNC)
- || ((flags & O_DIRECT) == O_DIRECT)) {
- LOCK (&qr_fd_ctx->lock);
- {
- /*
- * we really need not set this flag, since open is
- * not yet unwound.
- */
-
- qr_fd_ctx->open_in_transit = 1;
- if ((flags & O_DIRECT) == O_DIRECT) {
- qr_fd_ctx->disabled = 1;
- }
- }
- UNLOCK (&qr_fd_ctx->lock);
- goto wind;
- } else {
- op_ret = 0;
- op_errno = 0;
-
- LOCK (&fd->inode->lock);
- {
- list_add_tail (&qr_fd_ctx->inode_list,
- &qr_inode->fd_list);
- }
- UNLOCK (&fd->inode->lock);
- }
-
-unwind:
- if (tmp_fd_ctx != NULL) {
- qr_fd_ctx_free (tmp_fd_ctx);
- }
-
- QR_STACK_UNWIND (open, frame, op_ret, op_errno, fd, NULL);
- return 0;
-
-wind:
- STACK_WIND (frame, qr_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open, loc, flags, fd,
- xdata);
- return 0;
-}
-
-
-static inline time_t
-qr_time_elapsed (struct timeval *now, struct timeval *then)
-{
- time_t time_elapsed = 0;
-
- GF_VALIDATE_OR_GOTO ("quick-read", now, out);
- GF_VALIDATE_OR_GOTO ("quick-read", then, out);
-
- time_elapsed = now->tv_sec - then->tv_sec;
-
-out:
- return time_elapsed;
+ LOCK (&table->lock);
+ {
+ if (table->cache_used > conf->cache_size)
+ __qr_cache_prune (table, conf);
+ }
+ UNLOCK (&table->lock);
}
-static inline char
-qr_need_validation (qr_conf_t *conf, qr_inode_t *qr_inode)
+void *
+qr_content_extract (dict_t *xdata)
{
- struct timeval now = {0, };
- char need_validation = 0;
+ data_t *data = NULL;
+ void *content = NULL;
- GF_VALIDATE_OR_GOTO ("quick-read", conf, out);
- GF_VALIDATE_OR_GOTO ("quick-read", qr_inode, out);
+ data = dict_get (xdata, GF_CONTENT_KEY);
+ if (!data)
+ return NULL;
- gettimeofday (&now, NULL);
+ content = GF_CALLOC (1, data->len, gf_qr_mt_content_t);
+ if (!content)
+ return NULL;
- if (qr_time_elapsed (&now, &qr_inode->tv) >= conf->cache_timeout)
- need_validation = 1;
+ memcpy (content, data->data, data->len);
-out:
- return need_validation;
+ return content;
}
-static int32_t
-qr_validate_cache_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- dict_t *xdata)
+void
+qr_content_update (xlator_t *this, qr_inode_t *qr_inode, void *data,
+ struct iatt *buf)
{
- qr_inode_t *qr_inode = NULL;
- qr_local_t *local = NULL;
- uint64_t value = 0;
- int32_t ret = 0;
- qr_private_t *priv = NULL;
- qr_inode_table_t *table = NULL;
- call_stub_t *stub = NULL;
-
- GF_ASSERT (frame);
- if (this == NULL) {
- op_ret = -1;
- op_errno = EINVAL;
- gf_log (frame->this->name, GF_LOG_WARNING,
- "xlator object (this) is NULL");
- goto unwind;
- }
-
- local = frame->local;
- if ((local == NULL) || ((local->fd) == NULL)) {
- op_ret = -1;
- op_errno = EINVAL;
- gf_log (frame->this->name, GF_LOG_WARNING,
- (local == NULL) ? "local is NULL"
- : "fd is not stored in local");
- goto unwind;
- }
-
- local->just_validated = 1;
-
- if (op_ret == -1) {
- goto unwind;
- }
+ qr_private_t *priv = NULL;
+ qr_inode_table_t *table = NULL;
priv = this->private;
table = &priv->table;
- LOCK (&table->lock);
- {
- ret = inode_ctx_get (local->fd->inode, this, &value);
- if (ret == 0) {
- qr_inode = (qr_inode_t *)(long) value;
- }
-
- if (qr_inode != NULL) {
- gettimeofday (&qr_inode->tv, NULL);
-
- if ((qr_inode->stbuf.ia_mtime != buf->ia_mtime)
- || (qr_inode->stbuf.ia_mtime_nsec
- != buf->ia_mtime_nsec)) {
- inode_ctx_del (local->fd->inode, this, NULL);
- __qr_inode_free (qr_inode);
- }
- }
- }
- UNLOCK (&table->lock);
-
- stub = local->stub;
- local->stub = NULL;
-
- call_resume (stub);
-
- return 0;
-
-unwind:
- /* this is actually unwind of readv */
- QR_STACK_UNWIND (readv, frame, op_ret, op_errno, NULL, -1, NULL, NULL,
- NULL);
- return 0;
-}
-
-
-int32_t
-qr_validate_cache_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- dict_t *xdata)
-{
- qr_local_t *local = NULL;
- int32_t op_ret = -1, op_errno = -1;
-
- GF_ASSERT (frame);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, out);
-
- local = frame->local;
- if (local == NULL) {
- op_ret = -1;
- op_errno = EINVAL;
- } else {
- op_ret = local->op_ret;
- op_errno = local->op_errno;
- }
-
-out:
- if (op_ret == -1) {
- qr_validate_cache_cbk (frame, NULL, this, op_ret, op_errno,
- NULL, NULL);
- } else {
- STACK_WIND (frame, qr_validate_cache_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fstat, fd, xdata);
- }
-
- return 0;
-}
-
-
-int
-qr_validate_cache (call_frame_t *frame, xlator_t *this, fd_t *fd,
- call_stub_t *stub)
-{
- int ret = -1;
- int flags = 0;
- uint64_t value = 0;
- loc_t loc = {0, };
- char *path = NULL;
- qr_local_t *local = NULL;
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- call_stub_t *validate_stub = NULL;
- char need_open = 0, can_wind = 0, validate_cbk_called = 0;
- call_frame_t *open_frame = NULL;
-
- GF_ASSERT (frame);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, out);
- GF_VALIDATE_OR_GOTO (frame->this->name, fd, out);
- GF_VALIDATE_OR_GOTO (frame->this->name, stub, out);
-
- if (frame->local == NULL) {
- local = qr_local_new (this);
- if (local == NULL) {
- goto out;
- }
- } else {
- local = frame->local;
- }
-
- local->fd = fd;
- local->stub = stub;
- frame->local = local;
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- qr_fd_ctx = (qr_fd_ctx_t *)(long) value;
- }
-
- if (qr_fd_ctx) {
- LOCK (&qr_fd_ctx->lock);
- {
- path = qr_fd_ctx->path;
- flags = qr_fd_ctx->flags;
-
- if (!(qr_fd_ctx->opened
- || qr_fd_ctx->open_in_transit)) {
- need_open = 1;
- qr_fd_ctx->open_in_transit = 1;
- }
-
- if (qr_fd_ctx->opened) {
- can_wind = 1;
- } else {
- validate_stub = fop_fstat_stub (frame,
- qr_validate_cache_helper,
- fd, NULL);
- if (validate_stub == NULL) {
- ret = -1;
- if (need_open) {
- qr_fd_ctx->open_in_transit = 0;
- }
- goto unlock;
- }
-
- list_add_tail (&validate_stub->list,
- &qr_fd_ctx->waiting_ops);
- }
- }
- unlock:
- UNLOCK (&qr_fd_ctx->lock);
-
- if (ret == -1) {
- goto out;
- }
- } else {
- can_wind = 1;
- }
+ LOCK (&table->lock);
+ {
+ __qr_inode_prune (table, qr_inode);
- if (need_open) {
- open_frame = create_frame (this, this->ctx->pool);
- if (open_frame == NULL) {
- qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM);
- validate_cbk_called = 1;
- goto out;
- }
+ qr_inode->data = data;
+ qr_inode->size = buf->ia_size;
- ret = qr_loc_fill (&loc, fd->inode, path);
- if (ret == -1) {
- qr_resume_pending_ops (qr_fd_ctx, -1, errno);
- validate_cbk_called = 1;
- STACK_DESTROY (open_frame->root);
- goto out;
- }
+ qr_inode->ia_mtime = buf->ia_mtime;
+ qr_inode->ia_mtime_nsec = buf->ia_mtime_nsec;
- STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open,
- &loc, flags, fd, NULL);
+ qr_inode->buf = *buf;
- qr_loc_wipe (&loc);
- } else if (can_wind) {
- STACK_WIND (frame, qr_validate_cache_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fstat, fd, NULL);
- }
+ gettimeofday (&qr_inode->last_refresh, NULL);
- ret = 0;
-out:
- if ((ret < 0) && !validate_cbk_called) {
- if (frame->local == NULL) {
- call_stub_destroy (stub);
- }
+ __qr_inode_register (table, qr_inode);
+ }
+ UNLOCK (&table->lock);
- qr_validate_cache_cbk (frame, NULL, this, -1, errno, NULL, NULL);
- }
- return ret;
+ qr_cache_prune (this);
}
-int32_t
-qr_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iovec *vector, int32_t count,
- struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
+gf_boolean_t
+qr_size_fits (qr_conf_t *conf, struct iatt *buf)
{
- GF_ASSERT (frame);
-
- QR_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count,
- stbuf, iobref, xdata);
- return 0;
+ return (buf->ia_size <= conf->max_file_size);
}
-int32_t
-qr_readv_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset, uint32_t flags, dict_t *xdata)
+gf_boolean_t
+qr_mtime_equal (qr_inode_t *qr_inode, struct iatt *buf)
{
- qr_local_t *local = NULL;
- int32_t op_errno = EINVAL, ret = 0;
- uint64_t value = 0;
- qr_fd_ctx_t *fdctx = NULL;
-
- GF_ASSERT (frame);
-
- local = frame->local;
- GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
-
- if (local->op_ret < 0) {
- op_errno = local->op_errno;
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- fdctx = (qr_fd_ctx_t *)(long) value;
- }
-
- gf_log (this->name, GF_LOG_WARNING,
- "open failed on path (%s) (%s), unwinding read call",
- fdctx ? fdctx->path : NULL, strerror (errno));
- goto unwind;
- }
-
- STACK_WIND (frame, qr_readv_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->readv, fd, size, offset, flags,
- xdata);
- return 0;
-
-unwind:
- QR_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL);
- return 0;
+ return (qr_inode->ia_mtime == buf->ia_mtime &&
+ qr_inode->ia_mtime_nsec == buf->ia_mtime_nsec);
}
-int32_t
-qr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset, uint32_t read_flags, dict_t *xdata)
+void
+__qr_content_refresh (xlator_t *this, qr_inode_t *qr_inode, struct iatt *buf)
{
- qr_inode_t *qr_inode = NULL;
- int32_t ret = -1, op_ret = -1, op_errno = -1;
- uint64_t value = 0;
- int count = -1, flags = 0, i = 0;
- char content_cached = 0, need_validation = 0;
- char need_open = 0, can_wind = 0, need_unwind = 0;
- struct iobuf *iobuf = NULL;
- struct iobref *iobref = NULL;
- struct iatt stbuf = {0, };
- data_t *content = NULL;
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- call_stub_t *stub = NULL;
- loc_t loc = {0, };
- qr_conf_t *conf = NULL;
- struct iovec *vector = NULL;
- char *path = NULL;
- off_t start = 0, end = 0;
- size_t len = 0;
- struct iobuf_pool *iobuf_pool = NULL;
- qr_local_t *local = NULL;
- char just_validated = 0;
- qr_private_t *priv = NULL;
- qr_inode_table_t *table = NULL;
- call_frame_t *open_frame = NULL;
-
- op_ret = 0;
+ qr_private_t *priv = NULL;
+ qr_inode_table_t *table = NULL;
+ qr_conf_t *conf = NULL;
priv = this->private;
- conf = &priv->conf;
table = &priv->table;
+ conf = &priv->conf;
- local = frame->local;
-
- if (local != NULL) {
- just_validated = local->just_validated;
- }
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- qr_fd_ctx = (qr_fd_ctx_t *)(long) value;
- if (qr_fd_ctx != NULL) {
- if (qr_fd_ctx->disabled) {
- goto out;
- }
- }
- }
-
- iobuf_pool = this->ctx->iobuf_pool;
-
- LOCK (&table->lock);
- {
- ret = inode_ctx_get (fd->inode, this, &value);
- if (ret)
- goto unlock;
-
- qr_inode = (qr_inode_t *)(long)value;
- if (!qr_inode || !qr_inode->xattr)
- goto unlock;
-
- if (!just_validated
- && qr_need_validation (conf, qr_inode)) {
- need_validation = 1;
- goto unlock;
- }
-
- content = dict_get (qr_inode->xattr, GF_CONTENT_KEY);
-
- stbuf = qr_inode->stbuf;
- content_cached = 1;
- list_move_tail (&qr_inode->lru,
- &table->lru[qr_inode->priority]);
-
- if (offset > content->len) {
- op_ret = 0;
- end = content->len;
- } else {
- if ((offset + size) > content->len) {
- op_ret = content->len - offset;
- end = content->len;
- } else {
- op_ret = size;
- end = offset + size;
- }
- }
-
- count = (op_ret / iobuf_pool->default_page_size);
- if ((op_ret % iobuf_pool->default_page_size) != 0) {
- count++;
- }
-
- if (count == 0) {
- op_ret = 0;
- goto unlock;
- }
-
- vector = GF_CALLOC (count, sizeof (*vector), gf_qr_mt_iovec);
- if (vector == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- goto unlock;
- }
-
- iobref = iobref_new ();
- if (iobref == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- goto unlock;
- }
-
- for (i = 0; i < count; i++) {
- /* TODO: Now that we have support for variable
- io-buf-sizes, i guess we need to get rid of
- default size here */
- iobuf = iobuf_get (iobuf_pool);
- if (iobuf == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- goto unlock;
- }
-
- start = offset + (iobuf_pool->default_page_size * i);
-
- if (start > end) {
- len = 0;
- } else {
- len = (iobuf_pool->default_page_size >
- ((end - start)) ? (end - start) :
- iobuf_pool->default_page_size);
-
- memcpy (iobuf->ptr, content->data + start, len);
- }
-
- iobref_add (iobref, iobuf);
- iobuf_unref (iobuf);
+ if (qr_size_fits (conf, buf) && qr_mtime_equal (qr_inode, buf)) {
+ qr_inode->buf = *buf;
- vector[i].iov_base = iobuf->ptr;
- vector[i].iov_len = len;
- }
- }
-unlock:
- UNLOCK (&table->lock);
-
-out:
- if (content_cached || need_unwind) {
- QR_STACK_UNWIND (readv, frame, op_ret, op_errno, vector,
- count, &stbuf, iobref, NULL);
-
- } else if (need_validation) {
- stub = fop_readv_stub (frame, qr_readv, fd, size, offset,
- read_flags, xdata);
- if (stub == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- qr_validate_cache (frame, this, fd, stub);
- } else {
- if (qr_fd_ctx) {
- LOCK (&qr_fd_ctx->lock);
- {
- path = qr_fd_ctx->path;
- flags = qr_fd_ctx->flags;
-
- if (!(qr_fd_ctx->opened
- || qr_fd_ctx->open_in_transit)) {
- need_open = 1;
- qr_fd_ctx->open_in_transit = 1;
- }
-
- if (qr_fd_ctx->opened) {
- can_wind = 1;
- } else {
- if (frame->local == NULL) {
- frame->local
- = qr_local_new (this);
- if (frame->local == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto fdctx_unlock;
- }
- }
-
- stub = fop_readv_stub (frame,
- qr_readv_helper,
- fd, size,
- offset,
- read_flags, xdata);
- if (stub == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto fdctx_unlock;
- }
-
- list_add_tail (&stub->list,
- &qr_fd_ctx->waiting_ops);
- }
- }
- fdctx_unlock:
- UNLOCK (&qr_fd_ctx->lock);
-
- if (op_ret == -1) {
- need_unwind = 1;
- goto out;
- }
- } else {
- can_wind = 1;
- }
-
- if (need_open) {
- op_ret = qr_loc_fill (&loc, fd->inode, path);
- if (op_ret == -1) {
- qr_resume_pending_ops (qr_fd_ctx, -1, errno);
- goto ret;
- }
-
- open_frame = create_frame (this, this->ctx->pool);
- if (open_frame == NULL) {
- qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM);
- qr_loc_wipe (&loc);
- goto ret;
- }
+ gettimeofday (&qr_inode->last_refresh, NULL);
- STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open,
- &loc, flags, fd, NULL);
+ __qr_inode_register (table, qr_inode);
+ } else {
+ __qr_inode_prune (table, qr_inode);
+ }
- qr_loc_wipe (&loc);
- } else if (can_wind) {
- STACK_WIND (frame, qr_readv_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->readv, fd, size,
- offset, read_flags, xdata);
- }
- }
-
-ret:
- GF_FREE (vector);
-
- if (iobref) {
- iobref_unref (iobref);
- }
-
- return 0;
+ return;
}
-int32_t
-qr_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- GF_ASSERT (frame);
- QR_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf,
- xdata);
- return 0;
-}
-
-
-int32_t
-qr_writev_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count, off_t off,
- uint32_t flags, struct iobref *iobref, dict_t *xdata)
-{
- qr_local_t *local = NULL;
- qr_fd_ctx_t *fdctx = NULL;
- uint64_t value = 0;
- int32_t ret = 0;
- int32_t op_errno = EINVAL;
-
- GF_ASSERT (frame);
-
- local = frame->local;
- GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
-
- if (local->op_ret < 0) {
- op_errno = local->op_errno;
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- fdctx = (qr_fd_ctx_t *)(long) value;
- }
-
- gf_log (this->name, GF_LOG_WARNING,
- "open failed on path (%s) (%s), unwinding write call",
- fdctx ? fdctx->path : NULL, strerror (errno));
- goto unwind;
- }
-
- STACK_WIND (frame, qr_writev_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->writev, fd, vector, count, off,
- flags, iobref, xdata);
- return 0;
-
-unwind:
- QR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
-}
-
-
-int32_t
-qr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
- int32_t count, off_t off, uint32_t wr_flags, struct iobref *iobref,
- dict_t *xdata)
+void
+qr_content_refresh (xlator_t *this, qr_inode_t *qr_inode, struct iatt *buf)
{
- uint64_t value = 0;
- int flags = 0;
- call_stub_t *stub = NULL;
- char *path = NULL;
- loc_t loc = {0, };
- qr_inode_t *qr_inode = NULL;
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- int32_t op_ret = -1, op_errno = -1, ret = -1;
- char can_wind = 0, need_unwind = 0, need_open = 0;
- qr_private_t *priv = NULL;
- qr_inode_table_t *table = NULL;
- call_frame_t *open_frame = NULL;
+ qr_private_t *priv = NULL;
+ qr_inode_table_t *table = NULL;
priv = this->private;
table = &priv->table;
- ret = fd_ctx_get (fd, this, &value);
-
- if (ret == 0) {
- qr_fd_ctx = (qr_fd_ctx_t *)(long) value;
- }
-
- LOCK (&table->lock);
- {
- ret = inode_ctx_get (fd->inode, this, &value);
- if (ret == 0) {
- qr_inode = (qr_inode_t *)(long)value;
- if (qr_inode != NULL) {
- inode_ctx_del (fd->inode, this, NULL);
- __qr_inode_free (qr_inode);
- }
- }
- }
- UNLOCK (&table->lock);
-
- if (qr_fd_ctx) {
- LOCK (&qr_fd_ctx->lock);
- {
- path = qr_fd_ctx->path;
- flags = qr_fd_ctx->flags;
-
- if (!(qr_fd_ctx->opened
- || qr_fd_ctx->open_in_transit)) {
- need_open = 1;
- qr_fd_ctx->open_in_transit = 1;
- }
-
- if (qr_fd_ctx->opened) {
- can_wind = 1;
- } else {
- frame->local = qr_local_new (this);
- if (frame->local == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- stub = fop_writev_stub (frame, qr_writev_helper,
- fd, vector, count, off,
- wr_flags, iobref, xdata);
- if (stub == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- list_add_tail (&stub->list,
- &qr_fd_ctx->waiting_ops);
- }
- }
- unlock:
- UNLOCK (&qr_fd_ctx->lock);
- } else {
- can_wind = 1;
- }
-
- if (need_unwind) {
- QR_STACK_UNWIND (writev, frame, op_ret, op_errno, NULL, NULL, NULL);
- } else if (can_wind) {
- STACK_WIND (frame, qr_writev_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->writev, fd, vector, count,
- off, wr_flags, iobref, xdata);
- } else if (need_open) {
- op_ret = qr_loc_fill (&loc, fd->inode, path);
- if (op_ret == -1) {
- qr_resume_pending_ops (qr_fd_ctx, -1, errno);
- goto ret;
- }
-
- open_frame = create_frame (this, this->ctx->pool);
- if (open_frame == NULL) {
- qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM);
- qr_loc_wipe (&loc);
- goto ret;
- }
-
- STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open, &loc, flags, fd,
- NULL);
-
- qr_loc_wipe (&loc);
- }
-
-ret:
- return 0;
-}
-
-
-int32_t
-qr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iatt *buf, dict_t *xdata)
-{
- QR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata);
- return 0;
-}
-
-
-int32_t
-qr_fstat_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
-{
- qr_local_t *local = NULL;
- qr_fd_ctx_t *fdctx = NULL;
- uint64_t value = 0;
- int32_t ret = 0;
- int32_t op_errno = EINVAL;
-
- GF_ASSERT (frame);
-
- local = frame->local;
- GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
-
- if (local->op_ret < 0) {
- op_errno = local->op_errno;
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- fdctx = (qr_fd_ctx_t *)(long) value;
- }
-
- gf_log (this->name, GF_LOG_WARNING,
- "open failed on path (%s) (%s), unwinding fstat call",
- fdctx ? fdctx->path : NULL, strerror (op_errno));
- goto unwind;
- }
-
- STACK_WIND (frame, qr_fstat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fstat, fd, xdata);
- return 0;
-
-unwind:
- QR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-
-int32_t
-qr_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
-{
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- char need_open = 0, can_wind = 0, need_unwind = 0;
- uint64_t value = 0;
- int32_t ret = -1, op_ret = -1, op_errno = EINVAL;
- call_stub_t *stub = NULL;
- loc_t loc = {0, };
- char *path = NULL;
- int flags = 0;
- call_frame_t *open_frame = NULL;
-
- GF_ASSERT (frame);
- if ((this == NULL) || (fd == NULL)) {
- gf_log (frame->this->name, GF_LOG_WARNING,
- (this == NULL) ? "xlator object (this) is NULL"
- : "fd is NULL");
- need_unwind = 1;
- goto unwind;
- }
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- qr_fd_ctx = (qr_fd_ctx_t *)(long) value;
- }
-
- if (qr_fd_ctx) {
- LOCK (&qr_fd_ctx->lock);
- {
- path = qr_fd_ctx->path;
- flags = qr_fd_ctx->flags;
-
- if (!(qr_fd_ctx->opened
- || qr_fd_ctx->open_in_transit)) {
- need_open = 1;
- qr_fd_ctx->open_in_transit = 1;
- }
-
- if (qr_fd_ctx->opened) {
- can_wind = 1;
- } else {
- frame->local = qr_local_new (this);
- if (frame->local == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- stub = fop_fstat_stub (frame, qr_fstat_helper,
- fd, xdata);
- if (stub == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- list_add_tail (&stub->list,
- &qr_fd_ctx->waiting_ops);
- }
- }
- unlock:
- UNLOCK (&qr_fd_ctx->lock);
- } else {
- can_wind = 1;
- }
-
-unwind:
- if (need_unwind) {
- QR_STACK_UNWIND (fstat, frame, op_ret, op_errno, NULL, NULL);
- } else if (can_wind) {
- STACK_WIND (frame, qr_fstat_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fstat, fd, xdata);
- } else if (need_open) {
- op_ret = qr_loc_fill (&loc, fd->inode, path);
- if (op_ret == -1) {
- qr_resume_pending_ops (qr_fd_ctx, -1, errno);
- goto ret;
- }
-
- open_frame = create_frame (this, this->ctx->pool);
- if (open_frame == NULL) {
- qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM);
- qr_loc_wipe (&loc);
- goto ret;
- }
-
- STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open, &loc, flags, fd,
- NULL);
-
- qr_loc_wipe (&loc);
- }
-
-ret:
- return 0;
-}
-
-
-int32_t
-qr_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop, dict_t *xdata)
-{
- GF_ASSERT (frame);
- QR_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, preop, postop,
- xdata);
- return 0;
-}
-
-
-int32_t
-qr_fsetattr_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iatt *stbuf, int32_t valid, dict_t *xdata)
-{
- qr_local_t *local = NULL;
- qr_fd_ctx_t *fdctx = NULL;
- uint64_t value = 0;
- int32_t ret = 0;
- int32_t op_errno = EINVAL;
-
- GF_ASSERT (frame);
-
- local = frame->local;
- GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
-
- if (local->op_ret < 0) {
- op_errno = local->op_errno;
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- fdctx = (qr_fd_ctx_t *)(long) value;
- }
-
- gf_log (this->name, GF_LOG_WARNING,
- "open failed on path (%s) (%s), unwinding fsetattr "
- "call",
- fdctx ? fdctx->path : NULL, strerror (op_errno));
- goto unwind;
- }
-
- STACK_WIND (frame, qr_fsetattr_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsetattr, fd, stbuf,
- valid, xdata);
- return 0;
-
-unwind:
- QR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ LOCK (&table->lock);
+ {
+ __qr_content_refresh (this, qr_inode, buf);
+ }
+ UNLOCK (&table->lock);
}
-int32_t
-qr_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iatt *stbuf, int32_t valid, dict_t *xdata)
+gf_boolean_t
+__qr_cache_is_fresh (xlator_t *this, qr_inode_t *qr_inode)
{
- uint64_t value = 0;
- int flags = 0;
- call_stub_t *stub = NULL;
- char *path = NULL;
- loc_t loc = {0, };
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- int32_t ret = -1, op_ret = -1, op_errno = EINVAL;
- char need_open = 0, can_wind = 0, need_unwind = 0;
- call_frame_t *open_frame = NULL;
-
- GF_ASSERT (frame);
- if ((this == NULL) || (fd == NULL)) {
- gf_log (frame->this->name, GF_LOG_WARNING,
- (this == NULL) ? "xlator object (this) is NULL" :
- "fd is NULL");
- need_unwind = 1;
- goto out;
- }
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- qr_fd_ctx = (qr_fd_ctx_t *)(long) value;
- }
-
- if (qr_fd_ctx) {
- LOCK (&qr_fd_ctx->lock);
- {
- path = qr_fd_ctx->path;
- flags = qr_fd_ctx->flags;
- if (!(qr_fd_ctx->opened
- || qr_fd_ctx->open_in_transit)) {
- need_open = 1;
- qr_fd_ctx->open_in_transit = 1;
- }
-
- if (qr_fd_ctx->opened) {
- can_wind = 1;
- } else {
- frame->local = qr_local_new (this);
- if (frame->local == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- stub = fop_fsetattr_stub (frame,
- qr_fsetattr_helper,
- fd, stbuf, valid, xdata);
- if (stub == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- list_add_tail (&stub->list,
- &qr_fd_ctx->waiting_ops);
- }
- }
- unlock:
- UNLOCK (&qr_fd_ctx->lock);
- } else {
- can_wind = 1;
- }
-
-out:
- if (need_unwind) {
- QR_STACK_UNWIND (fsetattr, frame, op_ret, op_errno, NULL, NULL,
- NULL);
- } else if (can_wind) {
- STACK_WIND (frame, qr_fsetattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsetattr, fd, stbuf,
- valid, xdata);
- } else if (need_open) {
- op_ret = qr_loc_fill (&loc, fd->inode, path);
- if (op_ret == -1) {
- qr_resume_pending_ops (qr_fd_ctx, -1, errno);
- goto ret;
- }
-
- open_frame = create_frame (this, this->ctx->pool);
- if (open_frame == NULL) {
- qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM);
- qr_loc_wipe (&loc);
- goto ret;
- }
-
- STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open, &loc, flags, fd,
- NULL);
-
- qr_loc_wipe (&loc);
- }
-
-ret:
- return 0;
-}
-
-
-int32_t
-qr_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- GF_ASSERT (frame);
- QR_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int32_t
-qr_fsetxattr_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- dict_t *dict, int32_t flags, dict_t *xdata)
-{
- qr_local_t *local = NULL;
- qr_fd_ctx_t *fdctx = NULL;
- uint64_t value = 0;
- int32_t ret = 0;
- int32_t op_errno = EINVAL;
-
- GF_ASSERT (frame);
-
- local = frame->local;
- GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
-
- if (local->op_ret < 0) {
- op_errno = local->op_errno;
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- fdctx = (qr_fd_ctx_t *)(long) value;
- }
-
- gf_log (this->name, GF_LOG_WARNING,
- "open failed on path (%s) (%s), unwinding fsetxattr "
- "call",
- fdctx ? fdctx->path : NULL, strerror (op_errno));
- goto unwind;
- }
-
- STACK_WIND (frame, qr_fsetxattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsetxattr, fd, dict, flags,
- xdata);
- return 0;
-
-unwind:
- QR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
- return 0;
-}
-
-
-int32_t
-qr_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
- int32_t flags, dict_t *xdata)
-{
- uint64_t value = 0;
- call_stub_t *stub = NULL;
- char *path = NULL;
- loc_t loc = {0, };
- int open_flags = 0;
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- int32_t ret = -1, op_ret = -1, op_errno = EINVAL;
- char need_open = 0, can_wind = 0, need_unwind = 0;
- call_frame_t *open_frame = NULL;
-
- GF_ASSERT (frame);
- if ((this == NULL) || (fd == NULL)) {
- gf_log (frame->this->name, GF_LOG_WARNING,
- (this == NULL) ? "xlator object (this) "
- "is NULL" : "fd is NULL");
- need_unwind = 1;
- goto out;
- }
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- qr_fd_ctx = (qr_fd_ctx_t *)(long) value;
- }
-
- if (qr_fd_ctx) {
- LOCK (&qr_fd_ctx->lock);
- {
- path = qr_fd_ctx->path;
- open_flags = qr_fd_ctx->flags;
-
- if (!(qr_fd_ctx->opened
- || qr_fd_ctx->open_in_transit)) {
- need_open = 1;
- qr_fd_ctx->open_in_transit = 1;
- }
-
- if (qr_fd_ctx->opened) {
- can_wind = 1;
- } else {
- frame->local = qr_local_new (this);
- if (frame->local == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- stub = fop_fsetxattr_stub (frame,
- qr_fsetxattr_helper,
- fd, dict, flags, xdata);
- if (stub == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- list_add_tail (&stub->list,
- &qr_fd_ctx->waiting_ops);
- }
- }
- unlock:
- UNLOCK (&qr_fd_ctx->lock);
- } else {
- can_wind = 1;
- }
-
-out:
- if (need_unwind) {
- QR_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL);
- } else if (can_wind) {
- STACK_WIND (frame, qr_fsetxattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsetxattr, fd, dict,
- flags, xdata);
- } else if (need_open) {
- op_ret = qr_loc_fill (&loc, fd->inode, path);
- if (op_ret == -1) {
- qr_resume_pending_ops (qr_fd_ctx, -1, errno);
- goto ret;
- }
-
- open_frame = create_frame (this, this->ctx->pool);
- if (open_frame == NULL) {
- qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM);
- qr_loc_wipe (&loc);
- goto ret;
- }
+ qr_conf_t *conf = NULL;
+ qr_private_t *priv = NULL;
+ struct timeval now;
+ struct timeval diff;
- STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open, &loc, open_flags,
- fd, NULL);
+ priv = this->private;
+ conf = &priv->conf;
- qr_loc_wipe (&loc);
- }
+ gettimeofday (&now, NULL);
-ret:
- return 0;
-}
+ timersub (&now, &qr_inode->last_refresh, &diff);
+ if (diff.tv_sec >= conf->cache_timeout)
+ return _gf_false;
-int32_t
-qr_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
-{
- GF_ASSERT (frame);
- QR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata);
- return 0;
+ return _gf_true;
}
-int32_t
-qr_fgetxattr_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- const char *name, dict_t *xdata)
+int
+qr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode_ret,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
{
- qr_local_t *local = NULL;
- qr_fd_ctx_t *fdctx = NULL;
- uint64_t value = 0;
- int32_t ret = 0;
- int32_t op_errno = EINVAL;
-
- GF_ASSERT (frame);
-
- local = frame->local;
- GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
-
- if (local->op_ret < 0) {
- op_errno = local->op_errno;
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- fdctx = (qr_fd_ctx_t *)(long) value;
- }
-
- gf_log (this->name, GF_LOG_WARNING,
- "open failed on path (%s) (%s), unwinding fgetxattr "
- "call",
- fdctx ? fdctx->path : NULL, strerror (op_errno));
- goto unwind;
- }
-
- STACK_WIND (frame, qr_fgetxattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fgetxattr, fd, name, xdata);
- return 0;
-
-unwind:
- QR_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
+ void *content = NULL;
+ qr_inode_t *qr_inode = NULL;
+ inode_t *inode = NULL;
+ inode = frame->local;
+ frame->local = NULL;
-int32_t
-qr_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
- dict_t *xdata)
-{
- int flags = 0;
- uint64_t value = 0;
- call_stub_t *stub = NULL;
- char *path = NULL;
- loc_t loc = {0, };
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- int32_t ret = -1, op_ret = -1, op_errno = EINVAL;
- char need_open = 0, can_wind = 0, need_unwind = 0;
- call_frame_t *open_frame = NULL;
-
- /*
- * FIXME: Can quick-read use the extended attributes stored in the
- * cache? this needs to be discussed.
- */
-
- GF_ASSERT (frame);
- if ((this == NULL) || (fd == NULL)) {
- gf_log (frame->this->name, GF_LOG_WARNING,
- (this == NULL) ? "xlator object (this) is NULL" :
- "fd is NULL");
- need_unwind = 1;
+ if (op_ret == -1) {
+ qr_inode_prune (this, inode);
goto out;
- }
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- qr_fd_ctx = (qr_fd_ctx_t *)(long) value;
- }
-
- if (qr_fd_ctx) {
- LOCK (&qr_fd_ctx->lock);
- {
- path = qr_fd_ctx->path;
- flags = qr_fd_ctx->flags;
-
- if (!(qr_fd_ctx->opened
- || qr_fd_ctx->open_in_transit)) {
- need_open = 1;
- qr_fd_ctx->open_in_transit = 1;
- }
-
- if (qr_fd_ctx->opened) {
- can_wind = 1;
- } else {
- frame->local = qr_local_new (this);
- if (frame->local == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- stub = fop_fgetxattr_stub (frame,
- qr_fgetxattr_helper,
- fd, name, xdata);
- if (stub == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- list_add_tail (&stub->list,
- &qr_fd_ctx->waiting_ops);
- }
- }
- unlock:
- UNLOCK (&qr_fd_ctx->lock);
- } else {
- can_wind = 1;
- }
-
+ }
+
+ if (dict_get (xdata, "sh-failed")) {
+ qr_inode_prune (this, inode);
+ goto out;
+ }
+
+ content = qr_content_extract (xdata);
+
+ if (content) {
+ /* new content came along, always replace old content */
+ qr_inode = qr_inode_ctx_get_or_new (this, inode);
+ if (!qr_inode) {
+ /* no harm done */
+ GF_FREE (content);
+ goto out;
+ }
+ qr_content_update (this, qr_inode, content, buf);
+ } else {
+ /* purge old content if necessary */
+ qr_inode = qr_inode_ctx_get (this, inode);
+ if (!qr_inode)
+ /* usual path for large files */
+ goto out;
+
+ qr_content_refresh (this, qr_inode, buf);
+ }
out:
- if (need_unwind) {
- QR_STACK_UNWIND (open, frame, op_ret, op_errno, NULL, NULL);
- } else if (can_wind) {
- STACK_WIND (frame, qr_fgetxattr_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fgetxattr, fd, name, xdata);
- } else if (need_open) {
- op_ret = qr_loc_fill (&loc, fd->inode, path);
- if (op_ret == -1) {
- qr_resume_pending_ops (qr_fd_ctx, -1, errno);
- goto ret;
- }
-
- open_frame = create_frame (this, this->ctx->pool);
- if (open_frame == NULL) {
- qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM);
- qr_loc_wipe (&loc);
- goto ret;
- }
-
- STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open, &loc, flags, fd,
- NULL);
-
- qr_loc_wipe (&loc);
- }
-
-ret:
- return 0;
-}
-
-
-int32_t
-qr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xdata)
-{
- GF_ASSERT (frame);
- QR_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata);
- return 0;
-}
-
-
-int32_t
-qr_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
-{
- qr_local_t *local = NULL;
- qr_fd_ctx_t *fdctx = NULL;
- uint64_t value = 0;
- int32_t ret = 0;
- int32_t op_errno = EINVAL;
-
- GF_ASSERT (frame);
-
- local = frame->local;
- GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
-
- if (local->op_ret < 0) {
- op_errno = local->op_errno;
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- fdctx = (qr_fd_ctx_t *)(long) value;
- }
-
- gf_log (this->name, GF_LOG_WARNING,
- "open failed on path (%s) (%s), unwinding flush call",
- fdctx ? fdctx->path : NULL, strerror (op_errno));
- goto unwind;
- }
+ if (inode)
+ inode_unref (inode);
- STACK_WIND (frame, qr_flush_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->flush, fd, xdata);
- return 0;
-
-unwind:
- QR_STACK_UNWIND (flush, frame, -1, op_errno, NULL);
+ STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode_ret,
+ buf, xdata, postparent);
return 0;
}
-int32_t
-qr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+int
+qr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- uint64_t value = 0;
- call_stub_t *stub = NULL;
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- int32_t ret = -1, op_ret = -1, op_errno = EINVAL;
- char can_wind = 0, need_unwind = 0;
-
- GF_ASSERT (frame);
- if ((this == NULL) || (fd == NULL)) {
- gf_log (frame->this->name, GF_LOG_WARNING,
- (this == NULL) ? "xlator object (this) is NULL"
- : "fd is NULL");
- need_unwind = 1;
- goto out;
- }
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- qr_fd_ctx = (qr_fd_ctx_t *)(long)value;
- }
+ qr_private_t *priv = NULL;
+ qr_conf_t *conf = NULL;
+ qr_inode_t *qr_inode = NULL;
+ int ret = -1;
+ dict_t *new_xdata = NULL;
- if (qr_fd_ctx) {
- LOCK (&qr_fd_ctx->lock);
- {
- if (qr_fd_ctx->opened) {
- can_wind = 1;
- } else if (qr_fd_ctx->open_in_transit) {
- frame->local = qr_local_new (this);
- if (frame->local == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- stub = fop_flush_stub (frame, qr_flush_helper,
- fd, xdata);
- if (stub == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- list_add_tail (&stub->list,
- &qr_fd_ctx->waiting_ops);
- } else {
- op_ret = 0;
- need_unwind = 1;
- }
- }
- unlock:
- UNLOCK (&qr_fd_ctx->lock);
- } else {
- can_wind = 1;
- }
+ priv = this->private;
+ conf = &priv->conf;
-out:
- if (need_unwind) {
- QR_STACK_UNWIND (flush, frame, op_ret, op_errno, NULL);
- } else if (can_wind) {
- STACK_WIND (frame, qr_flush_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->flush, fd, xdata);
- }
+ qr_inode = qr_inode_ctx_get (this, loc->inode);
+ if (qr_inode && qr_inode->data)
+ /* cached. only validate in qr_lookup_cbk */
+ goto wind;
+
+ if (!xdata)
+ xdata = new_xdata = dict_new ();
+
+ if (!xdata)
+ goto wind;
+
+ ret = 0;
+ if (conf->max_file_size)
+ ret = dict_set (xdata, GF_CONTENT_KEY,
+ data_from_uint64 (conf->max_file_size));
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING,
+ "cannot set key in request dict (%s)",
+ loc->path);
+wind:
+ frame->local = inode_ref (loc->inode);
- return 0;
-}
+ STACK_WIND (frame, qr_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
+ if (new_xdata)
+ dict_unref (new_xdata);
-int32_t
-qr_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- GF_ASSERT (frame);
- QR_STACK_UNWIND (fentrylk, frame, op_ret, op_errno, xdata);
return 0;
}
-int32_t
-qr_fentrylk_helper (call_frame_t *frame, xlator_t *this, const char *volume,
- fd_t *fd, const char *basename, entrylk_cmd cmd,
- entrylk_type type, dict_t *xdata)
+int
+qr_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *entries, dict_t *xdata)
{
- qr_local_t *local = NULL;
- qr_fd_ctx_t *fdctx = NULL;
- uint64_t value = 0;
- int32_t ret = 0;
- int32_t op_errno = EINVAL;
+ gf_dirent_t *entry = NULL;
+ qr_inode_t *qr_inode = NULL;
- GF_ASSERT (frame);
+ if (op_ret <= 0)
+ goto unwind;
- local = frame->local;
- GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
+ list_for_each_entry (entry, &entries->list, list) {
+ if (!entry->inode)
+ continue;
- if (local->op_ret < 0) {
- op_errno = local->op_errno;
+ qr_inode = qr_inode_ctx_get (this, entry->inode);
+ if (!qr_inode)
+ /* no harm */
+ continue;
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- fdctx = (qr_fd_ctx_t *)(long) value;
- }
-
- gf_log (this->name, GF_LOG_WARNING,
- "open failed on path (%s) (%s), unwinding fentrylk "
- "call",
- fdctx ? fdctx->path : NULL, strerror (op_errno));
- goto unwind;
+ qr_content_refresh (this, qr_inode, &entry->d_stat);
}
- STACK_WIND(frame, qr_fentrylk_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fentrylk, volume, fd, basename,
- cmd, type, xdata);
- return 0;
-
unwind:
- QR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL);
- return 0;
-}
-
-
-int32_t
-qr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
- const char *basename, entrylk_cmd cmd, entrylk_type type,
- dict_t *xdata)
-{
- int flags = 0;
- uint64_t value = 0;
- call_stub_t *stub = NULL;
- char *path = NULL;
- loc_t loc = {0, };
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- int32_t ret = -1, op_ret = -1, op_errno = EINVAL;
- char need_open = 0, can_wind = 0, need_unwind = 0;
- call_frame_t *open_frame = NULL;
-
- if ((this == NULL) || (fd == NULL)) {
- gf_log (frame->this->name, GF_LOG_WARNING,
- (this == NULL) ? "xlator object (this) is NULL"
- : "fd is NULL");
- need_unwind = 1;
- goto out;
- }
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- qr_fd_ctx = (qr_fd_ctx_t *)(long)value;
- }
-
- if (qr_fd_ctx) {
- LOCK (&qr_fd_ctx->lock);
- {
- path = qr_fd_ctx->path;
- flags = qr_fd_ctx->flags;
-
- if (!(qr_fd_ctx->opened
- || qr_fd_ctx->open_in_transit)) {
- need_open = 1;
- qr_fd_ctx->open_in_transit = 1;
- }
-
- if (qr_fd_ctx->opened) {
- can_wind = 1;
- } else {
- frame->local = qr_local_new (this);
- if (frame->local == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- stub = fop_fentrylk_stub (frame,
- qr_fentrylk_helper,
- volume, fd, basename,
- cmd, type, xdata);
- if (stub == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- list_add_tail (&stub->list,
- &qr_fd_ctx->waiting_ops);
- }
- }
- unlock:
- UNLOCK (&qr_fd_ctx->lock);
- } else {
- can_wind = 1;
- }
-
-out:
- if (need_unwind) {
- QR_STACK_UNWIND (fentrylk, frame, op_ret, op_errno, NULL);
- } else if (can_wind) {
- STACK_WIND (frame, qr_fentrylk_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fentrylk, volume, fd,
- basename, cmd, type, xdata);
- } else if (need_open) {
- op_ret = qr_loc_fill (&loc, fd->inode, path);
- if (op_ret == -1) {
- qr_resume_pending_ops (qr_fd_ctx, -1, errno);
- goto ret;
- }
-
- open_frame = create_frame (this, this->ctx->pool);
- if (open_frame == NULL) {
- qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM);
- qr_loc_wipe (&loc);
- goto ret;
- }
-
- STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open, &loc, flags, fd,
- NULL);
-
- qr_loc_wipe (&loc);
- }
-
-ret:
- return 0;
+ STACK_UNWIND_STRICT (readdirp, frame, op_ret, op_errno, entries, xdata);
+ return 0;
}
-int32_t
-qr_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+int
+qr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ size_t size, off_t offset, dict_t *xdata)
{
- GF_ASSERT (frame);
- QR_STACK_UNWIND (finodelk, frame, op_ret, op_errno, xdata);
- return 0;
+ STACK_WIND (frame, qr_readdirp_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->readdirp,
+ fd, size, offset, xdata);
+ return 0;
}
-int32_t
-qr_finodelk_helper (call_frame_t *frame, xlator_t *this, const char *volume,
- fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
-{
- qr_local_t *local = NULL;
- qr_fd_ctx_t *fdctx = NULL;
- uint64_t value = 0;
- int32_t ret = 0;
- int32_t op_errno = EINVAL;
-
- GF_ASSERT (frame);
-
- local = frame->local;
- GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
-
- if (local->op_ret < 0) {
- op_errno = local->op_errno;
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- fdctx = (qr_fd_ctx_t *)(long) value;
- }
-
- gf_log (this->name, GF_LOG_WARNING,
- "open failed on path (%s) (%s), unwinding finodelk "
- "call",
- fdctx ? fdctx->path : NULL, strerror (op_errno));
- goto unwind;
- }
-
- STACK_WIND (frame, qr_finodelk_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->finodelk, volume, fd, cmd, lock,
- xdata);
- return 0;
-
-unwind:
- QR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL);
- return 0;
-}
-
-
-int32_t
-qr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
- int32_t cmd, struct gf_flock *lock, dict_t *xdata)
-{
- int flags = 0;
- uint64_t value = 0;
- call_stub_t *stub = NULL;
- char *path = NULL;
- loc_t loc = {0, };
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- int32_t ret = -1, op_ret = -1, op_errno = EINVAL;
- char need_open = 0, can_wind = 0, need_unwind = 0;
- call_frame_t *open_frame = NULL;
-
- if ((this == NULL) || (fd == NULL)) {
- gf_log (frame->this->name, GF_LOG_WARNING,
- (this == NULL) ? "xlator object (this) is NULL"
- : "fd is NULL");
- need_unwind = 1;
- goto out;
- }
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- qr_fd_ctx = (qr_fd_ctx_t *)(long)value;
- }
-
- if (qr_fd_ctx) {
- LOCK (&qr_fd_ctx->lock);
- {
- path = qr_fd_ctx->path;
- flags = qr_fd_ctx->flags;
-
- if (!(qr_fd_ctx->opened
- || qr_fd_ctx->open_in_transit)) {
- need_open = 1;
- qr_fd_ctx->open_in_transit = 1;
- }
-
- if (qr_fd_ctx->opened) {
- can_wind = 1;
- } else {
- frame->local = qr_local_new (this);
- if (frame->local == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- stub = fop_finodelk_stub (frame,
- qr_finodelk_helper,
- volume, fd, cmd,
- lock, xdata);
- if (stub == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- list_add_tail (&stub->list,
- &qr_fd_ctx->waiting_ops);
- }
- }
- unlock:
- UNLOCK (&qr_fd_ctx->lock);
- } else {
- can_wind = 1;
- }
-
-out:
- if (need_unwind) {
- QR_STACK_UNWIND (finodelk, frame, op_ret, op_errno, NULL);
- } else if (can_wind) {
- STACK_WIND (frame, qr_finodelk_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->finodelk, volume, fd,
- cmd, lock, xdata);
- } else if (need_open) {
- op_ret = qr_loc_fill (&loc, fd->inode, path);
- if (op_ret == -1) {
- qr_resume_pending_ops (qr_fd_ctx, -1, errno);
- goto ret;
- }
-
- open_frame = create_frame (this, this->ctx->pool);
- if (open_frame == NULL) {
- qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM);
- qr_loc_wipe (&loc);
- goto ret;
- }
+int
+qr_readv_cached (call_frame_t *frame, qr_inode_t *qr_inode, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ xlator_t *this = NULL;
+ qr_private_t *priv = NULL;
+ qr_inode_table_t *table = NULL;
+ int op_ret = -1;
+ struct iobuf *iobuf = NULL;
+ struct iobref *iobref = NULL;
+ struct iovec iov = {0, };
+ struct iatt buf = {0, };
+
+ this = frame->this;
+ priv = this->private;
+ table = &priv->table;
+
+ LOCK (&table->lock);
+ {
+ op_ret = -1;
+
+ if (!qr_inode->data)
+ goto unlock;
- STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open, &loc, flags, fd,
- NULL);
+ if (offset >= qr_inode->size)
+ goto unlock;
- qr_loc_wipe (&loc);
- }
+ if (!__qr_cache_is_fresh (this, qr_inode))
+ goto unlock;
-ret:
- return 0;
-}
+ op_ret = min (size, (qr_inode->size - offset));
+ iobuf = iobuf_get2 (this->ctx->iobuf_pool, op_ret);
+ if (!iobuf) {
+ op_ret = -1;
+ goto unlock;
+ }
-int32_t
-qr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
- dict_t *xdata)
-{
- GF_ASSERT (frame);
- QR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
- return 0;
-}
+ iobref = iobref_new ();
+ if (!iobref) {
+ op_ret = -1;
+ goto unlock;
+ }
+ iobref_add (iobref, iobuf);
-int32_t
-qr_fsync_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
- dict_t *xdata)
-{
- qr_local_t *local = NULL;
- qr_fd_ctx_t *fdctx = NULL;
- uint64_t value = 0;
- int32_t ret = 0;
- int32_t op_errno = EINVAL;
+ memcpy (iobuf->ptr, qr_inode->data + offset, op_ret);
- GF_ASSERT (frame);
+ buf = qr_inode->buf;
- local = frame->local;
- GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
+ /* bump LRU */
+ __qr_inode_register (table, qr_inode);
+ }
+unlock:
+ UNLOCK (&table->lock);
- if (local->op_ret < 0) {
- op_errno = local->op_errno;
+ if (op_ret > 0) {
+ iov.iov_base = iobuf->ptr;
+ iov.iov_len = op_ret;
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- fdctx = (qr_fd_ctx_t *)(long) value;
- }
+ STACK_UNWIND_STRICT (readv, frame, op_ret, 0, &iov, 1,
+ &buf, iobref, xdata);
+ }
- gf_log (this->name, GF_LOG_WARNING,
- "open failed on path (%s) (%s), unwinding fsync call",
- fdctx ? fdctx->path : NULL, strerror (op_errno));
- goto unwind;
- }
+ if (iobuf)
+ iobuf_unref (iobuf);
- STACK_WIND (frame, qr_fsync_cbk, FIRST_CHILD (this),
- FIRST_CHILD(this)->fops->fsync, fd, flags, xdata);
- return 0;
+ if (iobref)
+ iobref_unref (iobref);
-unwind:
- QR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return op_ret;
}
-int32_t
-qr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
- dict_t *xdata)
+int
+qr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
{
- uint64_t value = 0;
- call_stub_t *stub = NULL;
- char *path = NULL;
- loc_t loc = {0, };
- int open_flags = 0;
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- int32_t ret = -1, op_ret = -1, op_errno = EINVAL;
- char need_open = 0, can_wind = 0, need_unwind = 0;
- call_frame_t *open_frame = NULL;
-
- if ((this == NULL) || (fd == NULL)) {
- gf_log (frame->this->name, GF_LOG_WARNING,
- (this == NULL) ? "xlator object (this) is NULL"
- : "fd is NULL");
- need_unwind = 1;
- goto out;
- }
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- qr_fd_ctx = (qr_fd_ctx_t *)(long)value;
- }
-
- if (qr_fd_ctx) {
- LOCK (&qr_fd_ctx->lock);
- {
- path = qr_fd_ctx->path;
- open_flags = qr_fd_ctx->flags;
-
- if (!(qr_fd_ctx->opened
- || qr_fd_ctx->open_in_transit)) {
- need_open = 1;
- qr_fd_ctx->open_in_transit = 1;
- }
+ qr_inode_t *qr_inode = NULL;
- if (qr_fd_ctx->opened) {
- can_wind = 1;
- } else {
- frame->local = qr_local_new (this);
- if (frame->local == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- stub = fop_fsync_stub (frame, qr_fsync_helper,
- fd, flags, xdata);
- if (stub == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- list_add_tail (&stub->list,
- &qr_fd_ctx->waiting_ops);
- }
- }
- unlock:
- UNLOCK (&qr_fd_ctx->lock);
- } else {
- can_wind = 1;
- }
+ qr_inode = qr_inode_ctx_get (this, fd->inode);
+ if (!qr_inode)
+ goto wind;
-out:
- if (need_unwind) {
- QR_STACK_UNWIND (fsync, frame, op_ret, op_errno, NULL, NULL, NULL);
- } else if (can_wind) {
- STACK_WIND (frame, qr_fsync_cbk, FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsync, fd, flags, xdata);
- } else if (need_open) {
- op_ret = qr_loc_fill (&loc, fd->inode, path);
- if (op_ret == -1) {
- qr_resume_pending_ops (qr_fd_ctx, -1, errno);
- goto ret;
- }
-
- open_frame = create_frame (this, this->ctx->pool);
- if (open_frame == NULL) {
- qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM);
- qr_loc_wipe (&loc);
- goto ret;
- }
+ if (qr_readv_cached (frame, qr_inode, size, offset, flags, xdata) <= 0)
+ goto wind;
- STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open, &loc, open_flags,
- fd, NULL);
-
- qr_loc_wipe (&loc);
- }
-
-ret:
- return 0;
+ return 0;
+wind:
+ STACK_WIND (frame, default_readv_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv,
+ fd, size, offset, flags, xdata);
+ return 0;
}
-int32_t
-qr_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
+int
+qr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov,
+ int count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
{
- int32_t ret = 0;
- uint64_t value = 0;
- qr_inode_t *qr_inode = NULL;
- qr_local_t *local = NULL;
- qr_private_t *priv = NULL;
- qr_inode_table_t *table = NULL;
-
- GF_ASSERT (frame);
+ qr_inode_prune (this, fd->inode);
- if (op_ret == -1) {
- goto out;
- }
-
- local = frame->local;
- if ((local == NULL) || (local->fd == NULL)
- || (local->fd->inode == NULL)) {
- op_ret = -1;
- op_errno = EINVAL;
- gf_log (frame->this->name, GF_LOG_WARNING, "cannot get inode");
- goto out;
- }
-
- if ((this == NULL) || (this->private == NULL)) {
- gf_log (frame->this->name, GF_LOG_WARNING,
- (this == NULL) ? "xlator object (this) is NULL"
- : "cannot get quick read configuration from xlator "
- "object");
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
-
- priv = this->private;
- table = &priv->table;
-
- LOCK (&table->lock);
- {
- ret = inode_ctx_get (local->fd->inode, this, &value);
- if (ret == 0) {
- qr_inode = (qr_inode_t *)(long) value;
-
- if (qr_inode) {
- if (qr_inode->stbuf.ia_size != postbuf->ia_size)
- {
- inode_ctx_del (local->fd->inode, this,
- NULL);
- __qr_inode_free (qr_inode);
- }
- }
- }
- }
- UNLOCK (&table->lock);
-
-out:
- QR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, prebuf,
- postbuf, xdata);
- return 0;
+ STACK_WIND (frame, default_writev_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev,
+ fd, iov, count, offset, flags, iobref, xdata);
+ return 0;
}
-int32_t
-qr_ftruncate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
- off_t offset, dict_t *xdata)
+int
+qr_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
{
- qr_local_t *local = NULL;
- qr_fd_ctx_t *fdctx = NULL;
- uint64_t value = 0;
- int32_t ret = 0;
- int32_t op_errno = EINVAL;
-
- GF_ASSERT (frame);
-
- local = frame->local;
- GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
-
- if (local->op_ret < 0) {
- op_errno = local->op_errno;
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- fdctx = (qr_fd_ctx_t *)(long) value;
- }
+ qr_inode_prune (this, loc->inode);
- gf_log (this->name, GF_LOG_WARNING,
- "open failed on path (%s) (%s), unwinding ftruncate "
- "call",
- fdctx ? fdctx->path : NULL, strerror (op_errno));
- goto unwind;
- }
-
- STACK_WIND (frame, qr_ftruncate_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
- return 0;
-
-unwind:
- QR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ STACK_WIND (frame, default_truncate_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->truncate,
+ loc, offset, xdata);
+ return 0;
}
-int32_t
+int
qr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- dict_t *xdata)
-{
- int flags = 0;
- uint64_t value = 0;
- call_stub_t *stub = NULL;
- char *path = NULL;
- loc_t loc = {0, };
- qr_local_t *local = NULL;
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- int32_t ret = -1, op_ret = -1, op_errno = EINVAL;
- char need_open = 0, can_wind = 0, need_unwind = 0;
- call_frame_t *open_frame = NULL;
-
- GF_ASSERT (frame);
- if ((this == NULL) || (fd == NULL)) {
- gf_log (frame->this->name, GF_LOG_WARNING,
- (this == NULL) ? "xlator object (this) is NULL"
- : "fd is NULL");
- need_unwind = 1;
- goto out;
- }
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- qr_fd_ctx = (qr_fd_ctx_t *)(long)value;
- }
-
- local = qr_local_new (this);
- if (local == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- goto out;
- }
-
- local->fd = fd;
- frame->local = local;
-
- if (qr_fd_ctx) {
- LOCK (&qr_fd_ctx->lock);
- {
- path = qr_fd_ctx->path;
- flags = qr_fd_ctx->flags;
-
- if (!(qr_fd_ctx->opened
- || qr_fd_ctx->open_in_transit)) {
- need_open = 1;
- qr_fd_ctx->open_in_transit = 1;
- }
-
- if (qr_fd_ctx->opened) {
- can_wind = 1;
- } else {
- stub = fop_ftruncate_stub (frame,
- qr_ftruncate_helper,
- fd, offset, xdata);
- if (stub == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- list_add_tail (&stub->list,
- &qr_fd_ctx->waiting_ops);
- }
- }
- unlock:
- UNLOCK (&qr_fd_ctx->lock);
- } else {
- can_wind = 1;
- }
-
-out:
- if (need_unwind) {
- QR_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL,
- NULL, NULL);
- } else if (can_wind) {
- STACK_WIND (frame, qr_ftruncate_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
- } else if (need_open) {
- op_ret = qr_loc_fill (&loc, fd->inode, path);
- if (op_ret == -1) {
- qr_resume_pending_ops (qr_fd_ctx, -1, errno);
- goto ret;
- }
-
- open_frame = create_frame (this, this->ctx->pool);
- if (open_frame == NULL) {
- qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM);
- qr_loc_wipe (&loc);
- goto ret;
- }
-
- STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open, &loc, flags, fd,
- NULL);
-
- qr_loc_wipe (&loc);
- }
-
-ret:
- return 0;
-}
-
-
-int32_t
-qr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
- int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
-{
- GF_ASSERT (frame);
- QR_STACK_UNWIND (lk, frame, op_ret, op_errno, lock, xdata);
- return 0;
-}
-
-
-int32_t
-qr_lk_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
- struct gf_flock *lock, dict_t *xdata)
-{
- qr_local_t *local = NULL;
- qr_fd_ctx_t *fdctx = NULL;
- uint64_t value = 0;
- int32_t ret = 0;
- int32_t op_errno = EINVAL;
-
- GF_ASSERT (frame);
-
- local = frame->local;
- GF_VALIDATE_OR_GOTO (frame->this->name, local, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
- GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
-
- if (local->op_ret < 0) {
- op_errno = local->op_errno;
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- fdctx = (qr_fd_ctx_t *)(long) value;
- }
-
- gf_log (this->name, GF_LOG_WARNING,
- "open failed on path (%s) (%s), unwinding lk call",
- fdctx ? fdctx->path : NULL, strerror (op_errno));
- goto unwind;
- }
-
- STACK_WIND (frame, qr_lk_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lk, fd, cmd, lock, xdata);
-
- return 0;
-
-unwind:
- QR_STACK_UNWIND (lk, frame, -1, op_errno, lock, xdata);
- return 0;
-}
-
-
-int32_t
-qr_lk (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
- struct gf_flock *lock, dict_t *xdata)
-{
- int flags = 0;
- uint64_t value = 0;
- call_stub_t *stub = NULL;
- char *path = NULL;
- loc_t loc = {0, };
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- int32_t ret = -1, op_ret = -1, op_errno = EINVAL;
- char need_open = 0, can_wind = 0, need_unwind = 0;
- call_frame_t *open_frame = NULL;
-
- GF_ASSERT (frame);
- if ((this == NULL) || (fd == NULL)) {
- gf_log (frame->this->name, GF_LOG_WARNING,
- (this == NULL) ? "xlator object (this) is NULL"
- : "fd is NULL");
- need_unwind = 1;
- goto out;
- }
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret == 0) {
- qr_fd_ctx = (qr_fd_ctx_t *)(long)value;
- }
-
- if (qr_fd_ctx) {
- LOCK (&qr_fd_ctx->lock);
- {
- path = qr_fd_ctx->path;
- flags = qr_fd_ctx->flags;
-
- if (!(qr_fd_ctx->opened
- || qr_fd_ctx->open_in_transit)) {
- need_open = 1;
- qr_fd_ctx->open_in_transit = 1;
- }
-
- if (qr_fd_ctx->opened) {
- can_wind = 1;
- } else {
- frame->local = qr_local_new (this);
- if (frame->local == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- stub = fop_lk_stub (frame, qr_lk_helper, fd,
- cmd, lock, xdata);
- if (stub == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- need_unwind = 1;
- qr_fd_ctx->open_in_transit = 0;
- goto unlock;
- }
-
- list_add_tail (&stub->list,
- &qr_fd_ctx->waiting_ops);
- }
- }
- unlock:
- UNLOCK (&qr_fd_ctx->lock);
- } else {
- can_wind = 1;
- }
-
-out:
- if (need_unwind) {
- QR_STACK_UNWIND (lk, frame, op_ret, op_errno, NULL, NULL);
- } else if (can_wind) {
- STACK_WIND (frame, qr_lk_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lk, fd, cmd, lock, xdata);
- } else if (need_open) {
- op_ret = qr_loc_fill (&loc, fd->inode, path);
- if (op_ret == -1) {
- qr_resume_pending_ops (qr_fd_ctx, -1, errno);
- goto ret;
- }
-
- open_frame = create_frame (this, this->ctx->pool);
- if (open_frame == NULL) {
- qr_resume_pending_ops (qr_fd_ctx, -1, ENOMEM);
- qr_loc_wipe (&loc);
- goto ret;
- }
-
- STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open, &loc, flags, fd,
- NULL);
-
- qr_loc_wipe (&loc);
- }
-
-ret:
- return 0;
-}
-
-
-int32_t
-qr_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- QR_STACK_UNWIND (unlink, frame, op_ret, op_errno, preparent,
- postparent, xdata);
- return 0;
-}
-
-
-int32_t
-qr_unlink_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
- dict_t *xdata)
+ dict_t *xdata)
{
- qr_local_t *local = NULL;
- uint32_t open_count = 0;
- qr_unlink_ctx_t *unlink_ctx = NULL, *tmp = NULL;
-
- local = frame->local;
-
- LOCK (&local->lock);
- {
- open_count = --local->open_count;
- }
- UNLOCK (&local->lock);
-
- if (open_count > 0) {
- goto out;
- }
-
- list_for_each_entry_safe (unlink_ctx, tmp, &local->list, list) {
- fd_unref (unlink_ctx->fdctx->fd);
- list_del_init (&unlink_ctx->list);
- GF_FREE (unlink_ctx);
- }
-
- if (local->op_ret < 0) {
- /* unwind even if we couldn't open one fd */
- QR_STACK_UNWIND (unlink, frame, -1, local->op_errno, NULL, NULL,
- NULL);
- } else {
- STACK_WIND (frame, qr_unlink_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
- }
-
-out:
- return 0;
-}
-
-
-qr_unlink_ctx_t *
-qr_unlink_ctx_new ()
-{
- qr_unlink_ctx_t *ctx = NULL;
-
- ctx = GF_CALLOC (1, sizeof (*ctx), gf_qr_mt_qr_unlink_ctx_t);
- if (ctx == NULL) {
- goto out;
- }
-
- INIT_LIST_HEAD (&ctx->list);
-out:
- return ctx;
-}
-
-
-int32_t
-qr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
- dict_t *xdata)
-{
- int32_t op_errno = -1, ret = -1, op_ret = -1;
- uint64_t value = 0;
- char need_open = 0;
- qr_local_t *local = NULL;
- qr_fd_ctx_t *fdctx = NULL;
- call_frame_t *open_frame = NULL;
- call_stub_t *stub = NULL;
- qr_inode_t *qr_inode = NULL;
- uint32_t open_count = 0;
- qr_unlink_ctx_t *unlink_ctx = NULL;
-
- ret = inode_ctx_get (loc->inode, this, &value);
- if (ret == 0) {
- qr_inode = (qr_inode_t *)(unsigned long)value;
- }
-
- if (qr_inode == NULL) {
- goto wind;
- }
-
- local = qr_local_new (this);
- GF_VALIDATE_OR_GOTO_WITH_ERROR (this->name, local, unwind, op_errno,
- ENOMEM);
-
- frame->local = local;
-
- op_ret = 0;
-
- LOCK (&loc->inode->lock);
- {
- list_for_each_entry (fdctx, &qr_inode->fd_list, inode_list) {
- need_open = 0;
-
- LOCK (&fdctx->lock);
- {
- if (qr_inode->stbuf.ia_nlink == 1) {
- fdctx->disabled = 1;
- }
-
- if ((fdctx->opened)
- || (strcmp (loc->path, fdctx->path) != 0)) {
- goto unlock;
- }
-
- if (!(fdctx->opened
- || fdctx->open_in_transit)) {
- need_open = 1;
- fdctx->open_in_transit = 1;
- }
-
- if (!fdctx->opened) {
- unlink_ctx = qr_unlink_ctx_new ();
- if (unlink_ctx == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- fdctx->open_in_transit = 0;
- goto unlock;
- }
-
- stub = fop_unlink_stub (frame,
- qr_unlink_helper,
- loc, xflag,
- xdata);
- if (stub == NULL) {
- op_ret = -1;
- op_errno = ENOMEM;
- fdctx->open_in_transit = 0;
- GF_FREE (unlink_ctx);
- goto unlock;
- }
-
- list_add_tail (&stub->list,
- &fdctx->waiting_ops);
-
- local->open_count++;
-
- unlink_ctx->need_open = need_open;
- __fd_ref (fdctx->fd);
- unlink_ctx->fdctx = fdctx;
- list_add_tail (&unlink_ctx->list,
- &local->list);
- }
- }
- unlock:
- UNLOCK (&fdctx->lock);
-
- if (op_ret == -1) {
- break;
- }
- }
-
- open_count = local->open_count;
- }
- UNLOCK (&loc->inode->lock);
-
- if (op_ret == -1) {
- goto unwind;
- }
-
- if (open_count == 0) {
- goto wind;
- }
-
- /* no need to hold local->lock, since we are gaurded by condition
- * local->open_count cannot be zero till we send open on
- * all the required fds. qr_unlink_helper will not modify
- * local->list till local->open_count becomes 0.
- */
- list_for_each_entry (unlink_ctx, &local->list, list) {
- if (!unlink_ctx->need_open) {
- continue;
- }
-
- fdctx = unlink_ctx->fdctx;
- open_frame = create_frame (this, this->ctx->pool);
- if (open_frame == NULL) {
- qr_resume_pending_ops (fdctx, -1, ENOMEM);
- continue;
- }
-
- STACK_WIND (open_frame, qr_open_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open,
- loc, fdctx->flags, fdctx->fd, fdctx->xdata);
- }
-
- return 0;
-
-unwind:
- if (local && !list_empty (&local->list)) {
- list_for_each_entry (unlink_ctx, &local->list, list) {
- qr_resume_pending_ops (unlink_ctx->fdctx, -1, op_errno);
- }
- } else {
- QR_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
- }
-
- return 0;
+ qr_inode_prune (this, fd->inode);
-wind:
- STACK_WIND (frame, qr_unlink_cbk, FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
- return 0;
+ STACK_WIND (frame, default_ftruncate_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->ftruncate,
+ fd, offset, xdata);
+ return 0;
}
-int32_t
-qr_release (xlator_t *this, fd_t *fd)
+int
+qr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ fd_t *fd, dict_t *xdata)
{
- qr_fd_ctx_t *qr_fd_ctx = NULL;
- int32_t ret = 0;
- uint64_t value = 0;
-
- GF_VALIDATE_OR_GOTO ("quick-read", this, out);
- GF_VALIDATE_OR_GOTO (this->name, fd, out);
+ qr_inode_set_priority (this, fd->inode, loc->path);
- ret = fd_ctx_del (fd, this, &value);
- if (ret == 0) {
- qr_fd_ctx = (qr_fd_ctx_t *)(long) value;
- if (qr_fd_ctx) {
- qr_fd_ctx_free (qr_fd_ctx);
- }
- }
-
-out:
- return 0;
+ STACK_WIND (frame, default_open_cbk,
+ FIRST_CHILD (this), FIRST_CHILD (this)->fops->open,
+ loc, flags, fd, xdata);
+ return 0;
}
-
-int32_t
+int
qr_forget (xlator_t *this, inode_t *inode)
{
qr_inode_t *qr_inode = NULL;
- uint64_t value = 0;
- int32_t ret = -1;
- qr_private_t *priv = NULL;
- GF_VALIDATE_OR_GOTO ("quick-read", this, out);
- GF_VALIDATE_OR_GOTO (this->name, this->private, out);
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ qr_inode = qr_inode_ctx_get (this, inode);
- priv = this->private;
+ if (!qr_inode)
+ return 0;
- LOCK (&priv->table.lock);
- {
- ret = inode_ctx_del (inode, this, &value);
- if (ret == 0) {
- qr_inode = (qr_inode_t *)(long) value;
- __qr_inode_free (qr_inode);
- }
- }
- UNLOCK (&priv->table.lock);
+ qr_inode_prune (this, inode);
-out:
- return 0;
+ GF_FREE (qr_inode);
+
+ return 0;
}
@@ -3427,32 +696,25 @@ int32_t
qr_inodectx_dump (xlator_t *this, inode_t *inode)
{
qr_inode_t *qr_inode = NULL;
- uint64_t value = 0;
int32_t ret = -1;
char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, };
char buf[256] = {0, };
- ret = inode_ctx_get (inode, this, &value);
- if (ret != 0) {
+ qr_inode = qr_inode_ctx_get (this, inode);
+ if (!qr_inode)
goto out;
- }
-
- qr_inode = (qr_inode_t *)(long)value;
- if (qr_inode == NULL) {
- goto out;
- }
gf_proc_dump_build_key (key_prefix, "xlator.performance.quick-read",
"inodectx");
gf_proc_dump_add_section (key_prefix);
- gf_proc_dump_write ("entire-file-cached", "%s", qr_inode->xattr ? "yes" : "no");
+ gf_proc_dump_write ("entire-file-cached", "%s", qr_inode->data ? "yes" : "no");
- if (qr_inode->tv.tv_sec) {
- gf_time_fmt (buf, sizeof buf, qr_inode->tv.tv_sec,
+ if (qr_inode->last_refresh.tv_sec) {
+ gf_time_fmt (buf, sizeof buf, qr_inode->last_refresh.tv_sec,
gf_timefmt_FT);
snprintf (buf + strlen (buf), sizeof buf - strlen (buf),
- ".%"GF_PRI_SUSECONDS, qr_inode->tv.tv_usec);
+ ".%"GF_PRI_SUSECONDS, qr_inode->last_refresh.tv_usec);
gf_proc_dump_write ("last-cache-validation-time", "%s", buf);
}
@@ -3462,77 +724,6 @@ out:
return ret;
}
-int32_t
-qr_fdctx_dump (xlator_t *this, fd_t *fd)
-{
- qr_fd_ctx_t *fdctx = NULL;
- uint64_t value = 0;
- int32_t ret = 0, i = 0;
- char key[GF_DUMP_MAX_BUF_LEN] = {0, };
- char key_prefix[GF_DUMP_MAX_BUF_LEN] = {0, };
- call_stub_t *stub = NULL;
- gf_boolean_t add_section = _gf_false;
-
- ret = fd_ctx_get (fd, this, &value);
- if (ret != 0) {
- goto out;
- }
-
- fdctx = (qr_fd_ctx_t *)(long)value;
- if (fdctx == NULL) {
- goto out;
- }
-
- gf_proc_dump_build_key (key_prefix, "xlator.performance.quick-read",
- "fdctx");
- gf_proc_dump_add_section (key_prefix);
- add_section = _gf_true;
-
- gf_proc_dump_write ("fd", "%p", fd);
-
- ret = TRY_LOCK (&fdctx->lock);
- if (ret)
- goto out;
- {
- gf_proc_dump_write ("path", "%s", fdctx->path);
-
- gf_proc_dump_write ("opened", "%s", fdctx->opened ? "yes" : "no");
-
- gf_proc_dump_write ("open-in-progress", "%s", fdctx->open_in_transit ?
- "yes" : "no");
-
- gf_proc_dump_write ("caching disabled (for this fd)", "%s",
- fdctx->disabled ? "yes" : "no");
-
- gf_proc_dump_write ("flags", "%d", fdctx->flags);
-
- list_for_each_entry (stub, &fdctx->waiting_ops, list) {
- gf_proc_dump_build_key (key, "",
- "waiting-ops[%d].frame", i);
- gf_proc_dump_write (key, "%"PRId64,
- stub->frame->root->unique);
-
- gf_proc_dump_build_key (key, "",
- "waiting-ops[%d].fop", i);
- gf_proc_dump_write (key, "%s", gf_fop_list[stub->fop]);
-
- i++;
- }
- }
- UNLOCK (&fdctx->lock);
-
- ret = 0;
-out:
- if (ret && fdctx) {
- if (add_section == _gf_false)
- gf_proc_dump_add_section (key_prefix);
-
- gf_proc_dump_write ("Unable to dump the state of fdctx",
- "(Lock acquistion failed) fd: %p, gfid: %s",
- fd, uuid_utoa (fd->inode->gfid));
- }
- return ret;
-}
int
qr_priv_dump (xlator_t *this)
@@ -3553,14 +744,11 @@ qr_priv_dump (xlator_t *this)
priv = this->private;
conf = &priv->conf;
- if (!conf) {
- gf_log (this->name, GF_LOG_WARNING, "conf null in xlator");
+ if (!conf)
return -1;
- }
table = &priv->table;
-
gf_proc_dump_build_key (key_prefix, "xlator.performance.quick-read",
"priv");
@@ -3570,13 +758,12 @@ qr_priv_dump (xlator_t *this)
gf_proc_dump_write ("cache_timeout", "%d", conf->cache_timeout);
if (!table) {
- gf_log (this->name, GF_LOG_WARNING, "table is NULL");
goto out;
} else {
for (i = 0; i < conf->max_pri; i++) {
list_for_each_entry (curr, &table->lru[i], lru) {
file_count++;
- total_size += curr->stbuf.ia_size;
+ total_size += curr->size;
}
}
}
@@ -3652,6 +839,7 @@ reconfigure (xlator_t *this, dict_t *options)
qr_private_t *priv = NULL;
qr_conf_t *conf = NULL;
uint64_t cache_size_new = 0;
+
GF_VALIDATE_OR_GOTO ("quick-read", this, out);
GF_VALIDATE_OR_GOTO (this->name, this->private, out);
GF_VALIDATE_OR_GOTO (this->name, options, out);
@@ -3666,7 +854,7 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("cache-timeout", conf->cache_timeout, options, int32,
out);
- GF_OPTION_RECONF ("cache-size", cache_size_new, options, size, out);
+ GF_OPTION_RECONF ("cache-size", cache_size_new, options, size_uint64, out);
if (!check_cache_size_ok (this, cache_size_new)) {
ret = -1;
gf_log (this->name, GF_LOG_ERROR,
@@ -3807,11 +995,11 @@ init (xlator_t *this)
LOCK_INIT (&priv->table.lock);
conf = &priv->conf;
- GF_OPTION_INIT ("max-file-size", conf->max_file_size, size, out);
+ GF_OPTION_INIT ("max-file-size", conf->max_file_size, size_uint64, out);
GF_OPTION_INIT ("cache-timeout", conf->cache_timeout, int32, out);
- GF_OPTION_INIT ("cache-size", conf->cache_size, size, out);
+ GF_OPTION_INIT ("cache-size", conf->cache_size, size_uint64, out);
if (!check_cache_size_ok (this, conf->cache_size)) {
ret = -1;
goto out;
@@ -3845,14 +1033,6 @@ init (xlator_t *this)
INIT_LIST_HEAD (&priv->table.lru[i]);
}
- this->local_pool = mem_pool_new (qr_local_t, 64);
- if (!this->local_pool) {
- ret = -1;
- gf_log (this->name, GF_LOG_ERROR,
- "failed to create local_t's memory pool");
- goto out;
- }
-
ret = 0;
this->private = priv;
@@ -3924,31 +1104,21 @@ out:
struct xlator_fops fops = {
.lookup = qr_lookup,
+ .readdirp = qr_readdirp,
.open = qr_open,
.readv = qr_readv,
- .writev = qr_writev,
- .fstat = qr_fstat,
- .fsetxattr = qr_fsetxattr,
- .fgetxattr = qr_fgetxattr,
- .flush = qr_flush,
- .fentrylk = qr_fentrylk,
- .finodelk = qr_finodelk,
- .fsync = qr_fsync,
- .ftruncate = qr_ftruncate,
- .lk = qr_lk,
- .fsetattr = qr_fsetattr,
- .unlink = qr_unlink,
+ .writev = qr_writev,
+ .truncate = qr_truncate,
+ .ftruncate = qr_ftruncate
};
struct xlator_cbks cbks = {
.forget = qr_forget,
- .release = qr_release,
};
struct xlator_dumpops dumpops = {
.priv = qr_priv_dump,
.inodectx = qr_inodectx_dump,
- .fdctx = qr_fdctx_dump
};
struct volume_options options[] = {
@@ -3974,4 +1144,5 @@ struct volume_options options[] = {
.max = 1 * GF_UNIT_KB * 1000,
.default_value = "64KB",
},
+ { .key = {NULL} }
};
diff --git a/xlators/performance/quick-read/src/quick-read.h b/xlators/performance/quick-read/src/quick-read.h
index 10a04e79c..6f0a05417 100644
--- a/xlators/performance/quick-read/src/quick-read.h
+++ b/xlators/performance/quick-read/src/quick-read.h
@@ -34,48 +34,20 @@
#include <fnmatch.h>
#include "quick-read-mem-types.h"
-struct qr_fd_ctx {
- char opened;
- char disabled;
- char open_in_transit;
- char *path;
- int flags;
- int wbflags;
- struct list_head waiting_ops;
- gf_lock_t lock;
- struct list_head inode_list;
- fd_t *fd;
- dict_t *xdata;
-};
-typedef struct qr_fd_ctx qr_fd_ctx_t;
-
-struct qr_local {
- char is_open;
- char *path;
- char just_validated;
- fd_t *fd;
- int open_flags;
- int32_t op_ret;
- int32_t op_errno;
- uint32_t open_count;
- call_stub_t *stub;
- struct list_head list;
- gf_lock_t lock;
-};
-typedef struct qr_local qr_local_t;
struct qr_inode {
- dict_t *xattr;
- inode_t *inode;
+ void *data;
+ size_t size;
int priority;
- struct iatt stbuf;
- struct timeval tv;
+ uint32_t ia_mtime;
+ uint32_t ia_mtime_nsec;
+ struct iatt buf;
+ struct timeval last_refresh;
struct list_head lru;
- struct list_head fd_list;
- struct list_head unlinked_dentries;
};
typedef struct qr_inode qr_inode_t;
+
struct qr_priority {
char *pattern;
int32_t priority;
@@ -105,20 +77,5 @@ struct qr_private {
};
typedef struct qr_private qr_private_t;
-struct qr_unlink_ctx {
- struct list_head list;
- qr_fd_ctx_t *fdctx;
- char need_open;
-};
-typedef struct qr_unlink_ctx qr_unlink_ctx_t;
-
-void qr_local_free (qr_local_t *local);
-
-#define QR_STACK_UNWIND(op, frame, params ...) do { \
- qr_local_t *__local = frame->local; \
- frame->local = NULL; \
- STACK_UNWIND_STRICT (op, frame, params); \
- qr_local_free (__local); \
- } while (0)
#endif /* #ifndef __QUICK_READ_H */
diff --git a/xlators/performance/read-ahead/src/Makefile.am b/xlators/performance/read-ahead/src/Makefile.am
index ae2b1ace9..be80ae7ac 100644
--- a/xlators/performance/read-ahead/src/Makefile.am
+++ b/xlators/performance/read-ahead/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = read-ahead.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-read_ahead_la_LDFLAGS = -module -avoidversion
+read_ahead_la_LDFLAGS = -module -avoid-version
read_ahead_la_SOURCES = read-ahead.c page.c
read_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/performance/read-ahead/src/page.c b/xlators/performance/read-ahead/src/page.c
index e79e7ae78..6e5b52c5e 100644
--- a/xlators/performance/read-ahead/src/page.c
+++ b/xlators/performance/read-ahead/src/page.c
@@ -421,7 +421,12 @@ ra_frame_unwind (call_frame_t *frame)
fill->count * sizeof (*vector));
copied += (fill->count * sizeof (*vector));
- iobref_merge (iobref, fill->iobref);
+ if (iobref_merge (iobref, fill->iobref)) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ iobref_unref (iobref);
+ iobref = NULL;
+ }
}
fill->next->prev = fill->prev;
diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c
index 549496755..01c861d52 100644
--- a/xlators/performance/read-ahead/src/read-ahead.c
+++ b/xlators/performance/read-ahead/src/read-ahead.c
@@ -498,7 +498,7 @@ ra_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
"expected offset (%"PRId64") when page_count=%d",
offset, file->page_count);
- if (file->expected < (conf->page_size * conf->page_count)) {
+ if (file->expected < (file->page_size * conf->page_count)) {
file->expected += size;
file->page_count = min ((file->expected
/ file->page_size),
@@ -942,6 +942,106 @@ unwind:
return 0;
}
+int
+ra_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ GF_ASSERT (frame);
+
+ STACK_UNWIND_STRICT (discard, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ return 0;
+}
+
+static int
+ra_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ ra_file_t *file = NULL;
+ fd_t *iter_fd = NULL;
+ inode_t *inode = NULL;
+ uint64_t tmp_file = 0;
+ int32_t op_errno = EINVAL;
+
+ GF_ASSERT (frame);
+ GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
+ GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
+
+ inode = fd->inode;
+
+ LOCK (&inode->lock);
+ {
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ fd_ctx_get (iter_fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+ if (!file)
+ continue;
+
+ flush_region(frame, file, offset, len, 1);
+ }
+ }
+ UNLOCK (&inode->lock);
+
+ STACK_WIND (frame, ra_discard_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->discard, fd, offset, len, xdata);
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (discard, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+ra_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ GF_ASSERT (frame);
+
+ STACK_UNWIND_STRICT (zerofill, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ return 0;
+}
+
+static int
+ra_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ ra_file_t *file = NULL;
+ fd_t *iter_fd = NULL;
+ inode_t *inode = NULL;
+ uint64_t tmp_file = 0;
+ int32_t op_errno = EINVAL;
+
+ GF_ASSERT (frame);
+ GF_VALIDATE_OR_GOTO (frame->this->name, this, unwind);
+ GF_VALIDATE_OR_GOTO (frame->this->name, fd, unwind);
+
+ inode = fd->inode;
+
+ LOCK (&inode->lock);
+ {
+ list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+ fd_ctx_get (iter_fd, this, &tmp_file);
+ file = (ra_file_t *)(long)tmp_file;
+ if (!file)
+ continue;
+
+ flush_region(frame, file, offset, len, 1);
+ }
+ }
+ UNLOCK (&inode->lock);
+
+ STACK_WIND (frame, ra_zerofill_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->zerofill, fd,
+ offset, len, xdata);
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
int
ra_priv_dump (xlator_t *this)
@@ -1024,6 +1124,9 @@ reconfigure (xlator_t *this, dict_t *options)
GF_OPTION_RECONF ("page-count", conf->page_count, options, uint32, out);
+ GF_OPTION_RECONF ("page-size", conf->page_size, options, size_uint64,
+ out);
+
ret = 0;
out:
return ret;
@@ -1056,6 +1159,8 @@ init (xlator_t *this)
conf->page_size = this->ctx->page_size;
+ GF_OPTION_INIT ("page-size", conf->page_size, size_uint64, out);
+
GF_OPTION_INIT ("page-count", conf->page_count, uint32, out);
GF_OPTION_INIT ("force-atime-update", conf->force_atime_update, bool, out);
@@ -1119,6 +1224,8 @@ struct xlator_fops fops = {
.truncate = ra_truncate,
.ftruncate = ra_ftruncate,
.fstat = ra_fstat,
+ .discard = ra_discard,
+ .zerofill = ra_zerofill,
};
struct xlator_cbks cbks = {
@@ -1142,5 +1249,12 @@ struct volume_options options[] = {
.default_value = "4",
.description = "Number of pages that will be pre-fetched"
},
+ { .key = {"page-size"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .min = 4096,
+ .max = 1048576 * 64,
+ .default_value = "131072",
+ .description = "Page size with which read-ahead performs server I/O"
+ },
{ .key = {NULL} },
};
diff --git a/xlators/performance/readdir-ahead/Makefile.am b/xlators/performance/readdir-ahead/Makefile.am
new file mode 100644
index 000000000..a985f42a8
--- /dev/null
+++ b/xlators/performance/readdir-ahead/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/performance/readdir-ahead/src/Makefile.am b/xlators/performance/readdir-ahead/src/Makefile.am
new file mode 100644
index 000000000..539d6ede4
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/Makefile.am
@@ -0,0 +1,15 @@
+xlator_LTLIBRARIES = readdir-ahead.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
+
+readdir_ahead_la_LDFLAGS = -module -avoid-version
+
+readdir_ahead_la_SOURCES = readdir-ahead.c
+readdir_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = readdir-ahead.h readdir-ahead-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h
new file mode 100644
index 000000000..39e2c5369
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h
@@ -0,0 +1,24 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef __RDA_MEM_TYPES_H__
+#define __RDA_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+enum gf_rda_mem_types_ {
+ gf_rda_mt_rda_local = gf_common_mt_end + 1,
+ gf_rda_mt_rda_fd_ctx,
+ gf_rda_mt_rda_priv,
+ gf_rda_mt_end
+};
+
+#endif
diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.c b/xlators/performance/readdir-ahead/src/readdir-ahead.c
new file mode 100644
index 000000000..ba96bfcd3
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/readdir-ahead.c
@@ -0,0 +1,560 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+/*
+ * performance/readdir-ahead preloads a local buffer with directory entries
+ * on opendir. The optimization involves using maximum sized gluster rpc
+ * requests (128k) to minimize overhead of smaller client requests.
+ *
+ * For example, fuse currently supports a maximum readdir buffer of 4k
+ * (regardless of the filesystem client's buffer size). readdir-ahead should
+ * effectively convert these smaller requests into fewer, larger sized requests
+ * for simple, sequential workloads (i.e., ls).
+ *
+ * The translator is currently designed to handle the simple, sequential case
+ * only. If a non-sequential directory read occurs, readdir-ahead disables
+ * preloads on the directory.
+ */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "call-stub.h"
+#include "readdir-ahead.h"
+#include "readdir-ahead-mem-types.h"
+#include "defaults.h"
+
+static int rda_fill_fd(call_frame_t *, xlator_t *, fd_t *);
+
+/*
+ * Get (or create) the fd context for storing prepopulated directory
+ * entries.
+ */
+static struct
+rda_fd_ctx *get_rda_fd_ctx(fd_t *fd, xlator_t *this)
+{
+ uint64_t val;
+ struct rda_fd_ctx *ctx;
+
+ LOCK(&fd->lock);
+
+ if (__fd_ctx_get(fd, this, &val) < 0) {
+ ctx = GF_CALLOC(1, sizeof(struct rda_fd_ctx),
+ gf_rda_mt_rda_fd_ctx);
+ if (!ctx)
+ goto out;
+
+ LOCK_INIT(&ctx->lock);
+ INIT_LIST_HEAD(&ctx->entries.list);
+ ctx->state = RDA_FD_NEW;
+ /* ctx offset values initialized to 0 */
+
+ if (__fd_ctx_set(fd, this, (uint64_t) ctx) < 0) {
+ GF_FREE(ctx);
+ ctx = NULL;
+ goto out;
+ }
+ } else {
+ ctx = (struct rda_fd_ctx *) val;
+ }
+out:
+ UNLOCK(&fd->lock);
+ return ctx;
+}
+
+/*
+ * Reset the tracking state of the context.
+ */
+static void
+rda_reset_ctx(struct rda_fd_ctx *ctx)
+{
+ ctx->state = RDA_FD_NEW;
+ ctx->cur_offset = 0;
+ ctx->cur_size = 0;
+ ctx->next_offset = 0;
+ gf_dirent_free(&ctx->entries);
+}
+
+/*
+ * Check whether we can handle a request. Offset verification is done by the
+ * caller, so we only check whether the preload buffer has completion status
+ * (including an error) or has some data to return.
+ */
+static gf_boolean_t
+rda_can_serve_readdirp(struct rda_fd_ctx *ctx, size_t request_size)
+{
+ if ((ctx->state & RDA_FD_EOD) ||
+ (ctx->state & RDA_FD_ERROR) ||
+ (!(ctx->state & RDA_FD_PLUGGED) && (ctx->cur_size > 0)))
+ return _gf_true;
+
+ return _gf_false;
+}
+
+/*
+ * Serve a request from the fd dentry list based on the size of the request
+ * buffer. ctx must be locked.
+ */
+static int32_t
+__rda_serve_readdirp(xlator_t *this, gf_dirent_t *entries, size_t request_size,
+ struct rda_fd_ctx *ctx)
+{
+ gf_dirent_t *dirent, *tmp;
+ size_t dirent_size, size = 0;
+ int32_t count = 0;
+ struct rda_priv *priv = this->private;
+
+ list_for_each_entry_safe(dirent, tmp, &ctx->entries.list, list) {
+ dirent_size = gf_dirent_size(dirent->d_name);
+ if (size + dirent_size > request_size)
+ break;
+
+ size += dirent_size;
+ list_del_init(&dirent->list);
+ ctx->cur_size -= dirent_size;
+
+ list_add_tail(&dirent->list, &entries->list);
+ ctx->cur_offset = dirent->d_off;
+ count++;
+ }
+
+ if (ctx->cur_size <= priv->rda_low_wmark)
+ ctx->state |= RDA_FD_PLUGGED;
+
+ return count;
+}
+
+static int32_t
+rda_readdirp_stub(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata)
+{
+ gf_dirent_t entries;
+ int32_t ret;
+ struct rda_fd_ctx *ctx;
+ int op_errno = 0;
+
+ ctx = get_rda_fd_ctx(fd, this);
+ INIT_LIST_HEAD(&entries.list);
+ ret = __rda_serve_readdirp(this, &entries, size, ctx);
+
+ if (!ret && (ctx->state & RDA_FD_ERROR)) {
+ ret = -1;
+ op_errno = ctx->op_errno;
+ ctx->state &= ~RDA_FD_ERROR;
+
+ /*
+ * the preload has stopped running in the event of an error, so
+ * pass all future requests along
+ */
+ ctx->state |= RDA_FD_BYPASS;
+ }
+
+ STACK_UNWIND_STRICT(readdirp, frame, ret, op_errno, &entries, xdata);
+ gf_dirent_free(&entries);
+
+ return 0;
+}
+
+static int32_t
+rda_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *xdata)
+{
+ struct rda_fd_ctx *ctx;
+ call_stub_t *stub;
+ int fill = 0;
+
+ ctx = get_rda_fd_ctx(fd, this);
+ if (!ctx)
+ goto err;
+
+ if (ctx->state & RDA_FD_BYPASS)
+ goto bypass;
+
+ LOCK(&ctx->lock);
+
+ /* recheck now that we have the lock */
+ if (ctx->state & RDA_FD_BYPASS) {
+ UNLOCK(&ctx->lock);
+ goto bypass;
+ }
+
+ /*
+ * If a new read comes in at offset 0 and the buffer has been
+ * completed, reset the context and kickstart the filler again.
+ */
+ if (!off && (ctx->state & RDA_FD_EOD) && (ctx->cur_size == 0)) {
+ rda_reset_ctx(ctx);
+ fill = 1;
+ }
+
+ /*
+ * If a readdir occurs at an unexpected offset or we already have a
+ * request pending, admit defeat and just get out of the way.
+ */
+ if (off != ctx->cur_offset || ctx->stub) {
+ ctx->state |= RDA_FD_BYPASS;
+ UNLOCK(&ctx->lock);
+ goto bypass;
+ }
+
+ stub = fop_readdirp_stub(frame, rda_readdirp_stub, fd, size, off, xdata);
+ if (!stub) {
+ UNLOCK(&ctx->lock);
+ goto err;
+ }
+
+ /*
+ * If we haven't bypassed the preload, this means we can either serve
+ * the request out of the preload or the request that enables us to do
+ * so is in flight...
+ */
+ if (rda_can_serve_readdirp(ctx, size))
+ call_resume(stub);
+ else
+ ctx->stub = stub;
+
+ UNLOCK(&ctx->lock);
+
+ if (fill)
+ rda_fill_fd(frame, this, fd);
+
+ return 0;
+
+bypass:
+ STACK_WIND(frame, default_readdirp_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd, size, off, xdata);
+ return 0;
+
+err:
+ STACK_UNWIND_STRICT(readdirp, frame, -1, ENOMEM, NULL, NULL);
+ return 0;
+}
+
+static int32_t
+rda_fill_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
+{
+ gf_dirent_t *dirent, *tmp;
+ struct rda_local *local = frame->local;
+ struct rda_fd_ctx *ctx = local->ctx;
+ struct rda_priv *priv = this->private;
+ int fill = 1;
+
+ LOCK(&ctx->lock);
+
+ /* Verify that the preload buffer is still pending on this data. */
+ if (ctx->next_offset != local->offset) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Out of sequence directory preload.");
+ ctx->state |= (RDA_FD_BYPASS|RDA_FD_ERROR);
+ ctx->op_errno = EUCLEAN;
+
+ goto out;
+ }
+
+ if (entries) {
+ list_for_each_entry_safe(dirent, tmp, &entries->list, list) {
+ list_del_init(&dirent->list);
+ /* must preserve entry order */
+ list_add_tail(&dirent->list, &ctx->entries.list);
+
+ ctx->cur_size += gf_dirent_size(dirent->d_name);
+ ctx->next_offset = dirent->d_off;
+ }
+ }
+
+ if (ctx->cur_size >= priv->rda_high_wmark)
+ ctx->state &= ~RDA_FD_PLUGGED;
+
+ if (!op_ret) {
+ /* we've hit eod */
+ ctx->state &= ~RDA_FD_RUNNING;
+ ctx->state |= RDA_FD_EOD;
+ } else if (op_ret == -1) {
+ /* kill the preload and pend the error */
+ ctx->state &= ~RDA_FD_RUNNING;
+ ctx->state |= RDA_FD_ERROR;
+ ctx->op_errno = op_errno;
+ }
+
+ /*
+ * NOTE: The strict bypass logic in readdirp() means a pending request
+ * is always based on ctx->cur_offset.
+ */
+ if (ctx->stub &&
+ rda_can_serve_readdirp(ctx, ctx->stub->args.size)) {
+ call_resume(ctx->stub);
+ ctx->stub = NULL;
+ }
+
+out:
+ /*
+ * If we have been marked for bypass and have no pending stub, clear the
+ * run state so we stop preloading the context with entries.
+ */
+ if ((ctx->state & RDA_FD_BYPASS) && !ctx->stub)
+ ctx->state &= ~RDA_FD_RUNNING;
+
+ if (!(ctx->state & RDA_FD_RUNNING)) {
+ fill = 0;
+ STACK_DESTROY(ctx->fill_frame->root);
+ ctx->fill_frame = NULL;
+ }
+
+ UNLOCK(&ctx->lock);
+
+ if (fill)
+ rda_fill_fd(frame, this, local->fd);
+
+ return 0;
+}
+
+/*
+ * Start prepopulating the fd context with directory entries.
+ */
+static int
+rda_fill_fd(call_frame_t *frame, xlator_t *this, fd_t *fd)
+{
+ call_frame_t *nframe = NULL;
+ struct rda_local *local = NULL;
+ struct rda_fd_ctx *ctx;
+ off_t offset;
+ struct rda_priv *priv = this->private;
+
+ ctx = get_rda_fd_ctx(fd, this);
+ if (!ctx)
+ goto err;
+
+ LOCK(&ctx->lock);
+
+ if (ctx->state & RDA_FD_NEW) {
+ ctx->state &= ~RDA_FD_NEW;
+ ctx->state |= RDA_FD_RUNNING;
+ if (priv->rda_low_wmark)
+ ctx->state |= RDA_FD_PLUGGED;
+ }
+
+ offset = ctx->next_offset;
+
+ if (!ctx->fill_frame) {
+ nframe = copy_frame(frame);
+ if (!nframe) {
+ UNLOCK(&ctx->lock);
+ goto err;
+ }
+
+ local = mem_get0(this->local_pool);
+ if (!local) {
+ UNLOCK(&ctx->lock);
+ goto err;
+ }
+
+ local->ctx = ctx;
+ local->fd = fd;
+ nframe->local = local;
+
+ ctx->fill_frame = nframe;
+ } else {
+ nframe = ctx->fill_frame;
+ local = nframe->local;
+ }
+
+ local->offset = offset;
+
+ UNLOCK(&ctx->lock);
+
+ STACK_WIND(nframe, rda_fill_fd_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd, priv->rda_req_size,
+ offset, NULL);
+
+ return 0;
+
+err:
+ if (nframe)
+ FRAME_DESTROY(nframe);
+
+ return -1;
+}
+
+static int32_t
+rda_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
+{
+ if (!op_ret)
+ rda_fill_fd(frame, this, fd);
+
+ STACK_UNWIND_STRICT(opendir, frame, op_ret, op_errno, fd, xdata);
+ return 0;
+}
+
+static int32_t
+rda_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
+{
+ STACK_WIND(frame, rda_opendir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->opendir, loc, fd, xdata);
+ return 0;
+}
+
+static int32_t
+rda_releasedir(xlator_t *this, fd_t *fd)
+{
+ uint64_t val;
+ struct rda_fd_ctx *ctx;
+
+ if (fd_ctx_del(fd, this, &val) < 0)
+ return -1;
+
+ ctx = (struct rda_fd_ctx *) val;
+ if (!ctx)
+ return 0;
+
+ rda_reset_ctx(ctx);
+
+ if (ctx->fill_frame)
+ STACK_DESTROY(ctx->fill_frame->root);
+
+ if (ctx->stub)
+ gf_log(this->name, GF_LOG_ERROR,
+ "released a directory with a pending stub");
+
+ GF_FREE(ctx);
+ return 0;
+}
+
+int32_t
+mem_acct_init(xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ goto out;
+
+ ret = xlator_mem_acct_init(this, gf_rda_mt_end + 1);
+
+ if (ret != 0)
+ gf_log(this->name, GF_LOG_ERROR, "Memory accounting init"
+ "failed");
+
+out:
+ return ret;
+}
+
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+ struct rda_priv *priv = this->private;
+
+ GF_OPTION_RECONF("rda-request-size", priv->rda_req_size, options,
+ uint32, err);
+ GF_OPTION_RECONF("rda-low-wmark", priv->rda_low_wmark, options, size_uint64,
+ err);
+ GF_OPTION_RECONF("rda-high-wmark", priv->rda_high_wmark, options, size_uint64,
+ err);
+
+ return 0;
+err:
+ return -1;
+}
+
+int
+init(xlator_t *this)
+{
+ struct rda_priv *priv = NULL;
+
+ GF_VALIDATE_OR_GOTO("readdir-ahead", this, err);
+
+ if (!this->children || this->children->next) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "FATAL: readdir-ahead not configured with exactly one"
+ " child");
+ goto err;
+ }
+
+ if (!this->parents) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
+ }
+
+ priv = GF_CALLOC(1, sizeof(struct rda_priv), gf_rda_mt_rda_priv);
+ if (!priv)
+ goto err;
+ this->private = priv;
+
+ this->local_pool = mem_pool_new(struct rda_local, 32);
+ if (!this->local_pool)
+ goto err;
+
+ GF_OPTION_INIT("rda-request-size", priv->rda_req_size, uint32, err);
+ GF_OPTION_INIT("rda-low-wmark", priv->rda_low_wmark, size_uint64, err);
+ GF_OPTION_INIT("rda-high-wmark", priv->rda_high_wmark, size_uint64, err);
+
+ return 0;
+
+err:
+ if (this->local_pool)
+ mem_pool_destroy(this->local_pool);
+ if (priv)
+ GF_FREE(priv);
+
+ return -1;
+}
+
+
+void
+fini(xlator_t *this)
+{
+ GF_VALIDATE_OR_GOTO ("readdir-ahead", this, out);
+
+ GF_FREE(this->private);
+
+out:
+ return;
+}
+
+struct xlator_fops fops = {
+ .opendir = rda_opendir,
+ .readdirp = rda_readdirp,
+};
+
+struct xlator_cbks cbks = {
+ .releasedir = rda_releasedir,
+};
+
+struct volume_options options[] = {
+ { .key = {"rda-request-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 4096,
+ .max = 131072,
+ .default_value = "131072",
+ .description = "readdir-ahead request size",
+ },
+ { .key = {"rda-low-wmark"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .min = 0,
+ .max = 10 * GF_UNIT_MB,
+ .default_value = "4096",
+ .description = "the value under which we plug",
+ },
+ { .key = {"rda-high-wmark"},
+ .type = GF_OPTION_TYPE_SIZET,
+ .min = 0,
+ .max = 100 * GF_UNIT_MB,
+ .default_value = "131072",
+ .description = "the value over which we unplug",
+ },
+ { .key = {NULL} },
+};
+
diff --git a/xlators/performance/readdir-ahead/src/readdir-ahead.h b/xlators/performance/readdir-ahead/src/readdir-ahead.h
new file mode 100644
index 000000000..e48786dae
--- /dev/null
+++ b/xlators/performance/readdir-ahead/src/readdir-ahead.h
@@ -0,0 +1,46 @@
+/*
+ Copyright (c) 2008-2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef __READDIR_AHEAD_H
+#define __READDIR_AHEAD_H
+
+/* state flags */
+#define RDA_FD_NEW (1 << 0)
+#define RDA_FD_RUNNING (1 << 1)
+#define RDA_FD_EOD (1 << 2)
+#define RDA_FD_ERROR (1 << 3)
+#define RDA_FD_BYPASS (1 << 4)
+#define RDA_FD_PLUGGED (1 << 5)
+
+struct rda_fd_ctx {
+ off_t cur_offset; /* current head of the ctx */
+ size_t cur_size; /* current size of the preload */
+ off_t next_offset; /* tail of the ctx */
+ uint32_t state;
+ gf_lock_t lock;
+ gf_dirent_t entries;
+ call_frame_t *fill_frame;
+ call_stub_t *stub;
+ int op_errno;
+};
+
+struct rda_local {
+ struct rda_fd_ctx *ctx;
+ fd_t *fd;
+ off_t offset;
+};
+
+struct rda_priv {
+ uint32_t rda_req_size;
+ uint64_t rda_low_wmark;
+ uint64_t rda_high_wmark;
+};
+
+#endif /* __READDIR_AHEAD_H */
diff --git a/xlators/performance/symlink-cache/src/Makefile.am b/xlators/performance/symlink-cache/src/Makefile.am
index c37d93e86..4091c3293 100644
--- a/xlators/performance/symlink-cache/src/Makefile.am
+++ b/xlators/performance/symlink-cache/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = symlink-cache.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/performance
-symlink_cache_la_LDFLAGS = -module -avoidversion
+symlink_cache_la_LDFLAGS = -module -avoid-version
symlink_cache_la_SOURCES = symlink-cache.c
symlink_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/performance/write-behind/src/Makefile.am b/xlators/performance/write-behind/src/Makefile.am
index 5ca0462ae..6c829d8ee 100644
--- a/xlators/performance/write-behind/src/Makefile.am
+++ b/xlators/performance/write-behind/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = write-behind.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance
-write_behind_la_LDFLAGS = -module -avoidversion
+write_behind_la_LDFLAGS = -module -avoid-version
write_behind_la_SOURCES = write-behind.c
write_behind_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c
index ffa333ce8..3cb0d449b 100644
--- a/xlators/performance/write-behind/src/write-behind.c
+++ b/xlators/performance/write-behind/src/write-behind.c
@@ -43,13 +43,6 @@ typedef struct wb_inode {
used for trickling_writes
*/
- int32_t op_ret; /* Last found op_ret and op_errno
- while completing a liability
- operation. Will be picked by
- the next arriving writev/flush/fsync
- */
- int32_t op_errno;
-
list_head_t all; /* All requests, from enqueue() till destroy().
Used only for resetting generation
number when empty.
@@ -89,6 +82,12 @@ typedef struct wb_inode {
write-behind from this list, and therefore
get "upgraded" to the "liability" list.
*/
+ list_head_t wip; /* List of write calls in progress, SYNC or non-SYNC
+ which are currently STACK_WIND'ed towards the server.
+ This is for guaranteeing that no two overlapping
+ writes are in progress at the same time. Modules
+ like eager-lock in AFR depend on this behavior.
+ */
uint64_t gen; /* Liability generation number. Represents
the current 'state' of liability. Every
new addition to the liability list bumps
@@ -109,6 +108,7 @@ typedef struct wb_inode {
after it arrived (i.e, those that have a
liability generation higher than itself)
*/
+ size_t size; /* Size of the file to catch write after EOF. */
gf_lock_t lock;
xlator_t *this;
} wb_inode_t;
@@ -120,10 +120,11 @@ typedef struct wb_request {
list_head_t lie; /* either in @liability or @temptation */
list_head_t winds;
list_head_t unwinds;
+ list_head_t wip;
call_stub_t *stub;
- size_t write_size; /* currently held size
+ ssize_t write_size; /* currently held size
(after collapsing) */
size_t orig_size; /* size which arrived with the request.
This is the size by which we grow
@@ -205,6 +206,26 @@ out:
}
+gf_boolean_t
+wb_fd_err (fd_t *fd, xlator_t *this, int32_t *op_errno)
+{
+ gf_boolean_t err = _gf_false;
+ uint64_t value = 0;
+ int32_t tmp = 0;
+
+ if (fd_ctx_get (fd, this, &value) == 0) {
+ if (op_errno) {
+ tmp = value;
+ *op_errno = tmp;
+ }
+
+ err = _gf_true;
+ }
+
+ return err;
+}
+
+
/*
Below is a succinct explanation of the code deciding whether two regions
overlap, from Pavan <tcp@gluster.com>.
@@ -302,6 +323,30 @@ wb_liability_has_conflict (wb_inode_t *wb_inode, wb_request_t *req)
}
+gf_boolean_t
+wb_wip_has_conflict (wb_inode_t *wb_inode, wb_request_t *req)
+{
+ wb_request_t *each = NULL;
+
+ if (req->stub->fop != GF_FOP_WRITE)
+ /* non-writes fundamentally never conflict with WIP requests */
+ return _gf_false;
+
+ list_for_each_entry (each, &wb_inode->wip, wip) {
+ if (each == req)
+ /* request never conflicts with itself,
+ though this condition should never occur.
+ */
+ continue;
+
+ if (wb_requests_overlap (each, req))
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
+
static int
__wb_request_unref (wb_request_t *req)
{
@@ -320,6 +365,7 @@ __wb_request_unref (wb_request_t *req)
if (req->refcount == 0) {
list_del_init (&req->todo);
list_del_init (&req->lie);
+ list_del_init (&req->wip);
list_del_init (&req->all);
if (list_empty (&wb_inode->all)) {
@@ -425,6 +471,7 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
INIT_LIST_HEAD (&req->lie);
INIT_LIST_HEAD (&req->winds);
INIT_LIST_HEAD (&req->unwinds);
+ INIT_LIST_HEAD (&req->wip);
req->stub = stub;
req->wb_inode = wb_inode;
@@ -432,8 +479,8 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
req->ordering.tempted = tempted;
if (stub->fop == GF_FOP_WRITE) {
- req->write_size = iov_length (stub->args.writev.vector,
- stub->args.writev.count);
+ req->write_size = iov_length (stub->args.vector,
+ stub->args.count);
/* req->write_size can change as we collapse
small writes. But the window needs to grow
@@ -449,7 +496,7 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
req->op_ret = req->write_size;
req->op_errno = 0;
- if (stub->args.writev.fd->flags & O_APPEND)
+ if (stub->args.fd->flags & O_APPEND)
req->ordering.append = 1;
}
@@ -457,28 +504,53 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted)
switch (stub->fop) {
case GF_FOP_WRITE:
- req->ordering.off = stub->args.writev.off;
- req->ordering.size = req->write_size;
+ LOCK (&wb_inode->lock);
+ {
+ if (wb_inode->size < stub->args.offset) {
+ req->ordering.off = wb_inode->size;
+ req->ordering.size = stub->args.offset
+ + req->write_size
+ - wb_inode->size;
+ } else {
+ req->ordering.off = stub->args.offset;
+ req->ordering.size = req->write_size;
+ }
- req->fd = fd_ref (stub->args.writev.fd);
+ if (wb_inode->size < stub->args.offset + req->write_size)
+ wb_inode->size = stub->args.offset
+ + req->write_size;
+ }
+ UNLOCK (&wb_inode->lock);
+
+ req->fd = fd_ref (stub->args.fd);
break;
case GF_FOP_READ:
- req->ordering.off = stub->args.readv.off;
- req->ordering.size = stub->args.readv.size;
+ req->ordering.off = stub->args.offset;
+ req->ordering.size = stub->args.size;
- req->fd = fd_ref (stub->args.readv.fd);
+ req->fd = fd_ref (stub->args.fd);
break;
case GF_FOP_TRUNCATE:
- req->ordering.off = stub->args.truncate.off;
+ req->ordering.off = stub->args.offset;
req->ordering.size = 0; /* till infinity */
+ LOCK (&wb_inode->lock);
+ {
+ wb_inode->size = req->ordering.off;
+ }
+ UNLOCK (&wb_inode->lock);
break;
case GF_FOP_FTRUNCATE:
- req->ordering.off = stub->args.ftruncate.off;
+ req->ordering.off = stub->args.offset;
req->ordering.size = 0; /* till infinity */
+ LOCK (&wb_inode->lock);
+ {
+ wb_inode->size = req->ordering.off;
+ }
+ UNLOCK (&wb_inode->lock);
- req->fd = fd_ref (stub->args.ftruncate.fd);
+ req->fd = fd_ref (stub->args.fd);
break;
default:
@@ -541,6 +613,7 @@ __wb_inode_create (xlator_t *this, inode_t *inode)
INIT_LIST_HEAD (&wb_inode->todo);
INIT_LIST_HEAD (&wb_inode->liability);
INIT_LIST_HEAD (&wb_inode->temptation);
+ INIT_LIST_HEAD (&wb_inode->wip);
wb_inode->this = this;
@@ -629,12 +702,25 @@ wb_head_done (wb_request_t *head)
void
-wb_inode_err (wb_inode_t *wb_inode, int op_errno)
+wb_fulfill_err (wb_request_t *head, int op_errno)
{
+ wb_inode_t *wb_inode;
+ wb_request_t *req;
+
+ wb_inode = head->wb_inode;
+
+ /* for all future requests yet to arrive */
+ fd_ctx_set (head->fd, THIS, op_errno);
+
LOCK (&wb_inode->lock);
{
- wb_inode->op_ret = -1;
- wb_inode->op_errno = op_errno;
+ /* for all requests already arrived */
+ list_for_each_entry (req, &wb_inode->all, all) {
+ if (req->fd != head->fd)
+ continue;
+ req->op_ret = -1;
+ req->op_errno = op_errno;
+ }
}
UNLOCK (&wb_inode->lock);
}
@@ -654,7 +740,7 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
wb_inode = head->wb_inode;
if (op_ret == -1) {
- wb_inode_err (wb_inode, op_errno);
+ wb_fulfill_err (head, op_errno);
} else if (op_ret < head->total_size) {
/*
* We've encountered a short write, for whatever reason.
@@ -664,7 +750,7 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
* TODO: Retry the write so we can potentially capture
* a real error condition (i.e., ENOSPC).
*/
- wb_inode_err (wb_inode, EIO);
+ wb_fulfill_err (head, EIO);
}
wb_head_done (head);
@@ -678,34 +764,48 @@ wb_fulfill_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
#define WB_IOV_LOAD(vec, cnt, req, head) do { \
- memcpy (&vec[cnt], req->stub->args.writev.vector, \
- (req->stub->args.writev.count * sizeof(vec[0]))); \
- cnt += req->stub->args.writev.count; \
+ memcpy (&vec[cnt], req->stub->args.vector, \
+ (req->stub->args.count * sizeof(vec[0]))); \
+ cnt += req->stub->args.count; \
head->total_size += req->write_size; \
} while (0)
-void
+int
wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)
{
- struct iovec vector[MAX_VECTOR_COUNT];
- int count = 0;
- wb_request_t *req = NULL;
- call_frame_t *frame = NULL;
+ struct iovec vector[MAX_VECTOR_COUNT];
+ int count = 0;
+ wb_request_t *req = NULL;
+ call_frame_t *frame = NULL;
+ gf_boolean_t fderr = _gf_false;
+ xlator_t *this = NULL;
- frame = create_frame (wb_inode->this, wb_inode->this->ctx->pool);
- if (!frame)
- goto enomem;
+ this = THIS;
+
+ /* make sure head->total_size is updated before we run into any
+ * errors
+ */
WB_IOV_LOAD (vector, count, head, head);
list_for_each_entry (req, &head->winds, winds) {
WB_IOV_LOAD (vector, count, req, head);
- iobref_merge (head->stub->args.writev.iobref,
- req->stub->args.writev.iobref);
+ if (iobref_merge (head->stub->args.iobref,
+ req->stub->args.iobref))
+ goto err;
}
+ if (wb_fd_err (head->fd, this, NULL)) {
+ fderr = _gf_true;
+ goto err;
+ }
+
+ frame = create_frame (wb_inode->this, wb_inode->this->ctx->pool);
+ if (!frame)
+ goto err;
+
frame->root->lk_owner = head->lk_owner;
frame->local = head;
@@ -718,32 +818,36 @@ wb_fulfill_head (wb_inode_t *wb_inode, wb_request_t *head)
STACK_WIND (frame, wb_fulfill_cbk, FIRST_CHILD (frame->this),
FIRST_CHILD (frame->this)->fops->writev,
head->fd, vector, count,
- head->stub->args.writev.off,
- head->stub->args.writev.flags,
- head->stub->args.writev.iobref, NULL);
+ head->stub->args.offset,
+ head->stub->args.flags,
+ head->stub->args.iobref, NULL);
- return;
-enomem:
- wb_inode_err (wb_inode, ENOMEM);
+ return 0;
+err:
+ if (!fderr) {
+ /* frame creation failure */
+ fderr = ENOMEM;
+ wb_fulfill_err (head, fderr);
+ }
wb_head_done (head);
- return;
+ return fderr;
}
#define NEXT_HEAD(head, req) do { \
if (head) \
- wb_fulfill_head (wb_inode, head); \
+ ret |= wb_fulfill_head (wb_inode, head); \
head = req; \
- expected_offset = req->stub->args.writev.off + \
+ expected_offset = req->stub->args.offset + \
req->write_size; \
curr_aggregate = 0; \
vector_count = 0; \
} while (0)
-void
+int
wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
{
wb_request_t *req = NULL;
@@ -753,6 +857,7 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
off_t expected_offset = 0;
size_t curr_aggregate = 0;
size_t vector_count = 0;
+ int ret = 0;
conf = wb_inode->this->private;
@@ -774,7 +879,7 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
continue;
}
- if (expected_offset != req->stub->args.writev.off) {
+ if (expected_offset != req->stub->args.offset) {
NEXT_HEAD (head, req);
continue;
}
@@ -784,7 +889,7 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
continue;
}
- if (vector_count + req->stub->args.writev.count >
+ if (vector_count + req->stub->args.count >
MAX_VECTOR_COUNT) {
NEXT_HEAD (head, req);
continue;
@@ -792,12 +897,13 @@ wb_fulfill (wb_inode_t *wb_inode, list_head_t *liabilities)
list_add_tail (&req->winds, &head->winds);
curr_aggregate += req->write_size;
- vector_count += req->stub->args.writev.count;
+ vector_count += req->stub->args.count;
}
if (head)
- wb_fulfill_head (wb_inode, head);
- return;
+ ret |= wb_fulfill_head (wb_inode, head);
+
+ return ret;
}
@@ -861,10 +967,20 @@ __wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req)
struct iobuf *iobuf = NULL;
struct iobref *iobref = NULL;
int ret = -1;
+ ssize_t required_size = 0;
+ size_t holder_len = 0;
+ size_t req_len = 0;
if (!holder->iobref) {
- /* TODO: check the required size */
- iobuf = iobuf_get (req->wb_inode->this->ctx->iobuf_pool);
+ holder_len = iov_length (holder->stub->args.vector,
+ holder->stub->args.count);
+ req_len = iov_length (req->stub->args.vector,
+ req->stub->args.count);
+
+ required_size = max ((THIS->ctx->page_size),
+ (holder_len + req_len));
+ iobuf = iobuf_get2 (req->wb_inode->this->ctx->iobuf_pool,
+ required_size);
if (iobuf == NULL) {
goto out;
}
@@ -877,33 +993,33 @@ __wb_collapse_small_writes (wb_request_t *holder, wb_request_t *req)
ret = iobref_add (iobref, iobuf);
if (ret != 0) {
- iobuf_unref (iobuf);
- iobref_unref (iobref);
gf_log (req->wb_inode->this->name, GF_LOG_WARNING,
"cannot add iobuf (%p) into iobref (%p)",
iobuf, iobref);
+ iobuf_unref (iobuf);
+ iobref_unref (iobref);
goto out;
}
- iov_unload (iobuf->ptr, holder->stub->args.writev.vector,
- holder->stub->args.writev.count);
- holder->stub->args.writev.vector[0].iov_base = iobuf->ptr;
- holder->stub->args.writev.count = 1;
+ iov_unload (iobuf->ptr, holder->stub->args.vector,
+ holder->stub->args.count);
+ holder->stub->args.vector[0].iov_base = iobuf->ptr;
+ holder->stub->args.count = 1;
- iobref_unref (holder->stub->args.writev.iobref);
- holder->stub->args.writev.iobref = iobref;
+ iobref_unref (holder->stub->args.iobref);
+ holder->stub->args.iobref = iobref;
iobuf_unref (iobuf);
holder->iobref = iobref_ref (iobref);
}
- ptr = holder->stub->args.writev.vector[0].iov_base + holder->write_size;
+ ptr = holder->stub->args.vector[0].iov_base + holder->write_size;
- iov_unload (ptr, req->stub->args.writev.vector,
- req->stub->args.writev.count);
+ iov_unload (ptr, req->stub->args.vector,
+ req->stub->args.count);
- holder->stub->args.writev.vector[0].iov_len += req->write_size;
+ holder->stub->args.vector[0].iov_len += req->write_size;
holder->write_size += req->write_size;
holder->ordering.size += req->write_size;
@@ -917,13 +1033,13 @@ void
__wb_preprocess_winds (wb_inode_t *wb_inode)
{
off_t offset_expected = 0;
- size_t space_left = 0;
+ ssize_t space_left = 0;
wb_request_t *req = NULL;
wb_request_t *tmp = NULL;
wb_request_t *holder = NULL;
wb_conf_t *conf = NULL;
int ret = 0;
- size_t page_size = 0;
+ ssize_t page_size = 0;
/* With asynchronous IO from a VM guest (as a file), there
can be two sequential writes happening in two regions
@@ -953,10 +1069,10 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)
continue;
}
- offset_expected = holder->stub->args.writev.off
+ offset_expected = holder->stub->args.offset
+ holder->write_size;
- if (req->stub->args.writev.off != offset_expected) {
+ if (req->stub->args.offset != offset_expected) {
holder->ordering.go = 1;
holder = req;
continue;
@@ -968,6 +1084,12 @@ __wb_preprocess_winds (wb_inode_t *wb_inode)
continue;
}
+ if (req->fd != holder->fd) {
+ holder->ordering.go = 1;
+ holder = req;
+ continue;
+ }
+
space_left = page_size - holder->write_size;
if (space_left < req->write_size) {
@@ -1022,6 +1144,18 @@ __wb_pick_winds (wb_inode_t *wb_inode, list_head_t *tasks,
/* wait some more */
continue;
+ if (req->stub->fop == GF_FOP_WRITE) {
+ if (wb_wip_has_conflict (wb_inode, req))
+ continue;
+
+ list_add_tail (&req->wip, &wb_inode->wip);
+
+ if (!req->ordering.tempted)
+ /* unrefed in wb_writev_cbk */
+ req->stub->frame->local =
+ __wb_request_ref (req);
+ }
+
list_del_init (&req->todo);
if (req->ordering.tempted)
@@ -1054,38 +1188,83 @@ wb_process_queue (wb_inode_t *wb_inode)
list_head_t tasks = {0, };
list_head_t lies = {0, };
list_head_t liabilities = {0, };
+ int retry = 0;
INIT_LIST_HEAD (&tasks);
INIT_LIST_HEAD (&lies);
INIT_LIST_HEAD (&liabilities);
- LOCK (&wb_inode->lock);
- {
- __wb_preprocess_winds (wb_inode);
+ do {
+ LOCK (&wb_inode->lock);
+ {
+ __wb_preprocess_winds (wb_inode);
- __wb_pick_winds (wb_inode, &tasks, &liabilities);
+ __wb_pick_winds (wb_inode, &tasks, &liabilities);
- __wb_pick_unwinds (wb_inode, &lies);
+ __wb_pick_unwinds (wb_inode, &lies);
- }
- UNLOCK (&wb_inode->lock);
+ }
+ UNLOCK (&wb_inode->lock);
- wb_do_unwinds (wb_inode, &lies);
+ wb_do_unwinds (wb_inode, &lies);
- wb_do_winds (wb_inode, &tasks);
+ wb_do_winds (wb_inode, &tasks);
- wb_fulfill (wb_inode, &liabilities);
+ /* fd might've been marked bad due to previous errors.
+ * Since, caller of wb_process_queue might be the last fop on
+ * inode, make sure we keep processing request queue, till there
+ * are no requests left.
+ */
+ retry = wb_fulfill (wb_inode, &liabilities);
+ } while (retry);
return;
}
+void
+wb_set_inode_size(wb_inode_t *wb_inode, struct iatt *postbuf)
+{
+ GF_ASSERT (wb_inode);
+ GF_ASSERT (postbuf);
+
+ LOCK (&wb_inode->lock);
+ {
+ wb_inode->size = postbuf->ia_size;
+ }
+ UNLOCK (&wb_inode->lock);
+}
+
+
+int
+wb_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *prebuf, struct iatt *postbuf, dict_t *xdata)
+{
+ wb_request_t *req = NULL;
+ wb_inode_t *wb_inode;
+
+ req = frame->local;
+ frame->local = NULL;
+ wb_inode = req->wb_inode;
+
+ wb_request_unref (req);
+
+ /* requests could be pending while this was in progress */
+ wb_process_queue(wb_inode);
+
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
+ return 0;
+}
+
+
int
wb_writev_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iovec *vector, int32_t count, off_t offset,
uint32_t flags, struct iobref *iobref, dict_t *xdata)
{
- STACK_WIND (frame, default_writev_cbk,
+ STACK_WIND (frame, wb_writev_cbk,
FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev,
fd, vector, count, offset, flags, iobref, xdata);
return 0;
@@ -1102,10 +1281,15 @@ wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
gf_boolean_t wb_disabled = 0;
call_stub_t *stub = NULL;
int ret = -1;
- int op_errno = EINVAL;
+ int32_t op_errno = EINVAL;
int o_direct = O_DIRECT;
conf = this->private;
+
+ if (wb_fd_err (fd, this, &op_errno)) {
+ goto unwind;
+ }
+
wb_inode = wb_inode_create (this, fd->inode);
if (!wb_inode) {
op_errno = ENOMEM;
@@ -1118,24 +1302,9 @@ wb_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
if (fd->flags & (O_SYNC|O_DSYNC|o_direct))
wb_disabled = 1;
- if (flags & (O_SYNC|O_DSYNC|O_DIRECT))
- /* O_DIRECT flag in params of writev must _always_ be honored */
+ if (flags & (O_SYNC|O_DSYNC|o_direct))
wb_disabled = 1;
- op_errno = 0;
- LOCK (&wb_inode->lock);
- {
- /* pick up a previous error in fulfillment */
- if (wb_inode->op_ret < 0)
- op_errno = wb_inode->op_errno;
-
- wb_inode->op_ret = 0;
- }
- UNLOCK (&wb_inode->lock);
-
- if (op_errno)
- goto unwind;
-
if (wb_disabled)
stub = fop_writev_stub (frame, wb_writev_helper, fd, vector,
count, offset, flags, iobref, xdata);
@@ -1233,7 +1402,7 @@ wb_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
wb_conf_t *conf = NULL;
wb_inode_t *wb_inode = NULL;
call_frame_t *bg_frame = NULL;
- int op_errno = 0;
+ int32_t op_errno = 0;
int op_ret = 0;
conf = this->private;
@@ -1245,19 +1414,10 @@ wb_flush_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
goto unwind;
}
- LOCK (&wb_inode->lock);
- {
- if (wb_inode->op_ret < 0) {
- op_ret = -1;
- op_errno = wb_inode->op_errno;
- }
-
- wb_inode->op_ret = 0;
- }
- UNLOCK (&wb_inode->lock);
-
- if (op_errno)
+ if (wb_fd_err (fd, this, &op_errno)) {
+ op_ret = -1;
goto unwind;
+ }
if (conf->flush_behind)
goto flushbehind;
@@ -1301,7 +1461,7 @@ wb_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
if (!wb_enqueue (wb_inode, stub))
goto unwind;
- wb_process_queue (wb_inode);
+ wb_process_queue (wb_inode);
return 0;
@@ -1334,6 +1494,10 @@ wb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
{
wb_inode_t *wb_inode = NULL;
call_stub_t *stub = NULL;
+ int32_t op_errno = EINVAL;
+
+ if (wb_fd_err (fd, this, &op_errno))
+ goto unwind;
wb_inode = wb_inode_ctx_get (this, fd->inode);
if (!wb_inode)
@@ -1351,7 +1515,7 @@ wb_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
return 0;
unwind:
- STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, NULL, NULL, NULL);
+ STACK_UNWIND_STRICT (fsync, frame, -1, op_errno, NULL, NULL, NULL);
return 0;
@@ -1452,11 +1616,29 @@ noqueue:
}
+int32_t
+wb_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ GF_ASSERT (frame->local);
+
+ if (op_ret == 0)
+ wb_set_inode_size (frame->local, postbuf);
+
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ return 0;
+}
+
+
int
wb_truncate_helper (call_frame_t *frame, xlator_t *this, loc_t *loc,
off_t offset, dict_t *xdata)
{
- STACK_WIND (frame, default_truncate_cbk, FIRST_CHILD(this),
+ STACK_WIND (frame, wb_truncate_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
return 0;
}
@@ -1473,6 +1655,8 @@ wb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
if (!wb_inode)
goto unwind;
+ frame->local = wb_inode;
+
stub = fop_truncate_stub (frame, wb_truncate_helper, loc,
offset, xdata);
if (!stub)
@@ -1495,11 +1679,29 @@ unwind:
}
+int32_t
+wb_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ GF_ASSERT (frame->local);
+
+ if (op_ret == 0)
+ wb_set_inode_size (frame->local, postbuf);
+
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ return 0;
+}
+
+
int
wb_ftruncate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd,
off_t offset, dict_t *xdata)
{
- STACK_WIND (frame, default_ftruncate_cbk, FIRST_CHILD(this),
+ STACK_WIND (frame, wb_ftruncate_cbk, FIRST_CHILD(this),
FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
return 0;
}
@@ -1511,25 +1713,39 @@ wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
{
wb_inode_t *wb_inode = NULL;
call_stub_t *stub = NULL;
+ int32_t op_errno = 0;
wb_inode = wb_inode_create (this, fd->inode);
- if (!wb_inode)
+ if (!wb_inode) {
+ op_errno = ENOMEM;
goto unwind;
+ }
+
+ if (wb_fd_err (fd, this, &op_errno))
+ goto unwind;
+
+ frame->local = wb_inode;
stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd,
offset, xdata);
- if (!stub)
+ if (!stub) {
+ op_errno = ENOMEM;
goto unwind;
+ }
- if (!wb_enqueue (wb_inode, stub))
+ if (!wb_enqueue (wb_inode, stub)) {
+ op_errno = ENOMEM;
goto unwind;
+ }
wb_process_queue (wb_inode);
return 0;
unwind:
- STACK_UNWIND_STRICT (ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL);
+ frame->local = NULL;
+
+ STACK_UNWIND_STRICT (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
if (stub)
call_stub_destroy (stub);
@@ -1629,6 +1845,81 @@ noqueue:
}
+int32_t
+wb_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+
+ wb_inode = wb_inode_create (this, fd->inode);
+ if (!wb_inode)
+ goto unwind;
+
+ if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC))
+ wb_inode->size = 0;
+
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->create, loc, flags, mode,
+ umask, fd, xdata);
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
+}
+
+
+int32_t
+wb_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
+{
+ wb_inode_t *wb_inode = NULL;
+
+ wb_inode = wb_inode_create (this, fd->inode);
+ if (!wb_inode)
+ goto unwind;
+
+ if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC))
+ wb_inode->size = 0;
+
+ STACK_WIND_TAIL (frame, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+ return 0;
+
+unwind:
+ STACK_UNWIND_STRICT (open, frame, -1, ENOMEM, NULL, NULL);
+ return 0;
+}
+
+
+int32_t
+wb_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *postparent)
+{
+ if (op_ret == 0) {
+ wb_inode_t *wb_inode = wb_inode_ctx_get (this, inode);
+ if (wb_inode)
+ wb_set_inode_size (wb_inode, buf);
+ }
+
+ STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf,
+ xdata, postparent);
+ return 0;
+}
+
+
+int32_t
+wb_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ dict_t *xdata)
+{
+ STACK_WIND (frame, wb_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, xdata);
+ return 0;
+}
+
+
int
wb_forget (xlator_t *this, inode_t *inode)
{
@@ -1642,13 +1933,22 @@ wb_forget (xlator_t *this, inode_t *inode)
if (!wb_inode)
return 0;
- LOCK (&wb_inode->lock);
- {
- GF_ASSERT (list_empty (&wb_inode->todo));
- GF_ASSERT (list_empty (&wb_inode->liability));
- GF_ASSERT (list_empty (&wb_inode->temptation));
- }
- UNLOCK (&wb_inode->lock);
+ GF_ASSERT (list_empty (&wb_inode->todo));
+ GF_ASSERT (list_empty (&wb_inode->liability));
+ GF_ASSERT (list_empty (&wb_inode->temptation));
+
+ GF_FREE (wb_inode);
+
+ return 0;
+}
+
+
+int
+wb_release (xlator_t *this, fd_t *fd)
+{
+ uint64_t tmp = 0;
+
+ fd_ctx_del (fd, this, &tmp);
return 0;
}
@@ -1709,7 +2009,7 @@ __wb_dump_requests (struct list_head *head, char *prefix)
req->write_size);
gf_proc_dump_write ("offset", "%"PRId64,
- req->stub->args.writev.off);
+ req->stub->args.offset);
flag = req->ordering.lied;
gf_proc_dump_write ("lied", "%d", flag);
@@ -1766,9 +2066,6 @@ wb_inode_dump (xlator_t *this, inode_t *inode)
gf_proc_dump_write ("window_current", "%"GF_PRI_SIZET,
wb_inode->window_current);
- gf_proc_dump_write ("op_ret", "%d", wb_inode->op_ret);
-
- gf_proc_dump_write ("op_errno", "%d", wb_inode->op_errno);
ret = TRY_LOCK (&wb_inode->lock);
if (!ret)
@@ -1819,7 +2116,7 @@ reconfigure (xlator_t *this, dict_t *options)
conf = this->private;
- GF_OPTION_RECONF ("cache-size", conf->window_size, options, size, out);
+ GF_OPTION_RECONF ("cache-size", conf->window_size, options, size_uint64, out);
GF_OPTION_RECONF ("flush-behind", conf->flush_behind, options, bool,
out);
@@ -1866,7 +2163,7 @@ init (xlator_t *this)
conf->aggregate_size = WB_AGGREGATE_SIZE;
/* configure 'option window-size <size>' */
- GF_OPTION_INIT ("cache-size", conf->window_size, size, out);
+ GF_OPTION_INIT ("cache-size", conf->window_size, size_uint64, out);
if (!conf->window_size && conf->aggregate_size) {
gf_log (this->name, GF_LOG_WARNING,
@@ -1941,6 +2238,7 @@ struct xlator_fops fops = {
struct xlator_cbks cbks = {
.forget = wb_forget,
+ .release = wb_release
};
@@ -1975,6 +2273,8 @@ struct volume_options options[] = {
{ .key = {"strict-O_DIRECT"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
+ .description = "This option when set to off, ignores the "
+ "O_DIRECT flag."
},
{ .key = {"strict-write-ordering"},
.type = GF_OPTION_TYPE_BOOL,
diff --git a/xlators/playground/Makefile.am b/xlators/playground/Makefile.am
new file mode 100644
index 000000000..e7de6b31a
--- /dev/null
+++ b/xlators/playground/Makefile.am
@@ -0,0 +1,2 @@
+SUBDIRS = template
+CLEANFILES =
diff --git a/xlators/playground/template/Makefile.am b/xlators/playground/template/Makefile.am
new file mode 100644
index 000000000..f26892443
--- /dev/null
+++ b/xlators/playground/template/Makefile.am
@@ -0,0 +1,2 @@
+SUBDIRS = src
+
diff --git a/xlators/playground/template/src/Makefile.am b/xlators/playground/template/src/Makefile.am
new file mode 100644
index 000000000..21f1c5f6b
--- /dev/null
+++ b/xlators/playground/template/src/Makefile.am
@@ -0,0 +1,16 @@
+xlator_LTLIBRARIES = template.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/testing/features
+
+template_la_LDFLAGS = -module -avoid-version
+
+template_la_SOURCES = template.c
+template_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+noinst_HEADERS = template.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src
+
+AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+CLEANFILES =
+
diff --git a/xlators/playground/template/src/template.c b/xlators/playground/template/src/template.c
new file mode 100644
index 000000000..37a7794a0
--- /dev/null
+++ b/xlators/playground/template/src/template.c
@@ -0,0 +1,49 @@
+/*
+ Copyright (c) 2006-2012 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "template.h"
+
+int32_t
+init (xlator_t *this)
+{
+
+ if (!this->children || this->children->next) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "not configured with exactly one child. exiting");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "dangling volume. check volfile ");
+ }
+
+ return 0;
+}
+
+void
+fini (xlator_t *this)
+{
+ return;
+}
+
+struct xlator_fops fops = {
+};
+
+struct xlator_cbks cbks = {
+};
+
+struct volume_options options[] = {
+ { .key = {NULL} },
+};
diff --git a/xlators/playground/template/src/template.h b/xlators/playground/template/src/template.h
new file mode 100644
index 000000000..d6aced501
--- /dev/null
+++ b/xlators/playground/template/src/template.h
@@ -0,0 +1,24 @@
+/*
+ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef __TEMPLATE_H__
+#define __TEMPLATE_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "defaults.h"
+
+#endif /* __TEMPLATE_H__ */
diff --git a/xlators/protocol/auth/addr/src/Makefile.am b/xlators/protocol/auth/addr/src/Makefile.am
index 1b88e0cf4..426e7c2fb 100644
--- a/xlators/protocol/auth/addr/src/Makefile.am
+++ b/xlators/protocol/auth/addr/src/Makefile.am
@@ -1,7 +1,7 @@
auth_LTLIBRARIES = addr.la
authdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/auth
-addr_la_LDFLAGS = -module -avoidversion
+addr_la_LDFLAGS = -module -avoid-version
addr_la_SOURCES = addr.c
addr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/protocol/auth/addr/src/addr.c b/xlators/protocol/auth/addr/src/addr.c
index 199fc6db0..181d091bd 100644
--- a/xlators/protocol/auth/addr/src/addr.c
+++ b/xlators/protocol/auth/addr/src/addr.c
@@ -2,19 +2,10 @@
Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -190,6 +181,7 @@ gf_auth (dict_t *input_params, dict_t *config_params)
addr_str = strtok_r (NULL, ADDR_DELIMITER, &tmp);
}
GF_FREE (addr_cpy);
+ addr_cpy = NULL;
}
if (allow_addr) {
diff --git a/xlators/protocol/auth/login/src/Makefile.am b/xlators/protocol/auth/login/src/Makefile.am
index 8a424ba18..d84db91c4 100644
--- a/xlators/protocol/auth/login/src/Makefile.am
+++ b/xlators/protocol/auth/login/src/Makefile.am
@@ -1,7 +1,7 @@
auth_LTLIBRARIES = login.la
authdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/auth
-login_la_LDFLAGS = -module -avoidversion
+login_la_LDFLAGS = -module -avoid-version
login_la_SOURCES = login.c
login_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
diff --git a/xlators/protocol/auth/login/src/login.c b/xlators/protocol/auth/login/src/login.c
index 702a876ac..c2f0bf0d0 100644
--- a/xlators/protocol/auth/login/src/login.c
+++ b/xlators/protocol/auth/login/src/login.c
@@ -2,19 +2,10 @@
Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
diff --git a/xlators/protocol/client/src/Makefile.am b/xlators/protocol/client/src/Makefile.am
index 7eed68929..cf89d42da 100644
--- a/xlators/protocol/client/src/Makefile.am
+++ b/xlators/protocol/client/src/Makefile.am
@@ -2,7 +2,7 @@
xlator_LTLIBRARIES = client.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/protocol
-client_la_LDFLAGS = -module -avoidversion
+client_la_LDFLAGS = -module -avoid-version
client_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
$(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \
diff --git a/xlators/protocol/client/src/client-handshake.c b/xlators/protocol/client/src/client-handshake.c
index f5f056c6a..85b0f757b 100644
--- a/xlators/protocol/client/src/client-handshake.c
+++ b/xlators/protocol/client/src/client-handshake.c
@@ -25,6 +25,7 @@
#include "portmap-xdr.h"
#include "rpc-common-xdr.h"
+#define CLIENT_REOPEN_MAX_ATTEMPTS 1024
extern rpc_clnt_prog_t clnt3_3_fop_prog;
extern rpc_clnt_prog_t clnt_pmap_prog;
@@ -52,7 +53,7 @@ rpc_client_ping_timer_expired (void *data)
rpc_clnt_connection_t *conn = NULL;
int disconnect = 0;
int transport_activity = 0;
- struct timeval timeout = {0, };
+ struct timespec timeout = {0, };
struct timeval current = {0, };
struct rpc_clnt *clnt = NULL;
xlator_t *this = NULL;
@@ -100,7 +101,7 @@ rpc_client_ping_timer_expired (void *data)
"ping timer expired but transport activity "
"detected - not bailing transport");
timeout.tv_sec = conf->opt.ping_timeout;
- timeout.tv_usec = 0;
+ timeout.tv_nsec = 0;
conn->ping_timer =
gf_timer_call_after (this->ctx, timeout,
@@ -139,7 +140,7 @@ client_start_ping (void *data)
clnt_conf_t *conf = NULL;
rpc_clnt_connection_t *conn = NULL;
int32_t ret = -1;
- struct timeval timeout = {0, };
+ struct timespec timeout = {0, };
call_frame_t *frame = NULL;
int frame_count = 0;
@@ -195,7 +196,7 @@ client_start_ping (void *data)
}
timeout.tv_sec = conf->opt.ping_timeout;
- timeout.tv_usec = 0;
+ timeout.tv_nsec = 0;
conn->ping_timer =
gf_timer_call_after (this->ctx, timeout,
@@ -240,7 +241,7 @@ client_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
{
xlator_t *this = NULL;
rpc_clnt_connection_t *conn = NULL;
- struct timeval timeout = {0, };
+ struct timespec timeout = {0, };
call_frame_t *frame = NULL;
clnt_conf_t *conf = NULL;
@@ -280,7 +281,7 @@ client_ping_cbk (struct rpc_req *req, struct iovec *iov, int count,
timeout.tv_sec = conf->opt.ping_timeout;
- timeout.tv_usec = 0;
+ timeout.tv_nsec = 0;
gf_timer_call_cancel (this->ctx,
conn->ping_timer);
@@ -415,9 +416,6 @@ clnt_fd_lk_reacquire_failed (xlator_t *this, clnt_fd_ctx_t *fdctx,
{
fdctx->remote_fd = -1;
fdctx->lk_heal_state = GF_LK_HEAL_DONE;
-
- list_add_tail (&fdctx->sfd_pos,
- &conf->saved_fds);
}
pthread_mutex_unlock (&conf->lock);
@@ -467,17 +465,23 @@ client_set_lk_version (xlator_t *this)
clnt_conf_t *conf = NULL;
call_frame_t *frame = NULL;
gf_set_lk_ver_req req = {0, };
+ char *process_uuid = NULL;
GF_VALIDATE_OR_GOTO ("client", this, err);
conf = (clnt_conf_t *) this->private;
req.lk_ver = client_get_lk_ver (conf);
- ret = gf_asprintf (&req.uid, "%s-%s-%d",
- this->ctx->process_uuid, this->name,
- this->graph->id);
- if (ret == -1)
+ ret = dict_get_str (this->options, "process-uuid", &process_uuid);
+ if (!process_uuid) {
+ ret = -1;
goto err;
+ }
+ req.uid = gf_strdup (process_uuid);
+ if (!req.uid) {
+ ret = -1;
+ goto err;
+ }
frame = create_frame (this, this->ctx->pool);
if (!frame) {
@@ -603,7 +607,7 @@ clnt_release_reopen_fd_cbk (struct rpc_req *req, struct iovec *iov,
clnt_fd_lk_reacquire_failed (this, fdctx, conf);
- decrement_reopen_fd_count (this, conf);
+ fdctx->reopen_done (fdctx, this);
frame->local = NULL;
STACK_DESTROY (frame->root);
@@ -633,10 +637,11 @@ clnt_release_reopen_fd (xlator_t *this, clnt_fd_ctx_t *fdctx)
clnt_release_reopen_fd_cbk, NULL,
NULL, 0, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_releasedir_req);
+ return 0;
out:
if (ret) {
- decrement_reopen_fd_count (this, conf);
clnt_fd_lk_reacquire_failed (this, fdctx, conf);
+ fdctx->reopen_done (fdctx, this);
if (frame) {
frame->local = NULL;
STACK_DESTROY (frame->root);
@@ -754,13 +759,10 @@ client_reacquire_lock_cbk (struct rpc_req *req, struct iovec *iov,
pthread_mutex_lock (&conf->lock);
{
fdctx->lk_heal_state = GF_LK_HEAL_DONE;
-
- list_add_tail (&fdctx->sfd_pos,
- &conf->saved_fds);
}
pthread_mutex_unlock (&conf->lock);
- decrement_reopen_fd_count (this, conf);
+ fdctx->reopen_done (fdctx, this);
}
ret = 0;
@@ -869,8 +871,7 @@ client_reacquire_lock (xlator_t *this, clnt_fd_ctx_t *fdctx)
if (client_fd_lk_list_empty (fdctx->lk_ctx, _gf_false)) {
gf_log (this->name, GF_LOG_DEBUG,
"fd lock list is empty");
- decrement_reopen_fd_count (this,
- (clnt_conf_t *)this->private);
+ fdctx->reopen_done (fdctx, this);
} else {
lk_ctx = fdctx->lk_ctx;
@@ -885,14 +886,66 @@ out:
return ret;
}
+void
+client_default_reopen_done (clnt_fd_ctx_t *fdctx, xlator_t *this)
+{
+ gf_log_callingfn (this->name, GF_LOG_WARNING,
+ "This function should never be called");
+}
+
+void
+client_reopen_done (clnt_fd_ctx_t *fdctx, xlator_t *this)
+{
+ clnt_conf_t *conf = NULL;
+ gf_boolean_t destroy = _gf_false;
+
+ conf = this->private;
+
+ pthread_mutex_lock (&conf->lock);
+ {
+ fdctx->reopen_attempts = 0;
+ if (!fdctx->released)
+ list_add_tail (&fdctx->sfd_pos, &conf->saved_fds);
+ else
+ destroy = _gf_true;
+ fdctx->reopen_done = client_default_reopen_done;
+ }
+ pthread_mutex_unlock (&conf->lock);
+
+ if (destroy)
+ client_fdctx_destroy (this, fdctx);
+}
+
+void
+client_child_up_reopen_done (clnt_fd_ctx_t *fdctx, xlator_t *this)
+{
+ clnt_conf_t *conf = NULL;
+ uint64_t fd_count = 0;
+
+ conf = this->private;
+
+ LOCK (&conf->rec_lock);
+ {
+ fd_count = --(conf->reopen_fd_count);
+ }
+ UNLOCK (&conf->rec_lock);
+
+ client_reopen_done (fdctx, this);
+ if (fd_count == 0) {
+ gf_log (this->name, GF_LOG_INFO,
+ "last fd open'd/lock-self-heal'd - notifying CHILD-UP");
+ client_set_lk_version (this);
+ client_notify_parents_child_up (this);
+ }
+}
+
int
client3_3_reopen_cbk (struct rpc_req *req, struct iovec *iov, int count,
void *myframe)
{
int32_t ret = -1;
gfs3_open_rsp rsp = {0,};
- int attempt_lock_recovery = _gf_false;
- uint64_t fd_count = 0;
+ gf_boolean_t attempt_lock_recovery = _gf_false;
clnt_local_t *local = NULL;
clnt_conf_t *conf = NULL;
clnt_fd_ctx_t *fdctx = NULL;
@@ -900,12 +953,10 @@ client3_3_reopen_cbk (struct rpc_req *req, struct iovec *iov, int count,
xlator_t *this = NULL;
frame = myframe;
- if (!frame || !frame->this)
- goto out;
-
this = frame->this;
+ conf = this->private;
local = frame->local;
- conf = frame->this->private;
+ fdctx = local->fdctx;
if (-1 == req->rpc_status) {
gf_log (frame->this->name, GF_LOG_WARNING,
@@ -938,32 +989,23 @@ client3_3_reopen_cbk (struct rpc_req *req, struct iovec *iov, int count,
goto out;
}
- fdctx = local->fdctx;
- if (!fdctx) {
- gf_log (frame->this->name, GF_LOG_WARNING, "fdctx not found");
- ret = -1;
- goto out;
- }
-
pthread_mutex_lock (&conf->lock);
{
fdctx->remote_fd = rsp.fd;
if (!fdctx->released) {
- if (!client_fd_lk_list_empty (fdctx->lk_ctx, _gf_false)) {
+ if (conf->lk_heal &&
+ !client_fd_lk_list_empty (fdctx->lk_ctx,
+ _gf_false)) {
attempt_lock_recovery = _gf_true;
fdctx->lk_heal_state = GF_LK_HEAL_IN_PROGRESS;
- } else {
- list_add_tail (&fdctx->sfd_pos,
- &conf->saved_fds);
}
- fdctx = NULL;
}
}
pthread_mutex_unlock (&conf->lock);
ret = 0;
- if (conf->lk_heal && attempt_lock_recovery) {
+ if (attempt_lock_recovery) {
/* Delay decrementing the reopen fd count untill all the
locks corresponding to this fd are acquired.*/
gf_log (this->name, GF_LOG_DEBUG, "acquiring locks "
@@ -973,23 +1015,15 @@ client3_3_reopen_cbk (struct rpc_req *req, struct iovec *iov, int count,
clnt_reacquire_lock_error (this, local->fdctx, conf);
gf_log (this->name, GF_LOG_WARNING, "acquiring locks "
"failed on %s", local->loc.path);
- ret = 0;
}
- } else {
- fd_count = decrement_reopen_fd_count (frame->this, conf);
}
out:
- if (fdctx) {
- clnt_release_reopen_fd (this, fdctx);
- } else if ((ret < 0) && frame && frame->this && conf) {
- decrement_reopen_fd_count (frame->this, conf);
- }
+ if (!attempt_lock_recovery)
+ fdctx->reopen_done (fdctx, this);
- if (frame) {
- frame->local = NULL;
- STACK_DESTROY (frame->root);
- }
+ frame->local = NULL;
+ STACK_DESTROY (frame->root);
client_local_wipe (local);
@@ -1008,11 +1042,10 @@ client3_3_reopendir_cbk (struct rpc_req *req, struct iovec *iov, int count,
call_frame_t *frame = NULL;
frame = myframe;
- if (!frame || !frame->this)
- goto out;
+ local = frame->local;
+ fdctx = local->fdctx;
+ conf = frame->this->private;
- local = frame->local;
- conf = frame->this->private;
if (-1 == req->rpc_status) {
gf_log (frame->this->name, GF_LOG_WARNING,
@@ -1045,46 +1078,24 @@ client3_3_reopendir_cbk (struct rpc_req *req, struct iovec *iov, int count,
goto out;
}
- fdctx = local->fdctx;
- if (!fdctx) {
- gf_log (frame->this->name, GF_LOG_WARNING, "fdctx not found");
- ret = -1;
- goto out;
- }
-
pthread_mutex_lock (&conf->lock);
{
fdctx->remote_fd = rsp.fd;
-
- if (!fdctx->released) {
- list_add_tail (&fdctx->sfd_pos, &conf->saved_fds);
- fdctx = NULL;
- }
}
pthread_mutex_unlock (&conf->lock);
- decrement_reopen_fd_count (frame->this, conf);
- ret = 0;
-
out:
- if (fdctx)
- client_fdctx_destroy (frame->this, fdctx);
-
- if ((ret < 0) && frame && frame->this && conf)
- decrement_reopen_fd_count (frame->this, conf);
-
- if (frame) {
- frame->local = NULL;
- STACK_DESTROY (frame->root);
- }
+ fdctx->reopen_done (fdctx, frame->this);
+ frame->local = NULL;
+ STACK_DESTROY (frame->root);
client_local_wipe (local);
return 0;
}
-int
-protocol_client_reopendir (xlator_t *this, clnt_fd_ctx_t *fdctx)
+static int
+protocol_client_reopendir (clnt_fd_ctx_t *fdctx, xlator_t *this)
{
int ret = -1;
gfs3_opendir_req req = {{0,},};
@@ -1092,9 +1103,6 @@ protocol_client_reopendir (xlator_t *this, clnt_fd_ctx_t *fdctx)
call_frame_t *frame = NULL;
clnt_conf_t *conf = NULL;
- if (!this || !fdctx)
- goto out;
-
conf = this->private;
local = mem_get0 (this->local_pool);
@@ -1120,7 +1128,7 @@ protocol_client_reopendir (xlator_t *this, clnt_fd_ctx_t *fdctx)
gf_log (frame->this->name, GF_LOG_DEBUG,
"attempting reopen on %s", local->loc.path);
- frame->local = local; local = NULL;
+ frame->local = local;
ret = client_submit_request (this, &req, frame, conf->fops,
GFS3_OP_OPENDIR,
@@ -1128,11 +1136,11 @@ protocol_client_reopendir (xlator_t *this, clnt_fd_ctx_t *fdctx)
NULL, 0, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_opendir_req);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to send the re-opendir request");
}
- return ret;
+ return 0;
out:
if (frame) {
@@ -1143,16 +1151,14 @@ out:
if (local)
client_local_wipe (local);
- if ((ret < 0) && this && conf) {
- decrement_reopen_fd_count (this, conf);
- }
+ fdctx->reopen_done (fdctx, this);
return 0;
}
-int
-protocol_client_reopen (xlator_t *this, clnt_fd_ctx_t *fdctx)
+static int
+protocol_client_reopenfile (clnt_fd_ctx_t *fdctx, xlator_t *this)
{
int ret = -1;
gfs3_open_req req = {{0,},};
@@ -1160,9 +1166,6 @@ protocol_client_reopen (xlator_t *this, clnt_fd_ctx_t *fdctx)
call_frame_t *frame = NULL;
clnt_conf_t *conf = NULL;
- if (!this || !fdctx)
- goto out;
-
conf = this->private;
frame = create_frame (this, this->ctx->pool);
@@ -1192,17 +1195,16 @@ protocol_client_reopen (xlator_t *this, clnt_fd_ctx_t *fdctx)
gf_log (frame->this->name, GF_LOG_DEBUG,
"attempting reopen on %s", local->loc.path);
- local = NULL;
ret = client_submit_request (this, &req, frame, conf->fops,
GFS3_OP_OPEN, client3_3_reopen_cbk, NULL,
NULL, 0, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_open_req);
if (ret) {
- gf_log (THIS->name, GF_LOG_ERROR,
+ gf_log (this->name, GF_LOG_ERROR,
"failed to send the re-open request");
}
- return ret;
+ return 0;
out:
if (frame) {
@@ -1213,14 +1215,65 @@ out:
if (local)
client_local_wipe (local);
- if ((ret < 0) && this && conf) {
- decrement_reopen_fd_count (this, conf);
- }
+ fdctx->reopen_done (fdctx, this);
return 0;
}
+static void
+protocol_client_reopen (clnt_fd_ctx_t *fdctx, xlator_t *this)
+{
+ if (fdctx->is_dir)
+ protocol_client_reopendir (fdctx, this);
+ else
+ protocol_client_reopenfile (fdctx, this);
+}
+
+gf_boolean_t
+__is_fd_reopen_in_progress (clnt_fd_ctx_t *fdctx)
+{
+ if (fdctx->reopen_done == client_default_reopen_done)
+ return _gf_false;
+ return _gf_true;
+}
+
+void
+client_attempt_reopen (fd_t *fd, xlator_t *this)
+{
+ clnt_conf_t *conf = NULL;
+ clnt_fd_ctx_t *fdctx = NULL;
+ gf_boolean_t reopen = _gf_false;
+
+ if (!fd || !this)
+ goto out;
+
+ conf = this->private;
+ pthread_mutex_lock (&conf->lock);
+ {
+ fdctx = this_fd_get_ctx (fd, this);
+ if (!fdctx)
+ goto unlock;
+ if (__is_fd_reopen_in_progress (fdctx))
+ goto unlock;
+ if (fdctx->remote_fd != -1)
+ goto unlock;
+
+ if (fdctx->reopen_attempts == CLIENT_REOPEN_MAX_ATTEMPTS) {
+ reopen = _gf_true;
+ fdctx->reopen_done = client_reopen_done;
+ list_del_init (&fdctx->sfd_pos);
+ } else {
+ fdctx->reopen_attempts++;
+ }
+ }
+unlock:
+ pthread_mutex_unlock (&conf->lock);
+ if (reopen)
+ protocol_client_reopen (fdctx, this);
+out:
+ return;
+}
int
client_post_handshake (call_frame_t *frame, xlator_t *this)
@@ -1245,6 +1298,7 @@ client_post_handshake (call_frame_t *frame, xlator_t *this)
if (fdctx->remote_fd != -1)
continue;
+ fdctx->reopen_done = client_child_up_reopen_done;
list_del_init (&fdctx->sfd_pos);
list_add_tail (&fdctx->sfd_pos, &reopen_head);
count++;
@@ -1263,10 +1317,7 @@ client_post_handshake (call_frame_t *frame, xlator_t *this)
list_for_each_entry_safe (fdctx, tmp, &reopen_head, sfd_pos) {
list_del_init (&fdctx->sfd_pos);
- if (fdctx->is_dir)
- protocol_client_reopendir (this, fdctx);
- else
- protocol_client_reopen (this, fdctx);
+ protocol_client_reopen (fdctx, this);
}
} else {
gf_log (this->name, GF_LOG_DEBUG,
@@ -1407,7 +1458,7 @@ client_setvolume_cbk (struct rpc_req *req, struct iovec *iov, int count, void *m
gf_log (this->name, GF_LOG_INFO,
"Connected to %s, attached to remote volume '%s'.",
- conf->rpc->conn.trans->peerinfo.identifier,
+ conf->rpc->conn.name,
remote_subvol);
rpc_clnt_set_connected (&conf->rpc->conn);
@@ -1430,6 +1481,7 @@ client_setvolume_cbk (struct rpc_req *req, struct iovec *iov, int count, void *m
gf_log (this->name, GF_LOG_INFO, "Server and Client "
"lk-version numbers are same, no need to "
"reopen the fds");
+ client_notify_parents_child_up (frame->this);
}
out:
@@ -1478,6 +1530,7 @@ client_setvolume (xlator_t *this, struct rpc_clnt *rpc)
char *process_uuid_xl = NULL;
clnt_conf_t *conf = NULL;
dict_t *options = NULL;
+ char counter_str[32] = {0};
options = this->options;
conf = this->private;
@@ -1503,13 +1556,24 @@ client_setvolume (xlator_t *this, struct rpc_clnt *rpc)
}
}
- /* With multiple graphs possible in the same process, we need a
+ /* When lock-heal is enabled:
+ * With multiple graphs possible in the same process, we need a
field to bring the uniqueness. Graph-ID should be enough to get the
- job done
+ job done.
+ * When lock-heal is disabled, connection-id should always be unique so
+ * that server never gets to reuse the previous connection resources
+ * so it cleans up the resources on every disconnect. Otherwise
+ * it may lead to stale resources, i.e. leaked file desciptors,
+ * inode/entry locks
*/
- ret = gf_asprintf (&process_uuid_xl, "%s-%s-%d",
+ if (!conf->lk_heal) {
+ snprintf (counter_str, sizeof (counter_str),
+ "-%"PRIu64, conf->setvol_count);
+ conf->setvol_count++;
+ }
+ ret = gf_asprintf (&process_uuid_xl, "%s-%s-%d%s",
this->ctx->process_uuid, this->name,
- this->graph->id);
+ this->graph->id, counter_str);
if (-1 == ret) {
gf_log (this->name, GF_LOG_ERROR,
"asprintf failed while setting process_uuid");
@@ -1691,12 +1755,15 @@ client_query_portmap_cbk (struct rpc_req *req, struct iovec *iov, int count, voi
ret = -1;
gf_log (this->name, ((!conf->portmap_err_logged) ?
GF_LOG_ERROR : GF_LOG_DEBUG),
- "failed to get the port number for remote subvolume");
+ "failed to get the port number for remote subvolume. "
+ "Please run 'gluster volume status' on server to see "
+ "if brick process is running.");
conf->portmap_err_logged = 1;
goto out;
}
conf->portmap_err_logged = 0;
+ conf->disconnect_err_logged = 0;
config.remote_port = rsp.port;
rpc_clnt_reconfig (conf->rpc, &config);
diff --git a/xlators/protocol/client/src/client-helpers.c b/xlators/protocol/client/src/client-helpers.c
index 77b416c47..5d9f00fdc 100644
--- a/xlators/protocol/client/src/client-helpers.c
+++ b/xlators/protocol/client/src/client-helpers.c
@@ -16,7 +16,6 @@
#include "client.h"
#include "fd.h"
-
int
client_fd_lk_list_empty (fd_lk_ctx_t *lk_ctx, gf_boolean_t try_lock)
{
@@ -276,3 +275,75 @@ clnt_readdir_rsp_cleanup (gfs3_readdir_rsp *rsp)
return 0;
}
+
+int
+client_get_remote_fd (xlator_t *this, fd_t *fd, int flags, int64_t *remote_fd)
+{
+ clnt_fd_ctx_t *fdctx = NULL;
+ clnt_conf_t *conf = NULL;
+
+ GF_VALIDATE_OR_GOTO (this->name, fd, out);
+ GF_VALIDATE_OR_GOTO (this->name, remote_fd, out);
+
+ conf = this->private;
+ pthread_mutex_lock (&conf->lock);
+ {
+ fdctx = this_fd_get_ctx (fd, this);
+ if (!fdctx)
+ *remote_fd = GF_ANON_FD_NO;
+ else if (__is_fd_reopen_in_progress (fdctx))
+ *remote_fd = -1;
+ else
+ *remote_fd = fdctx->remote_fd;
+ }
+ pthread_mutex_unlock (&conf->lock);
+
+ if ((flags & FALLBACK_TO_ANON_FD) && (*remote_fd == -1))
+ *remote_fd = GF_ANON_FD_NO;
+
+ return 0;
+out:
+ return -1;
+}
+
+gf_boolean_t
+client_is_reopen_needed (fd_t *fd, xlator_t *this, int64_t remote_fd)
+{
+ clnt_fd_ctx_t *fdctx = NULL;
+
+ fdctx = this_fd_get_ctx (fd, this);
+ if (fdctx && (fdctx->remote_fd == -1) &&
+ (remote_fd == GF_ANON_FD_NO))
+ return _gf_true;
+ return _gf_false;
+}
+
+int
+client_fd_fop_prepare_local (call_frame_t *frame, fd_t *fd, int64_t remote_fd)
+{
+ xlator_t *this = NULL;
+ clnt_conf_t *conf = NULL;
+ clnt_local_t *local = NULL;
+ int ret = 0;
+
+ this = frame->this;
+ conf = this->private;
+
+ if (!frame || !fd) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ frame->local = mem_get0 (this->local_pool);
+ if (frame->local == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ local = frame->local;
+ local->fd = fd_ref (fd);
+ local->attempt_reopen = client_is_reopen_needed (fd, this, remote_fd);
+ return 0;
+out:
+ return ret;
+}
diff --git a/xlators/protocol/client/src/client-lk.c b/xlators/protocol/client/src/client-lk.c
index 5611c7b83..b3c36a420 100644
--- a/xlators/protocol/client/src/client-lk.c
+++ b/xlators/protocol/client/src/client-lk.c
@@ -16,9 +16,6 @@
static void
__insert_and_merge (clnt_fd_ctx_t *fdctx, client_posix_lock_t *lock);
-static int
-client_send_recovery_lock (call_frame_t *frame, xlator_t *this,
- client_posix_lock_t *lock);
static void
__dump_client_lock (client_posix_lock_t *lock)
{
@@ -230,7 +227,7 @@ subtract_locks (client_posix_lock_t *big, client_posix_lock_t *small)
/* LOG-TODO : decide what more info is required here*/
gf_log ("client-protocol", GF_LOG_CRITICAL,
"Unexpected case in subtract_locks. Please send "
- "a bug report to gluster-devel@nongnu.org");
+ "a bug report to gluster-devel@gluster.org");
}
return v;
@@ -430,21 +427,6 @@ delete_granted_locks_fd (clnt_fd_ctx_t *fdctx)
return ret;
}
-static void
-client_mark_bad_fd (fd_t *fd, clnt_fd_ctx_t *fdctx)
-{
- xlator_t *this = NULL;
-
- this = THIS;
- if (fdctx)
- fdctx->remote_fd = -1;
-
- gf_log (this->name, GF_LOG_WARNING,
- "marking the file descriptor (%p) bad", fd);
-
- this_fd_set_ctx (fd, this, NULL, fdctx);
-}
-
int32_t
client_cmd_to_gf_cmd (int32_t cmd, int32_t *gf_cmd)
{
@@ -553,348 +535,6 @@ out:
}
-static int
-construct_reserve_unlock (struct gf_flock *lock, call_frame_t *frame,
- client_posix_lock_t *client_lock)
-{
- GF_ASSERT (lock);
- GF_ASSERT (frame);
-
- lock->l_type = F_UNLCK;
- lock->l_start = 0;
- lock->l_whence = SEEK_SET;
- lock->l_len = 0; /* Whole file */
- lock->l_pid = (uint64_t)(unsigned long)frame->root;
- lock->l_owner = client_lock->owner;
-
- frame->root->lk_owner = client_lock->owner;
-
- return 0;
-}
-
-static int
-construct_reserve_lock (client_posix_lock_t *client_lock, call_frame_t *frame,
- struct gf_flock *lock)
-{
- GF_ASSERT (client_lock);
-
- memcpy (lock, &(client_lock->user_flock), sizeof (struct gf_flock));
-
- frame->root->lk_owner = client_lock->owner;
-
- return 0;
-}
-
-uint64_t
-decrement_reopen_fd_count (xlator_t *this, clnt_conf_t *conf)
-{
- uint64_t fd_count = 0;
-
- LOCK (&conf->rec_lock);
- {
- fd_count = --(conf->reopen_fd_count);
- }
- UNLOCK (&conf->rec_lock);
-
- if (fd_count == 0) {
- gf_log (this->name, GF_LOG_INFO,
- "last fd open'd/lock-self-heal'd - notifying CHILD-UP");
- client_set_lk_version (this);
- client_notify_parents_child_up (this);
- }
-
- return fd_count;
-}
-
-int32_t
-client_remove_reserve_lock_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct gf_flock *lock, dict_t *xdata)
-{
- clnt_local_t *local = NULL;
- clnt_conf_t *conf = NULL;
-
- uint64_t fd_count = 0;
-
- local = frame->local;
- conf = this->private;
-
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "removing reserver lock on fd failed: %s",
- strerror(op_errno));
- goto cleanup;
- }
-
- gf_log (this->name, GF_LOG_DEBUG,
- "Removing reserve lock was successful.");
-
-cleanup:
- frame->local = NULL;
-
- client_mark_bad_fd (local->client_lock->fd, local->fdctx);
-
- destroy_client_lock (local->client_lock);
- client_local_wipe (local);
- STACK_DESTROY (frame->root);
-
- fd_count = decrement_reopen_fd_count (this, conf);
- gf_log (this->name, GF_LOG_TRACE,
- "Need to attempt lock recovery on %lld open fds",
- (unsigned long long) fd_count);
- return 0;
-}
-
-static void
-client_remove_reserve_lock (xlator_t *this, call_frame_t *frame,
- client_posix_lock_t *lock)
-{
- struct gf_flock unlock;
-
- construct_reserve_unlock (&unlock, frame, lock);
-
- STACK_WIND (frame, client_remove_reserve_lock_cbk,
- this, this->fops->lk,
- lock->fd, F_RESLK_UNLCK, &unlock, NULL);
-}
-
-static client_posix_lock_t *
-get_next_recovery_lock (xlator_t *this, clnt_local_t *local)
-{
- client_posix_lock_t *lock = NULL;
-
- pthread_mutex_lock (&local->mutex);
- {
- if (list_empty (&local->lock_list)) {
- gf_log (this->name, GF_LOG_DEBUG,
- "lock-list empty");
- goto unlock;
- }
-
- lock = list_entry ((local->lock_list).next, typeof (*lock), list);
- list_del_init (&lock->list);
- }
-unlock:
- pthread_mutex_unlock (&local->mutex);
-
- return lock;
-
-}
-
-int32_t
-client_reserve_lock_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct gf_flock *lock, dict_t *xdata)
-{
-
- clnt_local_t *local = NULL;
- clnt_conf_t *conf = NULL;
-
- uint64_t fd_count = 0;
-
- local = frame->local;
- conf = this->private;
-
- /* Got the reserve lock. Check if lock is grantable and proceed
- with the real lk call */
-
- if (op_ret >= 0) {
- /* Lock is grantable if flock reflects a successful getlk() call*/
- if (lock->l_type == F_UNLCK && lock->l_pid) {
- gf_log (this->name, GF_LOG_INFO,
- "Got the reservelk, but the lock is not grantable. ");
- client_remove_reserve_lock (this, frame, local->client_lock);
- goto out;
- }
-
- gf_log (this->name, GF_LOG_DEBUG, "reserve lock succeeded");
- client_send_recovery_lock (frame, this, local->client_lock);
- goto out;
- }
-
- /* Somebody else has a reserve lk. Lock conflict detected.
- Mark fd as bad */
-
- gf_log (this->name, GF_LOG_WARNING,
- "reservelk OP failed. aborting lock recovery");
-
- client_mark_bad_fd (local->client_lock->fd,
- local->fdctx);
- destroy_client_lock (local->client_lock);
- frame->local = NULL;
- client_local_wipe (local);
- STACK_DESTROY (frame->root);
-
- fd_count = decrement_reopen_fd_count (this, conf);
- gf_log (this->name, GF_LOG_DEBUG,
- "need to attempt lock recovery on %"PRIu64" open fds",
- fd_count);
-
-out:
- return 0;
-}
-
-int32_t
-client_recovery_lock_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- struct gf_flock *lock, dict_t *xdata)
-{
- clnt_local_t *local = NULL;
- clnt_fd_ctx_t *fdctx = NULL;
- clnt_conf_t *conf = NULL;
- client_posix_lock_t *next_lock = NULL;
-
- struct gf_flock reserve_flock;
- uint64_t fd_count = 0;
-
- local = frame->local;
- conf = this->private;
-
- if (op_ret < 0) {
- gf_log (this->name, GF_LOG_ERROR,
- "lock recovery failed: %s",
- strerror(op_errno));
-
- client_mark_bad_fd (local->client_lock->fd,
- local->fdctx);
- goto cleanup;
-
- /* Lock recovered. Continue with reserve lock for next lock */
- } else {
- gf_log (this->name, GF_LOG_DEBUG,
- "lock recovered successfully - continuing with next lock.");
-
- next_lock = get_next_recovery_lock (this, local);
- if (!next_lock) {
- gf_log (this->name, GF_LOG_DEBUG,
- "all locks recovered on fd");
- goto cleanup;
- }
-
- construct_reserve_lock (next_lock, frame, &reserve_flock);
- local->fdctx = fdctx;
- local->client_lock = next_lock;
-
- STACK_WIND (frame, client_reserve_lock_cbk,
- this, this->fops->lk,
- next_lock->fd, F_RESLK_LCK, &reserve_flock, NULL);
- goto out;
-
- }
-
-cleanup:
-
- frame->local = NULL;
- client_local_wipe (local);
-
- if (local->client_lock)
- destroy_client_lock (local->client_lock);
-
- STACK_DESTROY (frame->root);
-
- fd_count = decrement_reopen_fd_count (this, conf);
-
- gf_log (this->name, GF_LOG_DEBUG,
- "need to attempt lock recovery on %"PRIu64" open fds",
- fd_count);
-
-out:
- return 0;
-}
-
-static int
-client_send_recovery_lock (call_frame_t *frame, xlator_t *this,
- client_posix_lock_t *lock)
-{
- frame->root->lk_owner = lock->owner;
-
- /* Send all locks as F_SETLK to prevent the frame
- from blocking if there is a conflict */
-
- STACK_WIND (frame, client_recovery_lock_cbk,
- this, this->fops->lk,
- lock->fd, F_SETLK,
- &(lock->user_flock), NULL);
-
- return 0;
-}
-
-static int
-client_lockrec_init (clnt_fd_ctx_t *fdctx, clnt_local_t *local)
-{
-
- INIT_LIST_HEAD (&local->lock_list);
- pthread_mutex_init (&local->mutex, NULL);
-
- pthread_mutex_lock (&fdctx->mutex);
- {
- list_splice_init (&fdctx->lock_list, &local->lock_list);
- }
- pthread_mutex_unlock (&fdctx->mutex);
-
- return 0;
-}
-
-
-int
-client_attempt_lock_recovery (xlator_t *this, clnt_fd_ctx_t *fdctx)
-{
- call_frame_t *frame = NULL;
- clnt_local_t *local = NULL;
- client_posix_lock_t *lock = NULL;
-
- struct gf_flock reserve_flock;
- int ret = 0;
-
- local = mem_get0 (this->local_pool);
- if (!local) {
- ret = -ENOMEM;
- goto out;
- }
-
- client_lockrec_init (fdctx, local);
-
- lock = get_next_recovery_lock (this, local);
- if (!lock) {
- gf_log (this->name, GF_LOG_DEBUG,
- "no locks found on fd");
- ret = -1;
- goto out;
- }
-
- frame = create_frame (this, this->ctx->pool);
- if (!frame) {
- gf_log (this->name, GF_LOG_ERROR,
- "creating of frame failed, lock recovery failed");
- ret = -1;
- goto out;
- }
-
- construct_reserve_lock (lock, frame, &reserve_flock);
-
- frame->local = local;
- local->fdctx = fdctx;
- local->client_lock = lock;
-
- STACK_WIND (frame, client_reserve_lock_cbk,
- this, this->fops->lk,
- lock->fd, F_RESLK_LCK, &reserve_flock, NULL);
-
-out:
- return ret;
-
-
-}
-
int32_t
client_dump_locks (char *name, inode_t *inode,
dict_t *dict)
diff --git a/xlators/protocol/client/src/client-rpc-fops.c b/xlators/protocol/client/src/client-rpc-fops.c
index d5114e073..55d62dd39 100644
--- a/xlators/protocol/client/src/client-rpc-fops.c
+++ b/xlators/protocol/client/src/client-rpc-fops.c
@@ -185,8 +185,7 @@ out:
gf_log (this->name, GF_LOG_WARNING,
"remote operation failed: %s. Path: (%s to %s)",
strerror (gf_error_to_errno (rsp.op_errno)),
- (local) ? local->loc.path : "--",
- (local) ? local->loc2.path: "--");
+ local->loc.path, local->loc2.path);
}
CLIENT_STACK_UNWIND (symlink, frame, rsp.op_ret,
@@ -252,11 +251,9 @@ client3_3_mknod_cbk (struct rpc_req *req, struct iovec *iov, int count,
out:
if (rsp.op_ret == -1) {
gf_log (this->name, GF_LOG_WARNING,
- "remote operation failed: %s. Path: %s (%s)",
+ "remote operation failed: %s. Path: %s",
strerror (gf_error_to_errno (rsp.op_errno)),
- (local) ? local->loc.path : "--",
- (local && local->loc.inode) ?
- uuid_utoa (local->loc.inode->gfid) : "--");
+ local->loc.path);
}
CLIENT_STACK_UNWIND (mknod, frame, rsp.op_ret,
@@ -320,11 +317,9 @@ client3_3_mkdir_cbk (struct rpc_req *req, struct iovec *iov, int count,
out:
if (rsp.op_ret == -1) {
gf_log (this->name, GF_LOG_WARNING,
- "remote operation failed: %s. Path: %s (%s)",
+ "remote operation failed: %s. Path: %s",
strerror (gf_error_to_errno (rsp.op_errno)),
- (local) ? local->loc.path : "--",
- (local && local->loc.inode) ?
- uuid_utoa (local->loc.inode->gfid) : "--");
+ local->loc.path);
}
CLIENT_STACK_UNWIND (mkdir, frame, rsp.op_ret,
@@ -396,6 +391,7 @@ client_add_fd_to_saved_fds (xlator_t *this, fd_t *fd, loc_t *loc, int32_t flags,
fdctx->flags = flags;
fdctx->lk_ctx = fd_lk_ctx_ref (fd->lk_ctx);
fdctx->lk_heal_state = GF_LK_HEAL_DONE;
+ fdctx->reopen_done = client_default_reopen_done;
INIT_LIST_HEAD (&fdctx->sfd_pos);
INIT_LIST_HEAD (&fdctx->lock_list);
@@ -465,9 +461,7 @@ out:
gf_log (this->name, GF_LOG_WARNING,
"remote operation failed: %s. Path: %s (%s)",
strerror (gf_error_to_errno (rsp.op_errno)),
- (local) ? local->loc.path : "--",
- (local && local->loc.inode) ?
- uuid_utoa (local->loc.inode->gfid) : "--");
+ local->loc.path, loc_gfid_utoa (&local->loc));
}
CLIENT_STACK_UNWIND (open, frame, rsp.op_ret,
@@ -575,8 +569,9 @@ client3_3_readlink_cbk (struct rpc_req *req, struct iovec *iov, int count,
out:
if (rsp.op_ret == -1) {
- gf_log (this->name, GF_LOG_WARNING, "remote operation failed: %s",
- strerror (gf_error_to_errno (rsp.op_errno)));
+ gf_log (this->name, (gf_error_to_errno(rsp.op_errno) == ENOENT)?
+ GF_LOG_DEBUG:GF_LOG_WARNING, "remote operation failed:"
+ " %s", strerror (gf_error_to_errno (rsp.op_errno)));
}
CLIENT_STACK_UNWIND (readlink, frame, rsp.op_ret,
@@ -834,11 +829,13 @@ client3_3_writev_cbk (struct rpc_req *req, struct iovec *iov, int count,
int ret = 0;
xlator_t *this = NULL;
dict_t *xdata = NULL;
+ clnt_local_t *local = NULL;
this = THIS;
frame = myframe;
+ local = frame->local;
if (-1 == req->rpc_status) {
rsp.op_ret = -1;
@@ -867,6 +864,9 @@ out:
if (rsp.op_ret == -1) {
gf_log (this->name, GF_LOG_WARNING, "remote operation failed: %s",
strerror (gf_error_to_errno (rsp.op_errno)));
+ } else if (rsp.op_ret >= 0) {
+ if (local->attempt_reopen)
+ client_attempt_reopen (local->fd, this);
}
CLIENT_STACK_UNWIND (writev, frame, rsp.op_ret,
gf_error_to_errno (rsp.op_errno), &prestat,
@@ -1093,14 +1093,14 @@ client3_3_getxattr_cbk (struct rpc_req *req, struct iovec *iov, int count,
out:
if (rsp.op_ret == -1) {
- gf_log (this->name, (((op_errno == ENOTSUP) || (op_errno == ENODATA)) ?
+ gf_log (this->name, (((op_errno == ENOTSUP) ||
+ (op_errno == ENODATA) ||
+ (op_errno == ENOENT)) ?
GF_LOG_DEBUG : GF_LOG_WARNING),
"remote operation failed: %s. Path: %s (%s). Key: %s",
strerror (op_errno),
- (local) ? local->loc.path : "--",
- (local && local->loc.inode) ?
- uuid_utoa (local->loc.inode->gfid) : "--",
- (local) ? local->name : "(null)");
+ local->loc.path, loc_gfid_utoa (&local->loc),
+ (local->name) ? local->name : "(null)");
}
CLIENT_STACK_UNWIND (getxattr, frame, rsp.op_ret, op_errno, dict, xdata);
@@ -1543,16 +1543,17 @@ int
client3_3_finodelk_cbk (struct rpc_req *req, struct iovec *iov, int count,
void *myframe)
{
- call_frame_t *frame = NULL;
- gf_common_rsp rsp = {0,};
- int ret = 0;
- xlator_t *this = NULL;
- dict_t *xdata = NULL;
+ call_frame_t *frame = NULL;
+ gf_common_rsp rsp = {0,};
+ int ret = 0;
+ xlator_t *this = NULL;
+ dict_t *xdata = NULL;
+ clnt_local_t *local = NULL;
- this = THIS;
-
frame = myframe;
+ this = frame->this;
+ local = frame->local;
if (-1 == req->rpc_status) {
rsp.op_ret = -1;
@@ -1576,6 +1577,9 @@ out:
(EAGAIN != gf_error_to_errno (rsp.op_errno))) {
gf_log (this->name, GF_LOG_WARNING, "remote operation failed: %s",
strerror (gf_error_to_errno (rsp.op_errno)));
+ } else if (rsp.op_ret == 0) {
+ if (local->attempt_reopen)
+ client_attempt_reopen (local->fd, this);
}
CLIENT_STACK_UNWIND (finodelk, frame, rsp.op_ret,
gf_error_to_errno (rsp.op_errno), xdata);
@@ -1737,9 +1741,7 @@ out:
gf_log (this->name, GF_LOG_WARNING,
"remote operation failed: %s. Path: %s (%s)",
strerror (gf_error_to_errno (rsp.op_errno)),
- (local) ? local->loc.path : "--",
- (local && local->loc.inode) ?
- uuid_utoa (local->loc.inode->gfid) : "--");
+ local->loc.path, loc_gfid_utoa (&local->loc));
}
CLIENT_STACK_UNWIND (xattrop, frame, rsp.op_ret,
@@ -1807,6 +1809,9 @@ out:
gf_log (this->name, GF_LOG_WARNING,
"remote operation failed: %s",
strerror (gf_error_to_errno (op_errno)));
+ } else if (rsp.op_ret == 0) {
+ if (local->attempt_reopen)
+ client_attempt_reopen (local->fd, this);
}
CLIENT_STACK_UNWIND (fxattrop, frame, rsp.op_ret,
gf_error_to_errno (op_errno), dict, xdata);
@@ -1931,6 +1936,221 @@ out:
return 0;
}
+int
+client3_3_fallocate_cbk (struct rpc_req *req, struct iovec *iov, int count,
+ void *myframe)
+{
+ call_frame_t *frame = NULL;
+ gfs3_fallocate_rsp rsp = {0,};
+ struct iatt prestat = {0,};
+ struct iatt poststat = {0,};
+ int ret = 0;
+ xlator_t *this = NULL;
+ dict_t *xdata = NULL;
+
+
+ this = THIS;
+
+ frame = myframe;
+
+ if (-1 == req->rpc_status) {
+ rsp.op_ret = -1;
+ rsp.op_errno = ENOTCONN;
+ goto out;
+ }
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gfs3_fallocate_rsp);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed");
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ if (-1 != rsp.op_ret) {
+ gf_stat_to_iatt (&rsp.statpre, &prestat);
+ gf_stat_to_iatt (&rsp.statpost, &poststat);
+ }
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, xdata, (rsp.xdata.xdata_val),
+ (rsp.xdata.xdata_len), ret,
+ rsp.op_errno, out);
+
+out:
+ if (rsp.op_ret == -1) {
+ gf_log (this->name, GF_LOG_WARNING, "remote operation failed: %s",
+ strerror (gf_error_to_errno (rsp.op_errno)));
+ }
+ CLIENT_STACK_UNWIND (fallocate, frame, rsp.op_ret,
+ gf_error_to_errno (rsp.op_errno), &prestat,
+ &poststat, xdata);
+
+ free (rsp.xdata.xdata_val);
+
+ if (xdata)
+ dict_unref (xdata);
+
+ return 0;
+}
+
+int
+client3_3_discard_cbk(struct rpc_req *req, struct iovec *iov, int count,
+ void *myframe)
+{
+ call_frame_t *frame = NULL;
+ gfs3_discard_rsp rsp = {0,};
+ struct iatt prestat = {0,};
+ struct iatt poststat = {0,};
+ int ret = 0;
+ xlator_t *this = NULL;
+ dict_t *xdata = NULL;
+
+ this = THIS;
+
+ frame = myframe;
+
+ if (-1 == req->rpc_status) {
+ rsp.op_ret = -1;
+ rsp.op_errno = ENOTCONN;
+ goto out;
+ }
+ ret = xdr_to_generic(*iov, &rsp, (xdrproc_t) xdr_gfs3_discard_rsp);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed");
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ if (-1 != rsp.op_ret) {
+ gf_stat_to_iatt (&rsp.statpre, &prestat);
+ gf_stat_to_iatt (&rsp.statpost, &poststat);
+ }
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, xdata, (rsp.xdata.xdata_val),
+ (rsp.xdata.xdata_len), ret,
+ rsp.op_errno, out);
+
+out:
+ if (rsp.op_ret == -1) {
+ gf_log (this->name, GF_LOG_WARNING, "remote operation failed: %s",
+ strerror (gf_error_to_errno (rsp.op_errno)));
+ }
+ CLIENT_STACK_UNWIND (discard, frame, rsp.op_ret,
+ gf_error_to_errno (rsp.op_errno), &prestat,
+ &poststat, xdata);
+
+ free (rsp.xdata.xdata_val);
+
+ if (xdata)
+ dict_unref (xdata);
+
+ return 0;
+}
+
+int
+client3_3_zerofill_cbk(struct rpc_req *req, struct iovec *iov, int count,
+ void *myframe)
+{
+ call_frame_t *frame = NULL;
+ gfs3_zerofill_rsp rsp = {0,};
+ struct iatt prestat = {0,};
+ struct iatt poststat = {0,};
+ int ret = 0;
+ xlator_t *this = NULL;
+ dict_t *xdata = NULL;
+
+ this = THIS;
+
+ frame = myframe;
+
+ if (-1 == req->rpc_status) {
+ rsp.op_ret = -1;
+ rsp.op_errno = ENOTCONN;
+ goto out;
+ }
+ ret = xdr_to_generic(*iov, &rsp, (xdrproc_t) xdr_gfs3_zerofill_rsp);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed");
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ if (-1 != rsp.op_ret) {
+ gf_stat_to_iatt (&rsp.statpre, &prestat);
+ gf_stat_to_iatt (&rsp.statpost, &poststat);
+ }
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, xdata, (rsp.xdata.xdata_val),
+ (rsp.xdata.xdata_len), ret,
+ rsp.op_errno, out);
+
+out:
+ if (rsp.op_ret == -1) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "remote operation failed: %s",
+ strerror (gf_error_to_errno (rsp.op_errno)));
+ }
+ CLIENT_STACK_UNWIND (zerofill, frame, rsp.op_ret,
+ gf_error_to_errno (rsp.op_errno), &prestat,
+ &poststat, xdata);
+
+ free (rsp.xdata.xdata_val);
+
+ if (xdata)
+ dict_unref (xdata);
+
+ return 0;
+}
+
+int
+client3_3_ipc_cbk (struct rpc_req *req, struct iovec *iov, int count,
+ void *myframe)
+{
+ call_frame_t *frame = NULL;
+ gfs3_zerofill_rsp rsp = {0,};
+ int ret = 0;
+ xlator_t *this = NULL;
+ dict_t *xdata = NULL;
+
+ this = THIS;
+
+ frame = myframe;
+
+ if (-1 == req->rpc_status) {
+ rsp.op_ret = -1;
+ rsp.op_errno = ENOTCONN;
+ goto out;
+ }
+ ret = xdr_to_generic(*iov, &rsp, (xdrproc_t) xdr_gf_common_rsp);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "XDR decoding failed");
+ rsp.op_ret = -1;
+ rsp.op_errno = EINVAL;
+ goto out;
+ }
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (this, xdata, (rsp.xdata.xdata_val),
+ (rsp.xdata.xdata_len), ret,
+ rsp.op_errno, out);
+
+out:
+ if (rsp.op_ret == -1) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "remote operation failed: %s",
+ strerror (gf_error_to_errno (rsp.op_errno)));
+ }
+ CLIENT_STACK_UNWIND (ipc, frame,
+ rsp.op_ret, gf_error_to_errno (rsp.op_errno),
+ xdata);
+
+ free (rsp.xdata.xdata_val);
+
+ if (xdata)
+ dict_unref (xdata);
+
+ return 0;
+}
int
client3_3_setattr_cbk (struct rpc_req *req, struct iovec *iov, int count,
@@ -2048,11 +2268,9 @@ client3_3_create_cbk (struct rpc_req *req, struct iovec *iov, int count,
out:
if (rsp.op_ret == -1) {
gf_log (this->name, GF_LOG_WARNING,
- "remote operation failed: %s. Path: %s (%s)",
+ "remote operation failed: %s. Path: %s",
strerror (gf_error_to_errno (rsp.op_errno)),
- (local) ? local->loc.path : "--",
- (local && local->loc.inode) ?
- uuid_utoa (local->loc.inode->gfid) : "--");
+ local->loc.path);
}
CLIENT_STACK_UNWIND (create, frame, rsp.op_ret,
@@ -2447,10 +2665,7 @@ out:
gf_log (this->name, GF_LOG_WARNING,
"remote operation failed: %s (%s -> %s)",
strerror (gf_error_to_errno (rsp.op_errno)),
- (local) ? ((local->loc.path)? local->loc.path :
- uuid_utoa (local->loc.inode->gfid)) : "--",
- (local) ? ((local->loc2.path)? local->loc2.path :
- local->loc2.name) : "--");
+ local->loc.path, local->loc2.path);
}
CLIENT_STACK_UNWIND (link, frame, rsp.op_ret,
@@ -2520,9 +2735,7 @@ out:
gf_log (this->name, GF_LOG_WARNING,
"remote operation failed: %s. Path: %s (%s)",
strerror (gf_error_to_errno (rsp.op_errno)),
- (local) ? local->loc.path : "--",
- (local && local->loc.inode) ?
- uuid_utoa (local->loc.inode->gfid) : "--");
+ local->loc.path, loc_gfid_utoa (&local->loc));
}
CLIENT_STACK_UNWIND (opendir, frame, rsp.op_ret,
gf_error_to_errno (rsp.op_errno), fd, xdata);
@@ -2599,13 +2812,12 @@ out:
rsp.op_errno = op_errno;
if (rsp.op_ret == -1) {
/* any error other than ENOENT */
- if (rsp.op_errno != ENOENT)
+ if (!(local->loc.name && rsp.op_errno == ENOENT) &&
+ !(rsp.op_errno == ESTALE))
gf_log (this->name, GF_LOG_WARNING,
"remote operation failed: %s. Path: %s (%s)",
- strerror (rsp.op_errno),
- (local) ? local->loc.path : "--",
- (local && local->loc.inode) ?
- uuid_utoa (local->loc.inode->gfid) : "--");
+ strerror (rsp.op_errno), local->loc.path,
+ loc_gfid_utoa (&local->loc));
else
gf_log (this->name, GF_LOG_TRACE, "not found on remote node");
@@ -2679,6 +2891,9 @@ out:
gf_log (this->name, GF_LOG_WARNING,
"remote operation failed: %s",
strerror (gf_error_to_errno (rsp.op_errno)));
+ } else if (rsp.op_ret >= 0) {
+ if (local->attempt_reopen)
+ client_attempt_reopen (local->fd, this);
}
CLIENT_STACK_UNWIND (readv, frame, rsp.op_ret,
gf_error_to_errno (rsp.op_errno), vector, rspcount,
@@ -2758,7 +2973,7 @@ client_fdctx_destroy (xlator_t *this, clnt_fd_ctx_t *fdctx)
if (fdctx->is_dir) {
gfs3_releasedir_req req = {{0,},};
req.fd = fdctx->remote_fd;
- gf_log (this->name, GF_LOG_DEBUG, "sending releasedir on fd");
+ gf_log (this->name, GF_LOG_TRACE, "sending releasedir on fd");
client_submit_request (this, &req, fr, &clnt3_3_fop_prog,
GFS3_OP_RELEASEDIR,
client3_3_releasedir_cbk,
@@ -2767,7 +2982,7 @@ client_fdctx_destroy (xlator_t *this, clnt_fd_ctx_t *fdctx)
} else {
gfs3_release_req req = {{0,},};
req.fd = fdctx->remote_fd;
- gf_log (this->name, GF_LOG_DEBUG, "sending release on fd");
+ gf_log (this->name, GF_LOG_TRACE, "sending release on fd");
client_submit_request (this, &req, fr, &clnt3_3_fop_prog,
GFS3_OP_RELEASE,
client3_3_release_cbk, NULL,
@@ -2902,12 +3117,13 @@ client3_3_lookup (call_frame_t *frame, xlator_t *this,
op_errno = ENOMEM;
goto unwind;
}
+ frame->local = local;
if (!(args->loc && args->loc->inode))
goto unwind;
loc_copy (&local->loc, args->loc);
- frame->local = local;
+ loc_path (&local->loc, NULL);
if (args->loc->parent) {
if (!uuid_is_null (args->loc->parent->gfid))
@@ -3115,7 +3331,8 @@ client3_3_ftruncate (call_frame_t *frame, xlator_t *this,
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
req.offset = args->offset;
req.fd = remote_fd;
@@ -3422,6 +3639,7 @@ client3_3_symlink (call_frame_t *frame, xlator_t *this,
goto unwind;
loc_copy (&local->loc, args->loc);
+ loc_path (&local->loc, NULL);
if (!uuid_is_null (args->loc->parent->gfid))
memcpy (req.pargfid, args->loc->parent->gfid, 16);
@@ -3571,7 +3789,9 @@ client3_3_link (call_frame_t *frame, xlator_t *this,
}
loc_copy (&local->loc, args->oldloc);
+ loc_path (&local->loc, NULL);
loc_copy (&local->loc2, args->newloc);
+ loc_path (&local->loc2, NULL);
frame->local = local;
req.newbname = (char *)args->newloc->name;
@@ -3621,12 +3841,13 @@ client3_3_mknod (call_frame_t *frame, xlator_t *this,
op_errno = ENOMEM;
goto unwind;
}
+ frame->local = local;
if (!(args->loc && args->loc->parent))
goto unwind;
loc_copy (&local->loc, args->loc);
- frame->local = local;
+ loc_path (&local->loc, NULL);
if (!uuid_is_null (args->loc->parent->gfid))
memcpy (req.pargfid, args->loc->parent->gfid, 16);
@@ -3688,12 +3909,13 @@ client3_3_mkdir (call_frame_t *frame, xlator_t *this,
op_errno = ENOMEM;
goto unwind;
}
+ frame->local = local;
if (!(args->loc && args->loc->parent))
goto unwind;
loc_copy (&local->loc, args->loc);
- frame->local = local;
+ loc_path (&local->loc, NULL);
if (!uuid_is_null (args->loc->parent->gfid))
memcpy (req.pargfid, args->loc->parent->gfid, 16);
@@ -3754,6 +3976,8 @@ client3_3_create (call_frame_t *frame, xlator_t *this,
op_errno = ENOMEM;
goto unwind;
}
+ frame->local = local;
+
if (!(args->loc && args->loc->parent))
goto unwind;
@@ -3761,7 +3985,7 @@ client3_3_create (call_frame_t *frame, xlator_t *this,
local->flags = args->flags;
loc_copy (&local->loc, args->loc);
- frame->local = local;
+ loc_path (&local->loc, NULL);
if (!uuid_is_null (args->loc->parent->gfid))
memcpy (req.pargfid, args->loc->parent->gfid, 16);
@@ -3824,13 +4048,15 @@ client3_3_open (call_frame_t *frame, xlator_t *this,
op_errno = ENOMEM;
goto unwind;
}
+ frame->local = local;
+
if (!(args->loc && args->loc->inode))
goto unwind;
local->fd = fd_ref (args->fd);
local->flags = args->flags;
loc_copy (&local->loc, args->loc);
- frame->local = local;
+ loc_path (&local->loc, NULL);
if (!uuid_is_null (args->loc->inode->gfid))
memcpy (req.gfid, args->loc->inode->gfid, 16);
@@ -3875,13 +4101,13 @@ client3_3_readv (call_frame_t *frame, xlator_t *this,
clnt_args_t *args = NULL;
int64_t remote_fd = -1;
clnt_conf_t *conf = NULL;
+ clnt_local_t *local = NULL;
int op_errno = ESTALE;
gfs3_read_req req = {{0,},};
int ret = 0;
struct iovec rsp_vec = {0, };
struct iobuf *rsp_iobuf = NULL;
struct iobref *rsp_iobref = NULL;
- clnt_local_t *local = NULL;
if (!frame || !this || !data)
goto unwind;
@@ -3889,7 +4115,14 @@ client3_3_readv (call_frame_t *frame, xlator_t *this,
args = data;
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, FALLBACK_TO_ANON_FD,
+ remote_fd, op_errno, unwind);
+ ret = client_fd_fop_prepare_local (frame, args->fd, remote_fd);
+ if (ret) {
+ op_errno = -ret;
+ goto unwind;
+ }
+ local = frame->local;
req.size = args->size;
req.offset = args->offset;
@@ -3927,15 +4160,8 @@ client3_3_readv (call_frame_t *frame, xlator_t *this,
goto unwind;
}
- local = mem_get0 (this->local_pool);
- if (local == NULL) {
- op_errno = ENOMEM;
- goto unwind;
- }
-
local->iobref = rsp_iobref;
rsp_iobref = NULL;
- frame->local = local;
GF_PROTOCOL_DICT_SERIALIZE (this, args->xdata, (&req.xdata.xdata_val),
req.xdata.xdata_len, op_errno, unwind);
@@ -3946,17 +4172,12 @@ client3_3_readv (call_frame_t *frame, xlator_t *this,
local->iobref,
(xdrproc_t)xdr_gfs3_read_req);
if (ret) {
+ //unwind is done in the cbk
gf_log (this->name, GF_LOG_WARNING, "failed to send the fop");
}
GF_FREE (req.xdata.xdata_val);
- if (rsp_iobuf)
- iobuf_unref (rsp_iobuf);
-
- if (rsp_iobref)
- iobref_unref (rsp_iobref);
-
return 0;
unwind:
if (rsp_iobuf)
@@ -3980,7 +4201,7 @@ client3_3_writev (call_frame_t *frame, xlator_t *this, void *data)
clnt_conf_t *conf = NULL;
gfs3_write_req req = {{0,},};
int op_errno = ESTALE;
- int ret = 0;
+ int ret = 0;
if (!frame || !this || !data)
goto unwind;
@@ -3988,7 +4209,13 @@ client3_3_writev (call_frame_t *frame, xlator_t *this, void *data)
args = data;
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, FALLBACK_TO_ANON_FD,
+ remote_fd, op_errno, unwind);
+ ret = client_fd_fop_prepare_local (frame, args->fd, remote_fd);
+ if (ret) {
+ op_errno = -ret;
+ goto unwind;
+ }
req.size = args->size;
req.offset = args->offset;
@@ -4052,7 +4279,8 @@ client3_3_flush (call_frame_t *frame, xlator_t *this,
args = data;
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
conf = this->private;
@@ -4111,7 +4339,8 @@ client3_3_fsync (call_frame_t *frame, xlator_t *this,
args = data;
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
req.fd = remote_fd;
req.data = args->flags;
@@ -4159,7 +4388,8 @@ client3_3_fstat (call_frame_t *frame, xlator_t *this,
args = data;
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
req.fd = remote_fd;
memcpy (req.gfid, args->fd->inode->gfid, 16);
@@ -4209,12 +4439,14 @@ client3_3_opendir (call_frame_t *frame, xlator_t *this,
op_errno = ENOMEM;
goto unwind;
}
+ frame->local = local;
+
if (!(args->loc && args->loc->inode))
goto unwind;
local->fd = fd_ref (args->fd);
loc_copy (&local->loc, args->loc);
- frame->local = local;
+ loc_path (&local->loc, NULL);
if (!uuid_is_null (args->loc->inode->gfid))
memcpy (req.gfid, args->loc->inode->gfid, 16);
@@ -4268,7 +4500,8 @@ client3_3_fsyncdir (call_frame_t *frame, xlator_t *this, void *data)
args = data;
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
req.fd = remote_fd;
req.data = args->flags;
@@ -4437,7 +4670,8 @@ client3_3_fsetxattr (call_frame_t *frame, xlator_t *this,
args = data;
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
req.fd = remote_fd;
req.flags = args->flags;
@@ -4501,7 +4735,8 @@ client3_3_fgetxattr (call_frame_t *frame, xlator_t *this,
args = data;
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
local = mem_get0 (this->local_pool);
if (!local) {
@@ -4617,6 +4852,7 @@ client3_3_getxattr (call_frame_t *frame, xlator_t *this,
}
loc_copy (&local->loc, args->loc);
+ loc_path (&local->loc, NULL);
if (args->name)
local->name = gf_strdup (args->name);
@@ -4781,6 +5017,8 @@ client3_3_xattrop (call_frame_t *frame, xlator_t *this,
else
memcpy (req.gfid, args->loc->gfid, 16);
+ loc_copy (&local->loc, args->loc);
+ loc_path (&local->loc, NULL);
GF_ASSERT_AND_GOTO_WITH_ERROR (this->name,
!uuid_is_null (*((uuid_t*)req.gfid)),
unwind, op_errno, EINVAL);
@@ -4844,11 +5082,11 @@ client3_3_fxattrop (call_frame_t *frame, xlator_t *this,
clnt_args_t *args = NULL;
int64_t remote_fd = -1;
clnt_conf_t *conf = NULL;
+ clnt_local_t *local = NULL;
gfs3_fxattrop_req req = {{0,},};
int op_errno = ESTALE;
int ret = 0;
int count = 0;
- clnt_local_t *local = NULL;
struct iobref *rsp_iobref = NULL;
struct iobuf *rsp_iobuf = NULL;
struct iovec *rsphdr = NULL;
@@ -4860,19 +5098,20 @@ client3_3_fxattrop (call_frame_t *frame, xlator_t *this,
args = data;
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, FALLBACK_TO_ANON_FD,
+ remote_fd, op_errno, unwind);
+ ret = client_fd_fop_prepare_local (frame, args->fd, remote_fd);
+ if (ret) {
+ op_errno = -ret;
+ goto unwind;
+ }
+
+ local = frame->local;
req.fd = remote_fd;
req.flags = args->flags;
memcpy (req.gfid, args->fd->inode->gfid, 16);
- local = mem_get0 (this->local_pool);
- if (!local) {
- op_errno = ENOMEM;
- goto unwind;
- }
- frame->local = local;
-
rsp_iobref = iobref_new ();
if (rsp_iobref == NULL) {
op_errno = ENOMEM;
@@ -4920,12 +5159,6 @@ client3_3_fxattrop (call_frame_t *frame, xlator_t *this,
GF_FREE (req.xdata.xdata_val);
- if (rsp_iobuf)
- iobuf_unref (rsp_iobuf);
-
- if (rsp_iobref)
- iobref_unref (rsp_iobref);
-
return 0;
unwind:
CLIENT_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL);
@@ -5017,7 +5250,8 @@ client3_3_fremovexattr (call_frame_t *frame, xlator_t *this,
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
memcpy (req.gfid, args->fd->inode->gfid, 16);
req.name = (char *)args->name;
@@ -5069,8 +5303,10 @@ client3_3_lk (call_frame_t *frame, xlator_t *this,
op_errno = ENOMEM;
goto unwind;
}
+ frame->local = local;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
ret = client_cmd_to_gf_cmd (args->cmd, &gf_cmd);
if (ret) {
@@ -5095,7 +5331,6 @@ client3_3_lk (call_frame_t *frame, xlator_t *this,
local->owner = frame->root->lk_owner;
local->cmd = args->cmd;
local->fd = fd_ref (args->fd);
- frame->local = local;
req.fd = remote_fd;
req.cmd = gf_cmd;
@@ -5220,15 +5455,20 @@ client3_3_finodelk (call_frame_t *frame, xlator_t *this,
int64_t remote_fd = -1;
clnt_conf_t *conf = NULL;
int op_errno = ESTALE;
- int ret = 0;
+ int ret = 0;
if (!frame || !this || !data)
goto unwind;
args = data;
conf = this->private;
-
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, FALLBACK_TO_ANON_FD,
+ remote_fd, op_errno, unwind);
+ ret = client_fd_fop_prepare_local (frame, args->fd, remote_fd);
+ if (ret) {
+ op_errno = -ret;
+ goto unwind;
+ }
if (args->cmd == F_GETLK || args->cmd == F_GETLK64)
gf_cmd = GF_LK_GETLK;
@@ -5361,7 +5601,8 @@ client3_3_fentrylk (call_frame_t *frame, xlator_t *this,
args = data;
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
req.fd = remote_fd;
req.cmd = args->cmd_entrylk;
@@ -5414,7 +5655,8 @@ client3_3_rchecksum (call_frame_t *frame, xlator_t *this,
args = data;
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
req.len = args->len;
req.offset = args->offset;
@@ -5470,7 +5712,8 @@ client3_3_readdir (call_frame_t *frame, xlator_t *this,
args = data;
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
readdir_rsp_size = xdr_sizeof ((xdrproc_t) xdr_gfs3_readdir_rsp, &rsp)
+ args->size;
@@ -5579,7 +5822,8 @@ client3_3_readdirp (call_frame_t *frame, xlator_t *this,
args = data;
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
readdirp_rsp_size = xdr_sizeof ((xdrproc_t) xdr_gfs3_readdirp_rsp, &rsp)
+ args->size;
@@ -5732,7 +5976,8 @@ client3_3_fsetattr (call_frame_t *frame, xlator_t *this, void *data)
args = data;
conf = this->private;
- CLIENT_GET_REMOTE_FD(conf, args->fd, remote_fd, op_errno, unwind);
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
req.fd = remote_fd;
req.valid = args->valid;
@@ -5760,7 +6005,181 @@ unwind:
return 0;
}
+int32_t
+client3_3_fallocate(call_frame_t *frame, xlator_t *this, void *data)
+{
+ clnt_args_t *args = NULL;
+ int64_t remote_fd = -1;
+ clnt_conf_t *conf = NULL;
+ gfs3_fallocate_req req = {{0},};
+ int op_errno = ESTALE;
+ int ret = 0;
+
+ if (!frame || !this || !data)
+ goto unwind;
+
+ args = data;
+ conf = this->private;
+
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
+
+ req.fd = remote_fd;
+ req.flags = args->flags;
+ req.offset = args->offset;
+ req.size = args->size;
+ memcpy(req.gfid, args->fd->inode->gfid, 16);
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, args->xdata, (&req.xdata.xdata_val),
+ req.xdata.xdata_len, op_errno, unwind);
+
+ ret = client_submit_request (this, &req, frame, conf->fops,
+ GFS3_OP_FALLOCATE,
+ client3_3_fallocate_cbk, NULL,
+ NULL, 0, NULL, 0,
+ NULL, (xdrproc_t)xdr_gfs3_fallocate_req);
+ if (ret) {
+ gf_log (this->name, GF_LOG_WARNING, "failed to send the fop");
+ }
+
+ GF_FREE (req.xdata.xdata_val);
+
+ return 0;
+unwind:
+ CLIENT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ GF_FREE (req.xdata.xdata_val);
+
+ return 0;
+}
+
+int32_t
+client3_3_discard(call_frame_t *frame, xlator_t *this, void *data)
+{
+ clnt_args_t *args = NULL;
+ int64_t remote_fd = -1;
+ clnt_conf_t *conf = NULL;
+ gfs3_discard_req req = {{0},};
+ int op_errno = ESTALE;
+ int ret = 0;
+
+ if (!frame || !this || !data)
+ goto unwind;
+ args = data;
+ conf = this->private;
+
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
+
+ req.fd = remote_fd;
+ req.offset = args->offset;
+ req.size = args->size;
+ memcpy(req.gfid, args->fd->inode->gfid, 16);
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, args->xdata, (&req.xdata.xdata_val),
+ req.xdata.xdata_len, op_errno, unwind);
+
+ ret = client_submit_request(this, &req, frame, conf->fops,
+ GFS3_OP_DISCARD, client3_3_discard_cbk,
+ NULL, NULL, 0, NULL, 0, NULL,
+ (xdrproc_t) xdr_gfs3_discard_req);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "failed to send the fop");
+
+ GF_FREE (req.xdata.xdata_val);
+
+ return 0;
+unwind:
+ CLIENT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+ GF_FREE (req.xdata.xdata_val);
+
+ return 0;
+}
+
+int32_t
+client3_3_zerofill(call_frame_t *frame, xlator_t *this, void *data)
+{
+ clnt_args_t *args = NULL;
+ int64_t remote_fd = -1;
+ clnt_conf_t *conf = NULL;
+ gfs3_zerofill_req req = {{0},};
+ int op_errno = ESTALE;
+ int ret = 0;
+
+ GF_ASSERT (frame);
+
+ if (!this || !data)
+ goto unwind;
+
+ args = data;
+ conf = this->private;
+
+ CLIENT_GET_REMOTE_FD (this, args->fd, DEFAULT_REMOTE_FD,
+ remote_fd, op_errno, unwind);
+
+ req.fd = remote_fd;
+ req.offset = args->offset;
+ req.size = args->size;
+ memcpy(req.gfid, args->fd->inode->gfid, 16);
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, args->xdata, (&req.xdata.xdata_val),
+ req.xdata.xdata_len, op_errno, unwind);
+
+ ret = client_submit_request(this, &req, frame, conf->fops,
+ GFS3_OP_ZEROFILL, client3_3_zerofill_cbk,
+ NULL, NULL, 0, NULL, 0, NULL,
+ (xdrproc_t) xdr_gfs3_zerofill_req);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "failed to send the fop");
+
+ GF_FREE (req.xdata.xdata_val);
+
+ return 0;
+unwind:
+ CLIENT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ GF_FREE (req.xdata.xdata_val);
+
+ return 0;
+}
+
+int32_t
+client3_3_ipc (call_frame_t *frame, xlator_t *this, void *data)
+{
+ clnt_args_t *args = NULL;
+ clnt_conf_t *conf = NULL;
+ gfs3_ipc_req req = {0,};
+ int op_errno = ESTALE;
+ int ret = 0;
+
+ GF_ASSERT (frame);
+
+ if (!this || !data)
+ goto unwind;
+
+ args = data;
+ conf = this->private;
+
+ req.op = args->cmd;
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, args->xdata, (&req.xdata.xdata_val),
+ req.xdata.xdata_len, op_errno, unwind);
+
+ ret = client_submit_request(this, &req, frame, conf->fops,
+ GFS3_OP_IPC, client3_3_ipc_cbk,
+ NULL, NULL, 0, NULL, 0, NULL,
+ (xdrproc_t) xdr_gfs3_ipc_req);
+ if (ret)
+ gf_log (this->name, GF_LOG_WARNING, "failed to send the fop");
+
+ GF_FREE (req.xdata.xdata_val);
+
+ return 0;
+unwind:
+ CLIENT_STACK_UNWIND(ipc, frame, -1, op_errno, NULL);
+ GF_FREE (req.xdata.xdata_val);
+
+ return 0;
+}
/* Table Specific to FOPS */
@@ -5807,10 +6226,14 @@ rpc_clnt_procedure_t clnt3_3_fop_actors[GF_FOP_MAXVALUE] = {
[GF_FOP_SETATTR] = { "SETATTR", client3_3_setattr },
[GF_FOP_FSETATTR] = { "FSETATTR", client3_3_fsetattr },
[GF_FOP_READDIRP] = { "READDIRP", client3_3_readdirp },
+ [GF_FOP_FALLOCATE] = { "FALLOCATE", client3_3_fallocate },
+ [GF_FOP_DISCARD] = { "DISCARD", client3_3_discard },
+ [GF_FOP_ZEROFILL] = { "ZEROFILL", client3_3_zerofill},
[GF_FOP_RELEASE] = { "RELEASE", client3_3_release },
[GF_FOP_RELEASEDIR] = { "RELEASEDIR", client3_3_releasedir },
[GF_FOP_GETSPEC] = { "GETSPEC", client3_getspec },
- [GF_FOP_FREMOVEXATTR] = { "FREMOVEXATTR", client3_3_fremovexattr },
+ [GF_FOP_FREMOVEXATTR]= { "FREMOVEXATTR",client3_3_fremovexattr },
+ [GF_FOP_IPC] = { "IPC", client3_3_ipc },
};
/* Used From RPC-CLNT library to log proper name of procedure based on number */
@@ -5859,6 +6282,11 @@ char *clnt3_3_fop_names[GFS3_OP_MAXVALUE] = {
[GFS3_OP_RELEASE] = "RELEASE",
[GFS3_OP_RELEASEDIR] = "RELEASEDIR",
[GFS3_OP_FREMOVEXATTR] = "FREMOVEXATTR",
+ [GFS3_OP_FALLOCATE] = "FALLOCATE",
+ [GFS3_OP_DISCARD] = "DISCARD",
+ [GFS3_OP_ZEROFILL] = "ZEROFILL",
+ [GFS3_OP_IPC] = "IPC",
+
};
rpc_clnt_prog_t clnt3_3_fop_prog = {
diff --git a/xlators/protocol/client/src/client.c b/xlators/protocol/client/src/client.c
index 6cc3c9896..f1415899e 100644
--- a/xlators/protocol/client/src/client.c
+++ b/xlators/protocol/client/src/client.c
@@ -21,6 +21,7 @@
#include "statedump.h"
#include "compat-errno.h"
+#include "xdr-rpc.h"
#include "glusterfs3.h"
extern rpc_clnt_prog_t clnt_handshake_prog;
@@ -130,7 +131,7 @@ client_register_grace_timer (xlator_t *this, clnt_conf_t *conf)
conf->grace_timer =
gf_timer_call_after (this->ctx,
- conf->grace_tv,
+ conf->grace_ts,
client_grace_timeout,
conf->rpc);
}
@@ -815,12 +816,16 @@ client_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
goto out;
args.loc = loc;
- args.flags = flags;
args.mode = mode;
args.fd = fd;
args.umask = umask;
args.xdata = xdata;
+ if (!conf->filter_o_direct)
+ args.flags = flags;
+ else
+ args.flags = (flags & ~O_DIRECT);
+
proc = &conf->fops->proctable[GF_FOP_CREATE];
if (!proc) {
gf_log (this->name, GF_LOG_ERROR,
@@ -854,10 +859,14 @@ client_open (call_frame_t *frame, xlator_t *this, loc_t *loc,
goto out;
args.loc = loc;
- args.flags = flags;
args.fd = fd;
args.xdata = xdata;
+ if (!conf->filter_o_direct)
+ args.flags = flags;
+ else
+ args.flags = (flags & ~O_DIRECT);
+
proc = &conf->fops->proctable[GF_FOP_OPEN];
if (!proc) {
gf_log (this->name, GF_LOG_ERROR,
@@ -935,6 +944,7 @@ client_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
args.vector = vector;
args.count = count;
args.offset = off;
+ args.size = iov_length (vector, count);
args.flags = flags;
args.iobref = iobref;
args.xdata = xdata;
@@ -1953,6 +1963,142 @@ out:
return 0;
}
+int32_t
+client_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ int ret = -1;
+ clnt_conf_t *conf = NULL;
+ rpc_clnt_procedure_t *proc = NULL;
+ clnt_args_t args = {0,};
+
+ conf = this->private;
+ if (!conf || !conf->fops)
+ goto out;
+
+ args.fd = fd;
+ args.flags = mode;
+ args.offset = offset;
+ args.size = len;
+ args.xdata = xdata;
+
+ proc = &conf->fops->proctable[GF_FOP_FALLOCATE];
+ if (!proc) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "rpc procedure not found for %s",
+ gf_fop_list[GF_FOP_FALLOCATE]);
+ goto out;
+ }
+ if (proc->fn)
+ ret = proc->fn (frame, this, &args);
+out:
+ if (ret)
+ STACK_UNWIND_STRICT (fallocate, frame, -1, ENOTCONN, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int32_t
+client_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ int ret = -1;
+ clnt_conf_t *conf = NULL;
+ rpc_clnt_procedure_t *proc = NULL;
+ clnt_args_t args = {0,};
+
+ conf = this->private;
+ if (!conf || !conf->fops)
+ goto out;
+
+ args.fd = fd;
+ args.offset = offset;
+ args.size = len;
+ args.xdata = xdata;
+
+ proc = &conf->fops->proctable[GF_FOP_DISCARD];
+ if (!proc) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "rpc procedure not found for %s",
+ gf_fop_list[GF_FOP_DISCARD]);
+ goto out;
+ }
+ if (proc->fn)
+ ret = proc->fn (frame, this, &args);
+out:
+ if (ret)
+ STACK_UNWIND_STRICT(discard, frame, -1, ENOTCONN, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int32_t
+client_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ int ret = -1;
+ clnt_conf_t *conf = NULL;
+ rpc_clnt_procedure_t *proc = NULL;
+ clnt_args_t args = {0,};
+
+ conf = this->private;
+ if (!conf || !conf->fops)
+ goto out;
+
+ args.fd = fd;
+ args.offset = offset;
+ args.size = len;
+ args.xdata = xdata;
+
+ proc = &conf->fops->proctable[GF_FOP_ZEROFILL];
+ if (!proc) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "rpc procedure not found for %s",
+ gf_fop_list[GF_FOP_ZEROFILL]);
+ goto out;
+ }
+ if (proc->fn)
+ ret = proc->fn (frame, this, &args);
+out:
+ if (ret)
+ STACK_UNWIND_STRICT(zerofill, frame, -1, ENOTCONN,
+ NULL, NULL, NULL);
+
+ return 0;
+}
+
+
+int32_t
+client_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+ int ret = -1;
+ clnt_conf_t *conf = NULL;
+ rpc_clnt_procedure_t *proc = NULL;
+ clnt_args_t args = {0,};
+
+ conf = this->private;
+ if (!conf || !conf->fops)
+ goto out;
+
+ args.cmd = op;
+ args.xdata = xdata;
+
+ proc = &conf->fops->proctable[GF_FOP_IPC];
+ if (!proc) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "rpc procedure not found for %s",
+ gf_fop_list[GF_FOP_IPC]);
+ goto out;
+ }
+ if (proc->fn)
+ ret = proc->fn (frame, this, &args);
+out:
+ if (ret)
+ STACK_UNWIND_STRICT(ipc, frame, -1, ENOTCONN, NULL);
+
+ return 0;
+}
+
int32_t
client_getspec (call_frame_t *frame, xlator_t *this, const char *key,
@@ -2042,7 +2188,7 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
gf_log (this->name, GF_LOG_DEBUG, "got RPC_CLNT_CONNECT");
if ((ret < 0) || (strcasecmp (handshake, "on"))) {
- ret = client_handshake (this, conf->rpc);
+ ret = client_handshake (this, rpc);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
"handshake msg returned %d", ret);
@@ -2084,9 +2230,19 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
client_register_grace_timer (this, conf);
if (!conf->skip_notify) {
- if (conf->connected)
- gf_log (this->name, GF_LOG_INFO,
- "disconnected");
+ if (conf->connected) {
+ gf_log (this->name,
+ ((!conf->disconnect_err_logged)
+ ? GF_LOG_INFO : GF_LOG_DEBUG),
+ "disconnected from %s. Client process "
+ "will keep trying to connect to "
+ "glusterd until brick's port is "
+ "available",
+ conf->rpc->conn.name);
+
+ if (conf->portmap_err_logged)
+ conf->disconnect_err_logged = 1;
+ }
/* If the CHILD_DOWN event goes to parent xlator
multiple times, the logic of parent xlator notify
@@ -2110,10 +2266,14 @@ client_rpc_notify (struct rpc_clnt *rpc, void *mydata, rpc_clnt_event_t event,
conf->connected = 0;
conf->skip_notify = 0;
- if (conf->quick_reconnect) {
- conf->quick_reconnect = 0;
- rpc_clnt_start (conf->rpc);
- }
+ if (conf->quick_reconnect) {
+ conf->quick_reconnect = 0;
+ rpc_clnt_start (rpc);
+
+ } else {
+ rpc->conn.config.remote_port = 0;
+
+ }
break;
@@ -2198,6 +2358,9 @@ build_client_config (xlator_t *this, clnt_conf_t *conf)
gf_log (this->name, GF_LOG_WARNING,
"option 'remote-subvolume' not given");
+ GF_OPTION_INIT ("filter-O_DIRECT", conf->filter_o_direct,
+ bool, out);
+
ret = 0;
out:
return ret;
@@ -2321,14 +2484,14 @@ client_init_grace_timer (xlator_t *this, dict_t *options,
ret = dict_get_int32 (options, "grace-timeout", &grace_timeout);
if (!ret)
- conf->grace_tv.tv_sec = grace_timeout;
+ conf->grace_ts.tv_sec = grace_timeout;
else
- conf->grace_tv.tv_sec = 10;
+ conf->grace_ts.tv_sec = 10;
- conf->grace_tv.tv_usec = 0;
+ conf->grace_ts.tv_nsec = 0;
gf_log (this->name, GF_LOG_DEBUG, "Client grace timeout "
- "value = %"PRIu64, conf->grace_tv.tv_sec);
+ "value = %"GF_PRI_SECOND, conf->grace_ts.tv_sec);
ret = 0;
out:
@@ -2382,6 +2545,9 @@ reconfigure (xlator_t *this, dict_t *options)
}
}
+ GF_OPTION_RECONF ("filter-O_DIRECT", conf->filter_o_direct,
+ options, bool, out);
+
ret = client_init_grace_timer (this, options, conf);
if (ret)
goto out;
@@ -2651,7 +2817,11 @@ struct xlator_fops fops = {
.fxattrop = client_fxattrop,
.setattr = client_setattr,
.fsetattr = client_fsetattr,
+ .fallocate = client_fallocate,
+ .discard = client_discard,
+ .zerofill = client_zerofill,
.getspec = client_getspec,
+ .ipc = client_ipc,
};
@@ -2706,16 +2876,33 @@ struct volume_options options[] = {
{ .key = {"lk-heal"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "off",
+ .description = "When the connection to client is lost, server "
+ "cleans up all the locks held by the client. After "
+ "the connection is restored, the client reacquires "
+ "(heals) the fcntl locks released by the server."
},
{ .key = {"grace-timeout"},
.type = GF_OPTION_TYPE_INT,
.min = 10,
- .max = 1800
+ .max = 1800,
+ .default_value = "10",
+ .description = "Specifies the duration for the lock state to be "
+ "maintained on the client after a network "
+ "disconnection. Range 10-1800 seconds."
},
{.key = {"tcp-window-size"},
.type = GF_OPTION_TYPE_SIZET,
.min = GF_MIN_SOCKET_WINDOW_SIZE,
- .max = GF_MAX_SOCKET_WINDOW_SIZE
+ .max = GF_MAX_SOCKET_WINDOW_SIZE,
+ .description = "Specifies the window size for tcp socket."
+ },
+ { .key = {"filter-O_DIRECT"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "disable",
+ .description = "If enabled, in open() and creat() calls, O_DIRECT "
+ "flag will be filtered at the client protocol level so server will "
+ "still continue to cache the file. This works similar to NFS's "
+ "behavior of O_DIRECT",
},
{ .key = {NULL} },
};
diff --git a/xlators/protocol/client/src/client.h b/xlators/protocol/client/src/client.h
index 420674910..bc0f5d0e9 100644
--- a/xlators/protocol/client/src/client.h
+++ b/xlators/protocol/client/src/client.h
@@ -34,27 +34,27 @@ typedef enum {
GF_LK_HEAL_DONE,
} lk_heal_state_t;
-#define CLIENT_GET_REMOTE_FD(conf, fd, remote_fd, op_errno, label) \
+typedef enum {
+ DEFAULT_REMOTE_FD = 0,
+ FALLBACK_TO_ANON_FD = 1
+} clnt_remote_fd_flags_t;
+
+#define CLIENT_GET_REMOTE_FD(xl, fd, flags, remote_fd, op_errno, label) \
do { \
- clnt_fd_ctx_t *fdctx = NULL; \
- pthread_mutex_lock (&conf->lock); \
- { \
- fdctx = this_fd_get_ctx (fd, THIS); \
- } \
- pthread_mutex_unlock (&conf->lock); \
- if (!fdctx) { \
- remote_fd = -2; \
- } else { \
- remote_fd = fdctx->remote_fd; \
+ int _ret = 0; \
+ _ret = client_get_remote_fd (xl, fd, flags, &remote_fd);\
+ if (_ret < 0) { \
+ op_errno = errno; \
+ goto label; \
} \
if (remote_fd == -1) { \
- gf_log (THIS->name, GF_LOG_WARNING, " (%s) " \
+ gf_log (xl->name, GF_LOG_WARNING, " (%s) " \
"remote_fd is -1. EBADFD", \
uuid_utoa (fd->inode->gfid)); \
op_errno = EBADFD; \
goto label; \
} \
- } while (0);
+ } while (0)
#define CLIENT_STACK_UNWIND(op, frame, params ...) do { \
clnt_local_t *__local = frame->local; \
@@ -93,6 +93,10 @@ typedef struct clnt_conf {
which was sent earlier */
char portmap_err_logged; /* flag used to prevent
excessive logging */
+ char disconnect_err_logged; /* flag used to prevent
+ excessive disconnect
+ logging */
+
char need_different_port; /* flag used to change the
portmap path in case of
'tcp,rdma' on server */
@@ -100,7 +104,7 @@ typedef struct clnt_conf {
uint16_t lk_version; /* this variable is used to distinguish
client-server transaction while
performing lock healing */
- struct timeval grace_tv;
+ struct timespec grace_ts;
gf_timer_t *grace_timer;
gf_boolean_t grace_timer_needed; /* The state of this flag will
be used to decide whether
@@ -114,6 +118,13 @@ typedef struct clnt_conf {
the reconnection happen after
the usual 3-second wait
*/
+ gf_boolean_t filter_o_direct; /* if set, filter O_DIRECT from
+ the flags list of open() */
+ /* set volume is the op which results in creating/re-using
+ * the conn-id and is called once per connection, this remembers
+ * how manytimes set_volume is called
+ */
+ uint64_t setvol_count;
} clnt_conf_t;
typedef struct _client_fd_ctx {
@@ -128,7 +139,9 @@ typedef struct _client_fd_ctx {
pthread_mutex_t mutex;
lk_heal_state_t lk_heal_state;
uuid_t gfid;
+ void (*reopen_done) (struct _client_fd_ctx*, xlator_t *);
struct list_head lock_list; /* List of all granted locks on this fd */
+ int32_t reopen_attempts;
} clnt_fd_ctx_t;
typedef struct _client_posix_lock {
@@ -156,7 +169,8 @@ typedef struct client_local {
int32_t cmd;
struct list_head lock_list;
pthread_mutex_t mutex;
- char *name;
+ char *name;
+ gf_boolean_t attempt_reopen;
} clnt_local_t;
typedef struct client_args {
@@ -208,9 +222,6 @@ int client_submit_request (xlator_t *this, void *req,
struct iovec *rsp_payload, int rsp_count,
struct iobref *rsp_iobref, xdrproc_t xdrproc);
-int protocol_client_reopendir (xlator_t *this, clnt_fd_ctx_t *fdctx);
-int protocol_client_reopen (xlator_t *this, clnt_fd_ctx_t *fdctx);
-
int unserialize_rsp_dirent (struct gfs3_readdir_rsp *rsp, gf_dirent_t *entries);
int unserialize_rsp_direntp (xlator_t *this, fd_t *fd,
struct gfs3_readdirp_rsp *rsp, gf_dirent_t *entries);
@@ -221,7 +232,6 @@ int client_attempt_lock_recovery (xlator_t *this, clnt_fd_ctx_t *fdctx);
int32_t delete_granted_locks_owner (fd_t *fd, gf_lkowner_t *owner);
int client_add_lock_for_recovery (fd_t *fd, struct gf_flock *flock,
gf_lkowner_t *owner, int32_t cmd);
-uint64_t decrement_reopen_fd_count (xlator_t *this, clnt_conf_t *conf);
int32_t delete_granted_locks_fd (clnt_fd_ctx_t *fdctx);
int32_t client_cmd_to_gf_cmd (int32_t cmd, int32_t *gf_cmd);
void client_save_number_fds (clnt_conf_t *conf, int count);
@@ -241,4 +251,12 @@ int client_mark_fd_bad (xlator_t *this);
int client_set_lk_version (xlator_t *this);
int client_fd_lk_list_empty (fd_lk_ctx_t *lk_ctx, gf_boolean_t use_try_lock);
+void client_default_reopen_done (clnt_fd_ctx_t *fdctx, xlator_t *this);
+void client_attempt_reopen (fd_t *fd, xlator_t *this);
+int client_get_remote_fd (xlator_t *this, fd_t *fd, int flags,
+ int64_t *remote_fd);
+int client_fd_fop_prepare_local (call_frame_t *frame, fd_t *fd,
+ int64_t remote_fd);
+gf_boolean_t
+__is_fd_reopen_in_progress (clnt_fd_ctx_t *fdctx);
#endif /* !_CLIENT_H */
diff --git a/xlators/protocol/server/src/Makefile.am b/xlators/protocol/server/src/Makefile.am
index c7ac38172..6a18bf025 100644
--- a/xlators/protocol/server/src/Makefile.am
+++ b/xlators/protocol/server/src/Makefile.am
@@ -1,7 +1,7 @@
xlator_LTLIBRARIES = server.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/protocol
-server_la_LDFLAGS = -module -avoidversion
+server_la_LDFLAGS = -module -avoid-version
server_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
$(top_builddir)/rpc/rpc-lib/src/libgfrpc.la \
@@ -16,10 +16,11 @@ AM_CPPFLAGS = $(GF_CPPFLAGS) \
-I$(top_srcdir)/libglusterfs/src \
-DCONFDIR=\"$(sysconfdir)/glusterfs\" \
-DLIBDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/auth\" \
- -I$(top_srcdir)/xlators/protocol/lib/src \
- -I$(top_srcdir)/rpc/rpc-lib/src/ \
- -I$(top_srcdir)/rpc/xdr/src/
+ -I$(top_srcdir)/xlators/protocol/lib/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src \
+ -I$(top_srcdir)/rpc/xdr/src
-AM_CFLAGS = -Wall $(GF_CFLAGS)
+AM_CFLAGS = -Wall $(GF_CFLAGS) \
+ -DDATADIR=\"$(localstatedir)\"
CLEANFILES = *~
diff --git a/xlators/protocol/server/src/authenticate.c b/xlators/protocol/server/src/authenticate.c
index 9c843d0bb..d8d138a84 100644
--- a/xlators/protocol/server/src/authenticate.c
+++ b/xlators/protocol/server/src/authenticate.c
@@ -1,21 +1,14 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2007-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
-*/
#ifndef _CONFIG_H
#define _CONFIG_H
@@ -124,34 +117,41 @@ fini (dict_t *this, char *key, data_t *value, void *data)
return 0;
}
+static int
+_gf_auth_option_validate (dict_t *d, char *k, data_t *v, void *tmp)
+{
+ auth_handle_t *handle = NULL;
+ xlator_t *xl = NULL;
+ int ret = 0;
+
+ xl = tmp;
+
+ handle = data_to_ptr (v);
+ if (!handle)
+ return 0;
+
+ list_add_tail (&(handle->vol_opt->list), &(xl->volume_options));
+
+ ret = xlator_options_validate_list (xl, xl->options,
+ handle->vol_opt, NULL);
+ if (ret) {
+ gf_log ("authenticate", GF_LOG_ERROR,
+ "volume option validation failed");
+ return -1;
+ }
+ return 0;
+}
+
int32_t
gf_auth_init (xlator_t *xl, dict_t *auth_modules)
{
int ret = 0;
- auth_handle_t *handle = NULL;
dict_foreach (auth_modules, init, &ret);
if (ret)
goto out;
- int _auth_option_validate (dict_t *d, char *k, data_t *v, void *tmp)
- {
- handle = data_to_ptr (v);
- if (!handle)
- return 0;
-
- list_add_tail (&(handle->vol_opt->list),
- &(xl->volume_options));
- ret = xlator_options_validate_list (xl, xl->options,
- handle->vol_opt, NULL);
- if (ret) {
- gf_log ("authenticate", GF_LOG_ERROR,
- "volume option validation failed");
- return -1;
- }
- return 0;
- }
- ret = dict_foreach (auth_modules, _auth_option_validate, NULL);
+ ret = dict_foreach (auth_modules, _gf_auth_option_validate, xl);
out:
if (ret) {
diff --git a/xlators/protocol/server/src/authenticate.h b/xlators/protocol/server/src/authenticate.h
index bb9f77e45..d4d43e498 100644
--- a/xlators/protocol/server/src/authenticate.h
+++ b/xlators/protocol/server/src/authenticate.h
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2007-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2007-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _AUTHENTICATE_H
diff --git a/xlators/protocol/server/src/server-handshake.c b/xlators/protocol/server/src/server-handshake.c
index d893716cc..a459f89e1 100644
--- a/xlators/protocol/server/src/server-handshake.c
+++ b/xlators/protocol/server/src/server-handshake.c
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2010-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -103,9 +94,9 @@ _volfile_update_checksum (xlator_t *this, char *key, uint32_t checksum)
if (temp_volfile->checksum != checksum) {
gf_log (this->name, GF_LOG_INFO,
- "the volume file got modified between earlier access "
- "and now, this may lead to inconsistency between "
- "clients, advised to remount client");
+ "the volume file was modified between a prior access "
+ "and now. This may lead to inconsistency between "
+ "clients, you are advised to remount client");
temp_volfile->checksum = checksum;
}
@@ -118,10 +109,10 @@ static size_t
getspec_build_volfile_path (xlator_t *this, const char *key, char *path,
size_t path_len)
{
- int ret = -1;
+ char *filename = NULL;
+ server_conf_t *conf = NULL;
+ int ret = -1;
int free_filename = 0;
- char *filename = NULL;
- server_conf_t *conf = NULL;
char data_key[256] = {0,};
conf = this->private;
@@ -264,7 +255,9 @@ server_getspec (rpcsvc_request_t *req)
this = req->svc->mydata;
conf = this->private;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gf_getspec_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gf_getspec_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
op_errno = EINVAL;
@@ -336,14 +329,15 @@ server_setvolume (rpcsvc_request_t *req)
{
gf_setvolume_req args = {{0,},};
gf_setvolume_rsp rsp = {0,};
- server_connection_t *conn = NULL;
+ client_t *client = NULL;
+ server_ctx_t *serv_ctx = NULL;
server_conf_t *conf = NULL;
peer_info_t *peerinfo = NULL;
dict_t *reply = NULL;
dict_t *config_params = NULL;
dict_t *params = NULL;
char *name = NULL;
- char *process_uuid = NULL;
+ char *client_uid = NULL;
char *clnt_version = NULL;
xlator_t *xl = NULL;
char *msg = NULL;
@@ -361,7 +355,9 @@ server_setvolume (rpcsvc_request_t *req)
params = dict_new ();
reply = dict_new ();
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gf_setvolume_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gf_setvolume_req);
+ if (ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto fail;
@@ -398,7 +394,7 @@ server_setvolume (rpcsvc_request_t *req)
params->extra_free = buf;
buf = NULL;
- ret = dict_get_str (params, "process-uuid", &process_uuid);
+ ret = dict_get_str (params, "process-uuid", &client_uid);
if (ret < 0) {
ret = dict_set_str (reply, "ERROR",
"UUID not specified");
@@ -425,25 +421,34 @@ server_setvolume (rpcsvc_request_t *req)
goto fail;
}
- conn = server_connection_get (this, process_uuid);
- if (!conn) {
+ client = gf_client_get (this, &req->cred, client_uid);
+ if (client == NULL) {
op_ret = -1;
op_errno = ENOMEM;
goto fail;
}
- gf_log (this->name, GF_LOG_DEBUG, "Connected to %s", conn->id);
- cancelled = server_cancel_conn_timer (this, conn);
- if (cancelled)//Do connection_put on behalf of grace-timer-handler.
- server_connection_put (this, conn, NULL);
- if (conn->lk_version != 0 &&
- conn->lk_version != lk_version) {
- (void) server_connection_cleanup (this, conn,
+ gf_log (this->name, GF_LOG_DEBUG, "Connected to %s", client->client_uid);
+ cancelled = server_cancel_grace_timer (this, client);
+ if (cancelled)//Do gf_client_put on behalf of grace-timer-handler.
+ gf_client_put (client, NULL);
+
+ serv_ctx = server_ctx_get (client, client->this);
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto fail;
+ }
+
+ if (serv_ctx->lk_version != 0 &&
+ serv_ctx->lk_version != lk_version) {
+ (void) server_connection_cleanup (this, client,
INTERNAL_LOCKS | POSIX_LOCKS);
}
- if (req->trans->xl_private != conn)
- req->trans->xl_private = conn;
+ if (req->trans->xl_private != client)
+ req->trans->xl_private = client;
+
+ auth_set_username_passwd (params, config_params, client);
ret = dict_get_int32 (params, "fops-version", &fop_version);
if (ret < 0) {
@@ -568,10 +573,10 @@ server_setvolume (rpcsvc_request_t *req)
gf_log (this->name, GF_LOG_INFO,
"accepted client from %s (version: %s)",
- conn->id,
+ client->client_uid,
(clnt_version) ? clnt_version : "old");
op_ret = 0;
- conn->bound_xl = xl;
+ client->bound_xl = xl;
ret = dict_set_str (reply, "ERROR", "Success");
if (ret < 0)
gf_log (this->name, GF_LOG_DEBUG,
@@ -579,7 +584,7 @@ server_setvolume (rpcsvc_request_t *req)
} else {
gf_log (this->name, GF_LOG_ERROR,
"Cannot authenticate client from %s %s",
- conn->id,
+ client->client_uid,
(clnt_version) ? clnt_version : "old");
op_ret = -1;
@@ -591,7 +596,7 @@ server_setvolume (rpcsvc_request_t *req)
goto fail;
}
- if (conn->bound_xl == NULL) {
+ if (client->bound_xl == NULL) {
ret = dict_set_str (reply, "ERROR",
"Check volfile and handshake "
"options in protocol/client");
@@ -604,20 +609,21 @@ server_setvolume (rpcsvc_request_t *req)
goto fail;
}
- if ((conn->bound_xl != NULL) &&
+ if ((client->bound_xl != NULL) &&
(ret >= 0) &&
- (conn->bound_xl->itable == NULL)) {
+ (client->bound_xl->itable == NULL)) {
/* create inode table for this bound_xl, if one doesn't
already exist */
gf_log (this->name, GF_LOG_TRACE,
"creating inode table with lru_limit=%"PRId32", "
"xlator=%s", conf->inode_lru_limit,
- conn->bound_xl->name);
+ client->bound_xl->name);
/* TODO: what is this ? */
- conn->bound_xl->itable = inode_table_new (conf->inode_lru_limit,
- conn->bound_xl);
+ client->bound_xl->itable =
+ inode_table_new (conf->inode_lru_limit,
+ client->bound_xl);
}
ret = dict_set_str (reply, "process-uuid",
@@ -626,8 +632,7 @@ server_setvolume (rpcsvc_request_t *req)
gf_log (this->name, GF_LOG_DEBUG,
"failed to set 'process-uuid'");
- ret = dict_set_uint32 (reply, "clnt-lk-version",
- conn->lk_version);
+ ret = dict_set_uint32 (reply, "clnt-lk-version", serv_ctx->lk_version);
if (ret)
gf_log (this->name, GF_LOG_WARNING,
"failed to set 'clnt-lk-version'");
@@ -640,7 +645,7 @@ server_setvolume (rpcsvc_request_t *req)
fail:
rsp.dict.dict_len = dict_serialized_length (reply);
- if (rsp.dict.dict_len < 0) {
+ if (rsp.dict.dict_len > UINT_MAX) {
gf_log ("server-handshake", GF_LOG_DEBUG,
"failed to get serialized length of reply dict");
op_ret = -1;
@@ -669,8 +674,16 @@ fail:
* list of connections the server is maintaining and might segfault
* during statedump when bound_xl of the connection is accessed.
*/
- if (op_ret && conn)
- server_connection_put (this, conn, NULL);
+ if (op_ret && !xl) {
+ /* We would have set the xl_private of the transport to the
+ * @conn. But if we have put the connection i.e shutting down
+ * the connection, then we should set xl_private to NULL as it
+ * would be pointing to a freed memory and would segfault when
+ * accessed upon getting DISCONNECT.
+ */
+ gf_client_put (client, NULL);
+ req->trans->xl_private = NULL;
+ }
server_submit_reply (NULL, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_setvolume_rsp);
@@ -706,12 +719,13 @@ server_ping (rpcsvc_request_t *req)
int
server_set_lk_version (rpcsvc_request_t *req)
{
- int op_ret = -1;
- int op_errno = EINVAL;
- gf_set_lk_ver_req args = {0, };
- gf_set_lk_ver_rsp rsp = {0,};
- server_connection_t *conn = NULL;
- xlator_t *this = NULL;
+ int op_ret = -1;
+ int op_errno = EINVAL;
+ gf_set_lk_ver_req args = {0,};
+ gf_set_lk_ver_rsp rsp = {0,};
+ client_t *client = NULL;
+ server_ctx_t *serv_ctx = NULL;
+ xlator_t *this = NULL;
this = req->svc->mydata;
//TODO: Decide on an appropriate errno for the error-path
@@ -719,16 +733,23 @@ server_set_lk_version (rpcsvc_request_t *req)
if (!this)
goto fail;
- if (!xdr_to_generic (req->msg[0], &args,
- (xdrproc_t)xdr_gf_set_lk_ver_req)) {
+ op_ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gf_set_lk_ver_req);
+ if (op_ret < 0) {
//failed to decode msg;
req->rpc_err = GARBAGE_ARGS;
goto fail;
}
- conn = server_connection_get (this, args.uid);
- conn->lk_version = args.lk_ver;
- server_connection_put (this, conn, NULL);
+ client = gf_client_get (this, &req->cred, args.uid);
+ serv_ctx = server_ctx_get (client, client->this);
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto fail;
+ }
+
+ serv_ctx->lk_version = args.lk_ver;
+ gf_client_put (client, NULL);
rsp.lk_ver = args.lk_ver;
@@ -745,11 +766,11 @@ fail:
}
rpcsvc_actor_t gluster_handshake_actors[] = {
- [GF_HNDSK_NULL] = {"NULL", GF_HNDSK_NULL, server_null, NULL, 0},
- [GF_HNDSK_SETVOLUME] = {"SETVOLUME", GF_HNDSK_SETVOLUME, server_setvolume, NULL, 0},
- [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, server_getspec, NULL, 0},
- [GF_HNDSK_PING] = {"PING", GF_HNDSK_PING, server_ping, NULL, 0},
- [GF_HNDSK_SET_LK_VER] = {"SET_LK_VER", GF_HNDSK_SET_LK_VER, server_set_lk_version, NULL, 0},
+ [GF_HNDSK_NULL] = {"NULL", GF_HNDSK_NULL, server_null, NULL, 0, DRC_NA},
+ [GF_HNDSK_SETVOLUME] = {"SETVOLUME", GF_HNDSK_SETVOLUME, server_setvolume, NULL, 0, DRC_NA},
+ [GF_HNDSK_GETSPEC] = {"GETSPEC", GF_HNDSK_GETSPEC, server_getspec, NULL, 0, DRC_NA},
+ [GF_HNDSK_PING] = {"PING", GF_HNDSK_PING, server_ping, NULL, 0, DRC_NA},
+ [GF_HNDSK_SET_LK_VER] = {"SET_LK_VER", GF_HNDSK_SET_LK_VER, server_set_lk_version, NULL, 0, DRC_NA},
};
diff --git a/xlators/protocol/server/src/server-helpers.c b/xlators/protocol/server/src/server-helpers.c
index 0adcd1cf1..600a311c3 100644
--- a/xlators/protocol/server/src/server-helpers.c
+++ b/xlators/protocol/server/src/server-helpers.c
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2010-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -35,10 +26,14 @@ server_decode_groups (call_frame_t *frame, rpcsvc_request_t *req)
GF_VALIDATE_OR_GOTO ("server", frame, out);
GF_VALIDATE_OR_GOTO ("server", req, out);
+ if (call_stack_alloc_groups (frame->root, req->auxgidcount) != 0)
+ return -1;
+
frame->root->ngrps = req->auxgidcount;
if (frame->root->ngrps == 0)
return 0;
+ /* ngrps cannot be bigger than USHRT_MAX(65535) */
if (frame->root->ngrps > GF_MAX_AUX_GROUPS)
return -1;
@@ -48,6 +43,7 @@ out:
return 0;
}
+
void
server_loc_wipe (loc_t *loc)
{
@@ -79,11 +75,6 @@ server_resolve_wipe (server_resolve_t *resolve)
void
free_state (server_state_t *state)
{
- if (state->conn) {
- //xprt_svc_unref (state->conn);
- state->conn = NULL;
- }
-
if (state->xprt) {
rpc_transport_unref (state->xprt);
state->xprt = NULL;
@@ -132,301 +123,26 @@ free_state (server_state_t *state)
}
-int
-gf_add_locker (server_connection_t *conn, const char *volume,
- loc_t *loc, fd_t *fd, pid_t pid, gf_lkowner_t *owner,
- glusterfs_fop_t type)
-{
- int32_t ret = -1;
- struct _locker *new = NULL;
- struct _lock_table *table = NULL;
-
- GF_VALIDATE_OR_GOTO ("server", volume, out);
-
- new = GF_CALLOC (1, sizeof (struct _locker), gf_server_mt_locker_t);
- if (new == NULL) {
- goto out;
- }
- INIT_LIST_HEAD (&new->lockers);
-
- new->volume = gf_strdup (volume);
-
- if (fd == NULL) {
- loc_copy (&new->loc, loc);
- } else {
- new->fd = fd_ref (fd);
- }
-
- new->pid = pid;
- new->owner = *owner;
-
- pthread_mutex_lock (&conn->lock);
- {
- table = conn->ltable;
- if (type == GF_FOP_ENTRYLK)
- list_add_tail (&new->lockers, &table->entrylk_lockers);
- else
- list_add_tail (&new->lockers, &table->inodelk_lockers);
- }
- pthread_mutex_unlock (&conn->lock);
-out:
- return ret;
-}
-
-
-int
-gf_del_locker (server_connection_t *conn, const char *volume,
- loc_t *loc, fd_t *fd, gf_lkowner_t *owner,
- glusterfs_fop_t type)
-{
- struct _locker *locker = NULL;
- struct _locker *tmp = NULL;
- int32_t ret = -1;
- struct list_head *head = NULL;
- struct _lock_table *table = NULL;
- int found = 0;
-
- GF_VALIDATE_OR_GOTO ("server", volume, out);
-
- pthread_mutex_lock (&conn->lock);
- {
- table = conn->ltable;
- if (type == GF_FOP_ENTRYLK) {
- head = &table->entrylk_lockers;
- } else {
- head = &table->inodelk_lockers;
- }
-
- list_for_each_entry_safe (locker, tmp, head, lockers) {
- if (!is_same_lkowner (&locker->owner, owner) ||
- strcmp (locker->volume, volume))
- continue;
-
- if (locker->fd && fd && (locker->fd == fd))
- found = 1;
- else if (locker->loc.inode && loc &&
- (locker->loc.inode == loc->inode))
- found = 1;
- if (found) {
- list_del_init (&locker->lockers);
- break;
- }
- }
- if (!found)
- locker = NULL;
- }
- pthread_mutex_unlock (&conn->lock);
-
- if (locker) {
- if (locker->fd)
- fd_unref (locker->fd);
- else
- loc_wipe (&locker->loc);
-
- GF_FREE (locker->volume);
- GF_FREE (locker);
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-static struct _lock_table *
-gf_lock_table_new (void)
-{
- struct _lock_table *new = NULL;
-
- new = GF_CALLOC (1, sizeof (struct _lock_table), gf_server_mt_lock_table_t);
- if (new == NULL) {
- goto out;
- }
- INIT_LIST_HEAD (&new->entrylk_lockers);
- INIT_LIST_HEAD (&new->inodelk_lockers);
-out:
- return new;
-}
-
-static int
-server_nop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- int ret = -1;
- server_state_t *state = NULL;
-
- GF_VALIDATE_OR_GOTO ("server", frame, out);
- GF_VALIDATE_OR_GOTO ("server", cookie, out);
- GF_VALIDATE_OR_GOTO ("server", this, out);
-
- if (frame->root->trans)
- server_conn_unref (frame->root->trans);
- state = CALL_STATE(frame);
-
- if (state)
- free_state (state);
- STACK_DESTROY (frame->root);
-
- ret = 0;
-out:
- return ret;
-}
-
-int
-do_lock_table_cleanup (xlator_t *this, server_connection_t *conn,
- call_frame_t *frame, struct _lock_table *ltable)
-{
- struct list_head inodelk_lockers, entrylk_lockers;
- call_frame_t *tmp_frame = NULL;
- struct gf_flock flock = {0, };
- xlator_t *bound_xl = NULL;
- struct _locker *locker = NULL, *tmp = NULL;
- int ret = -1;
- char *path = NULL;
-
- GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", conn, out);
- GF_VALIDATE_OR_GOTO ("server", frame, out);
- GF_VALIDATE_OR_GOTO ("server", ltable, out);
-
- bound_xl = conn->bound_xl;
- INIT_LIST_HEAD (&inodelk_lockers);
- INIT_LIST_HEAD (&entrylk_lockers);
-
- list_splice_init (&ltable->inodelk_lockers,
- &inodelk_lockers);
-
- list_splice_init (&ltable->entrylk_lockers, &entrylk_lockers);
- GF_FREE (ltable);
-
- flock.l_type = F_UNLCK;
- flock.l_start = 0;
- flock.l_len = 0;
- list_for_each_entry_safe (locker,
- tmp, &inodelk_lockers, lockers) {
- tmp_frame = copy_frame (frame);
- if (tmp_frame == NULL) {
- goto out;
- }
- /*
- lock owner = 0 is a special case that tells posix-locks
- to release all locks from this transport
- */
- tmp_frame->root->pid = 0;
- tmp_frame->root->trans = server_conn_ref (conn);
- memset (&tmp_frame->root->lk_owner, 0, sizeof (gf_lkowner_t));
-
- if (locker->fd) {
- GF_ASSERT (locker->fd->inode);
-
- ret = inode_path (locker->fd->inode, NULL, &path);
-
- if (ret > 0) {
- gf_log (this->name, GF_LOG_INFO, "finodelk "
- "released on %s", path);
- GF_FREE (path);
- } else {
-
- gf_log (this->name, GF_LOG_INFO, "finodelk "
- "released on inode with gfid %s",
- uuid_utoa (locker->fd->inode->gfid));
- }
-
- STACK_WIND (tmp_frame, server_nop_cbk, bound_xl,
- bound_xl->fops->finodelk,
- locker->volume,
- locker->fd, F_SETLK, &flock, NULL);
- fd_unref (locker->fd);
- } else {
- gf_log (this->name, GF_LOG_INFO, "inodelk released "
- "on %s", locker->loc.path);
-
- STACK_WIND (tmp_frame, server_nop_cbk, bound_xl,
- bound_xl->fops->inodelk,
- locker->volume,
- &(locker->loc), F_SETLK, &flock, NULL);
- loc_wipe (&locker->loc);
- }
-
- GF_FREE (locker->volume);
-
- list_del_init (&locker->lockers);
- GF_FREE (locker);
- }
-
- tmp = NULL;
- locker = NULL;
- list_for_each_entry_safe (locker, tmp, &entrylk_lockers, lockers) {
- tmp_frame = copy_frame (frame);
-
- tmp_frame->root->pid = 0;
- tmp_frame->root->trans = server_conn_ref (conn);
- memset (&tmp_frame->root->lk_owner, 0, sizeof (gf_lkowner_t));
-
- if (locker->fd) {
- GF_ASSERT (locker->fd->inode);
-
- ret = inode_path (locker->fd->inode, NULL, &path);
-
- if (ret > 0) {
- gf_log (this->name, GF_LOG_INFO, "fentrylk "
- "released on %s", path);
- GF_FREE (path);
- } else {
-
- gf_log (this->name, GF_LOG_INFO, "fentrylk "
- "released on inode with gfid %s",
- uuid_utoa (locker->fd->inode->gfid));
- }
-
- STACK_WIND (tmp_frame, server_nop_cbk, bound_xl,
- bound_xl->fops->fentrylk,
- locker->volume,
- locker->fd, NULL,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
- fd_unref (locker->fd);
- } else {
- gf_log (this->name, GF_LOG_INFO, "entrylk released "
- "on %s", locker->loc.path);
-
- STACK_WIND (tmp_frame, server_nop_cbk, bound_xl,
- bound_xl->fops->entrylk,
- locker->volume,
- &(locker->loc), NULL,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
- loc_wipe (&locker->loc);
- }
-
- GF_FREE (locker->volume);
-
- list_del_init (&locker->lockers);
- GF_FREE (locker);
- }
- ret = 0;
-
-out:
- return ret;
-}
-
-
static int
server_connection_cleanup_flush_cbk (call_frame_t *frame, void *cookie,
xlator_t *this, int32_t op_ret,
int32_t op_errno, dict_t *xdata)
{
- int32_t ret = -1;
- fd_t *fd = NULL;
+ int32_t ret = -1;
+ fd_t *fd = NULL;
+ client_t *client = NULL;
GF_VALIDATE_OR_GOTO ("server", this, out);
GF_VALIDATE_OR_GOTO ("server", cookie, out);
GF_VALIDATE_OR_GOTO ("server", frame, out);
fd = frame->local;
+ client = frame->root->client;
fd_unref (fd);
frame->local = NULL;
- if (frame->root->trans)
- server_conn_unref (frame->root->trans);
+ gf_client_unref (client);
STACK_DESTROY (frame->root);
ret = 0;
@@ -435,9 +151,8 @@ out:
}
-int
-do_fd_cleanup (xlator_t *this, server_connection_t *conn, call_frame_t *frame,
- fdentry_t *fdentries, int fd_count)
+static int
+do_fd_cleanup (xlator_t *this, client_t* client, fdentry_t *fdentries, int fd_count)
{
fd_t *fd = NULL;
int i = 0, ret = -1;
@@ -446,16 +161,14 @@ do_fd_cleanup (xlator_t *this, server_connection_t *conn, call_frame_t *frame,
char *path = NULL;
GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", conn, out);
- GF_VALIDATE_OR_GOTO ("server", frame, out);
GF_VALIDATE_OR_GOTO ("server", fdentries, out);
- bound_xl = conn->bound_xl;
+ bound_xl = client->bound_xl;
for (i = 0;i < fd_count; i++) {
fd = fdentries[i].fd;
if (fd != NULL) {
- tmp_frame = copy_frame (frame);
+ tmp_frame = create_frame (this, this->ctx->pool);
if (tmp_frame == NULL) {
goto out;
}
@@ -465,20 +178,20 @@ do_fd_cleanup (xlator_t *this, server_connection_t *conn, call_frame_t *frame,
ret = inode_path (fd->inode, NULL, &path);
if (ret > 0) {
- gf_log (this->name, GF_LOG_INFO, "fd cleanup on "
- "%s", path);
+ gf_log (this->name, GF_LOG_INFO,
+ "fd cleanup on %s", path);
GF_FREE (path);
} else {
- gf_log (this->name, GF_LOG_INFO, "fd cleanup on"
- " inode with gfid %s",
+ gf_log (this->name, GF_LOG_INFO,
+ "fd cleanup on inode with gfid %s",
uuid_utoa (fd->inode->gfid));
}
tmp_frame->local = fd;
tmp_frame->root->pid = 0;
- tmp_frame->root->trans = server_conn_ref (conn);
+ gf_client_ref (client);
memset (&tmp_frame->root->lk_owner, 0,
sizeof (gf_lkowner_t));
@@ -495,268 +208,72 @@ out:
return ret;
}
-int
-do_connection_cleanup (xlator_t *this, server_connection_t *conn,
- struct _lock_table *ltable, fdentry_t *fdentries, int fd_count)
-{
- int ret = 0;
- int saved_ret = 0;
- call_frame_t *frame = NULL;
- server_state_t *state = NULL;
-
- GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", conn, out);
-
- if (!ltable && !fdentries)
- goto out;
-
- frame = create_frame (this, this->ctx->pool);
- if (frame == NULL) {
- goto out;
- }
-
- if (ltable)
- saved_ret = do_lock_table_cleanup (this, conn, frame, ltable);
-
- if (fdentries != NULL) {
- ret = do_fd_cleanup (this, conn, frame, fdentries, fd_count);
- }
-
- state = CALL_STATE (frame);
- GF_FREE (state);
-
- STACK_DESTROY (frame->root);
-
- if (saved_ret || ret) {
- ret = -1;
- }
-
-out:
- return ret;
-}
-
int
-server_connection_cleanup (xlator_t *this, server_connection_t *conn,
+server_connection_cleanup (xlator_t *this, client_t *client,
int32_t flags)
{
- struct _lock_table *ltable = NULL;
- fdentry_t *fdentries = NULL;
- uint32_t fd_count = 0;
- int ret = 0;
+ server_ctx_t *serv_ctx = NULL;
+ fdentry_t *fdentries = NULL;
+ uint32_t fd_count = 0;
+ int cd_ret = 0;
+ int ret = 0;
GF_VALIDATE_OR_GOTO (this->name, this, out);
- GF_VALIDATE_OR_GOTO (this->name, conn, out);
+ GF_VALIDATE_OR_GOTO (this->name, client, out);
GF_VALIDATE_OR_GOTO (this->name, flags, out);
- pthread_mutex_lock (&conn->lock);
- {
- if (conn->ltable && (flags & INTERNAL_LOCKS)) {
- ltable = conn->ltable;
- conn->ltable = gf_lock_table_new ();
- }
-
- if (conn->fdtable && (flags & POSIX_LOCKS))
- fdentries = gf_fd_fdtable_get_all_fds (conn->fdtable,
- &fd_count);
- }
- pthread_mutex_unlock (&conn->lock);
-
- if (conn->bound_xl)
- ret = do_connection_cleanup (this, conn, ltable,
- fdentries, fd_count);
-
-out:
- return ret;
-}
-
-
-int
-server_connection_destroy (xlator_t *this, server_connection_t *conn)
-{
- xlator_t *bound_xl = NULL;
- int32_t ret = -1;
- struct list_head inodelk_lockers;
- struct list_head entrylk_lockers;
- struct _lock_table *ltable = NULL;
- fdtable_t *fdtable = NULL;
-
- GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", conn, out);
-
- bound_xl = (xlator_t *) (conn->bound_xl);
+ serv_ctx = server_ctx_get (client, client->this);
- if (bound_xl) {
- pthread_mutex_lock (&(conn->lock));
- {
- if (conn->ltable) {
- ltable = conn->ltable;
- conn->ltable = NULL;
- }
- if (conn->fdtable) {
- fdtable = conn->fdtable;
- conn->fdtable = NULL;
- }
- }
- pthread_mutex_unlock (&conn->lock);
-
- INIT_LIST_HEAD (&inodelk_lockers);
- INIT_LIST_HEAD (&entrylk_lockers);
-
- if (ltable) {
- list_splice_init (&ltable->inodelk_lockers,
- &inodelk_lockers);
-
- list_splice_init (&ltable->entrylk_lockers,
- &entrylk_lockers);
- GF_FREE (ltable);
- }
-
- GF_ASSERT (list_empty (&inodelk_lockers));
- GF_ASSERT (list_empty (&entrylk_lockers));
-
- if (fdtable)
- gf_fd_fdtable_destroy (fdtable);
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto out;
}
- gf_log (this->name, GF_LOG_INFO, "destroyed connection of %s",
- conn->id);
-
- pthread_mutex_destroy (&conn->lock);
- GF_FREE (conn->id);
- GF_FREE (conn);
- ret = 0;
-out:
- return ret;
-}
-
-server_connection_t*
-server_conn_unref (server_connection_t *conn)
-{
- server_connection_t *todel = NULL;
- xlator_t *this = NULL;
-
- pthread_mutex_lock (&conn->lock);
+ LOCK (&serv_ctx->fdtable_lock);
{
- conn->ref--;
-
- if (!conn->ref) {
- todel = conn;
- }
+ if (serv_ctx->fdtable && (flags & POSIX_LOCKS))
+ fdentries = gf_fd_fdtable_get_all_fds (serv_ctx->fdtable,
+ &fd_count);
}
- pthread_mutex_unlock (&conn->lock);
+ UNLOCK (&serv_ctx->fdtable_lock);
- if (todel) {
- this = THIS;
- server_connection_destroy (this, todel);
- conn = NULL;
- }
- return conn;
-}
+ if (client->bound_xl == NULL)
+ goto out;
-server_connection_t*
-server_conn_ref (server_connection_t *conn)
-{
- pthread_mutex_lock (&conn->lock);
- {
- conn->ref++;
+ if (flags & INTERNAL_LOCKS) {
+ cd_ret = gf_client_disconnect (client);
}
- pthread_mutex_unlock (&conn->lock);
-
- return conn;
-}
-
-server_connection_t *
-server_connection_get (xlator_t *this, const char *id)
-{
- server_connection_t *conn = NULL;
- server_connection_t *trav = NULL;
- server_conf_t *conf = NULL;
-
- GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", id, out);
-
- conf = this->private;
- pthread_mutex_lock (&conf->mutex);
- {
- list_for_each_entry (trav, &conf->conns, list) {
- if (!strcmp (trav->id, id)) {
- conn = trav;
- conn->bind_ref++;
- goto unlock;
- }
- }
+ if (fdentries != NULL)
+ ret = do_fd_cleanup (this, client, fdentries, fd_count);
+ else
+ gf_log (this->name, GF_LOG_INFO, "no fdentries to clean");
- conn = (void *) GF_CALLOC (1, sizeof (*conn),
- gf_server_mt_conn_t);
- if (!conn)
- goto unlock;
-
- conn->id = gf_strdup (id);
- /*'0' denotes uninitialised lock state*/
- conn->lk_version = 0;
- conn->fdtable = gf_fd_fdtable_alloc ();
- conn->ltable = gf_lock_table_new ();
- conn->this = this;
- conn->bind_ref = 1;
- conn->ref = 1;//when bind_ref becomes 0 it calls conn_unref
- pthread_mutex_init (&conn->lock, NULL);
- list_add (&conn->list, &conf->conns);
+ if (cd_ret || ret)
+ ret = -1;
- }
-unlock:
- pthread_mutex_unlock (&conf->mutex);
out:
- return conn;
+ return ret;
}
-server_connection_t*
-server_connection_put (xlator_t *this, server_connection_t *conn,
- gf_boolean_t *detached)
-{
- server_conf_t *conf = NULL;
- gf_boolean_t unref = _gf_false;
-
- if (detached)
- *detached = _gf_false;
- conf = this->private;
- pthread_mutex_lock (&conf->mutex);
- {
- conn->bind_ref--;
- if (!conn->bind_ref) {
- list_del_init (&conn->list);
- unref = _gf_true;
- }
- }
- pthread_mutex_unlock (&conf->mutex);
- if (unref) {
- gf_log (this->name, GF_LOG_INFO, "Shutting down connection %s",
- conn->id);
- if (detached)
- *detached = _gf_true;
- server_conn_unref (conn);
- conn = NULL;
- }
- return conn;
-}
static call_frame_t *
server_alloc_frame (rpcsvc_request_t *req)
{
- call_frame_t *frame = NULL;
- server_state_t *state = NULL;
- server_connection_t *conn = NULL;
+ call_frame_t *frame = NULL;
+ server_state_t *state = NULL;
+ client_t *client = NULL;
GF_VALIDATE_OR_GOTO ("server", req, out);
GF_VALIDATE_OR_GOTO ("server", req->trans, out);
GF_VALIDATE_OR_GOTO ("server", req->svc, out);
GF_VALIDATE_OR_GOTO ("server", req->svc->ctx, out);
- conn = (server_connection_t *)req->trans->xl_private;
- GF_VALIDATE_OR_GOTO ("server", conn, out);
+ client = req->trans->xl_private;
+ GF_VALIDATE_OR_GOTO ("server", client, out);
- frame = create_frame (conn->this, req->svc->ctx->pool);
+ frame = create_frame (client->this, req->svc->ctx->pool);
if (!frame)
goto out;
@@ -764,45 +281,102 @@ server_alloc_frame (rpcsvc_request_t *req)
if (!state)
goto out;
- if (conn->bound_xl)
- state->itable = conn->bound_xl->itable;
+ if (client->bound_xl)
+ state->itable = client->bound_xl->itable;
state->xprt = rpc_transport_ref (req->trans);
- state->conn = conn;
-
state->resolve.fd_no = -1;
state->resolve2.fd_no = -1;
+ frame->root->client = client;
frame->root->state = state; /* which socket */
frame->root->unique = 0; /* which call */
- frame->this = conn->this;
+ frame->this = client->this;
out:
return frame;
}
-
call_frame_t *
get_frame_from_request (rpcsvc_request_t *req)
{
- call_frame_t *frame = NULL;
+ call_frame_t *frame = NULL;
+ client_t *client = NULL;
+ client_t *tmp_client = NULL;
+ xlator_t *this = NULL;
+ server_conf_t *priv = NULL;
+ clienttable_t *clienttable = NULL;
+ unsigned int i = 0;
GF_VALIDATE_OR_GOTO ("server", req, out);
+ client = req->trans->xl_private;
+
frame = server_alloc_frame (req);
if (!frame)
goto out;
frame->root->op = req->procnum;
- frame->root->type = req->type;
frame->root->unique = req->xid;
+ client = req->trans->xl_private;
+ this = req->trans->xl;
+ priv = this->private;
+ clienttable = this->ctx->clienttable;
+
+ for (i = 0; i < clienttable->max_clients; i++) {
+ tmp_client = clienttable->cliententries[i].client;
+ if (client == tmp_client) {
+ /* for non trusted clients username and password
+ would not have been set. So for non trusted clients
+ (i.e clients not from the same machine as the brick,
+ and clients from outside the storage pool)
+ do the root-squashing.
+ TODO: If any client within the storage pool (i.e
+ mounting within a machine from the pool but using
+ other machine's ip/hostname from the same pool)
+ is present treat it as a trusted client
+ */
+ if (!client->auth.username && req->pid != NFS_PID)
+ RPC_AUTH_ROOT_SQUASH (req);
+
+ /* Problem: If we just check whether the client is
+ trusted client and do not do root squashing for
+ them, then for smb clients and UFO clients root
+ squashing will never happen as they use the fuse
+ mounts done within the trusted pool (i.e they are
+ trusted clients).
+ Solution: To fix it, do root squashing for trusted
+ clients also. If one wants to have a client within
+ the storage pool for which root-squashing does not
+ happen, then the client has to be mounted with
+ --no-root-squash option. But for defrag client and
+ gsyncd client do not do root-squashing.
+ */
+ if (client->auth.username &&
+ req->pid != GF_CLIENT_PID_NO_ROOT_SQUASH &&
+ req->pid != GF_CLIENT_PID_GSYNCD &&
+ req->pid != GF_CLIENT_PID_DEFRAG)
+ RPC_AUTH_ROOT_SQUASH (req);
+
+ /* For nfs clients the server processes will be running
+ within the trusted storage pool machines. So if we
+ do not do root-squashing for nfs servers, thinking
+ that its a trusted client, then root-squashing wont
+ work for nfs clients.
+ */
+ if (req->pid == NFS_PID)
+ RPC_AUTH_ROOT_SQUASH (req);
+ }
+ }
+
frame->root->uid = req->uid;
frame->root->gid = req->gid;
frame->root->pid = req->pid;
- frame->root->trans = server_conn_ref (req->trans->xl_private);
+ gf_client_ref (client);
+ frame->root->client = client;
frame->root->lk_owner = req->lk_owner;
server_decode_groups (frame, req);
@@ -887,84 +461,6 @@ out:
return ret;
}
-void
-put_server_conn_state (xlator_t *this, rpc_transport_t *xprt)
-{
- GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", xprt, out);
-
- xprt->xl_private = NULL;
-out:
- return;
-}
-
-server_connection_t *
-get_server_conn_state (xlator_t *this, rpc_transport_t *xprt)
-{
- GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", xprt, out);
-
- return (server_connection_t *)xprt->xl_private;
-out:
- return NULL;
-}
-
-server_connection_t *
-create_server_conn_state (xlator_t *this, rpc_transport_t *xprt)
-{
- server_connection_t *conn = NULL;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO ("server", this, out);
- GF_VALIDATE_OR_GOTO ("server", xprt, out);
-
- conn = GF_CALLOC (1, sizeof (*conn), gf_server_mt_conn_t);
- if (!conn)
- goto out;
-
- pthread_mutex_init (&conn->lock, NULL);
-
- conn->fdtable = gf_fd_fdtable_alloc ();
- if (!conn->fdtable)
- goto out;
-
- conn->ltable = gf_lock_table_new ();
- if (!conn->ltable)
- goto out;
-
- conn->this = this;
-
- xprt->xl_private = conn;
-
- ret = 0;
-out:
- if (ret)
- destroy_server_conn_state (conn);
-
- return conn;
-}
-
-void
-destroy_server_conn_state (server_connection_t *conn)
-{
- GF_VALIDATE_OR_GOTO ("server", conn, out);
-
- if (conn->ltable) {
- /* TODO */
- //FREE (conn->ltable);
- ;
- }
-
- if (conn->fdtable)
- gf_fd_fdtable_destroy (conn->fdtable);
-
- pthread_mutex_destroy (&conn->lock);
-
- GF_FREE (conn);
-out:
- return;
-}
-
void
print_caller (char *str, int size, call_frame_t *frame)
@@ -1091,12 +587,15 @@ server_print_params (char *str, int size, server_state_t *state)
filled += snprintf (str + filled, size - filled,
"volume=%s,", state->volume);
+/* FIXME
snprintf (str + filled, size - filled,
- "bound_xl=%s}", state->conn->bound_xl->name);
+ "bound_xl=%s}", state->client->bound_xl->name);
+*/
out:
return;
}
+
int
server_resolve_is_empty (server_resolve_t *resolve)
{
@@ -1112,6 +611,7 @@ server_resolve_is_empty (server_resolve_t *resolve)
return 1;
}
+
void
server_print_reply (call_frame_t *frame, int op_ret, int op_errno)
{
@@ -1157,16 +657,16 @@ out:
void
server_print_request (call_frame_t *frame)
{
- server_conf_t *conf = NULL;
- xlator_t *this = NULL;
+ server_conf_t *conf = NULL;
+ xlator_t *this = NULL;
server_state_t *state = NULL;
+ char *op = "UNKNOWN";
char resolve_vars[256];
char resolve2_vars[256];
char loc_vars[256];
char loc2_vars[256];
char other_vars[512];
char caller[512];
- char *op = "UNKNOWN";
GF_VALIDATE_OR_GOTO ("server", frame, out);
@@ -1217,13 +717,14 @@ out:
return;
}
+
int
serialize_rsp_direntp (gf_dirent_t *entries, gfs3_readdirp_rsp *rsp)
{
gf_dirent_t *entry = NULL;
- gfs3_dirplist *trav = NULL;
- gfs3_dirplist *prev = NULL;
- int ret = -1;
+ gfs3_dirplist *trav = NULL;
+ gfs3_dirplist *prev = NULL;
+ int ret = -1;
GF_VALIDATE_OR_GOTO ("server", entries, out);
GF_VALIDATE_OR_GOTO ("server", rsp, out);
@@ -1244,7 +745,7 @@ serialize_rsp_direntp (gf_dirent_t *entries, gfs3_readdirp_rsp *rsp)
/* if 'dict' is present, pack it */
if (entry->dict) {
trav->dict.dict_len = dict_serialized_length (entry->dict);
- if (trav->dict.dict_len < 0) {
+ if (trav->dict.dict_len > UINT_MAX) {
gf_log (THIS->name, GF_LOG_ERROR,
"failed to get serialized length "
"of reply dict");
@@ -1291,10 +792,10 @@ out:
int
serialize_rsp_dirent (gf_dirent_t *entries, gfs3_readdir_rsp *rsp)
{
- gf_dirent_t *entry = NULL;
- gfs3_dirlist *trav = NULL;
- gfs3_dirlist *prev = NULL;
- int ret = -1;
+ gf_dirent_t *entry = NULL;
+ gfs3_dirlist *trav = NULL;
+ gfs3_dirlist *prev = NULL;
+ int ret = -1;
GF_VALIDATE_OR_GOTO ("server", entries, out);
GF_VALIDATE_OR_GOTO ("server", rsp, out);
@@ -1321,11 +822,12 @@ out:
return ret;
}
+
int
readdir_rsp_cleanup (gfs3_readdir_rsp *rsp)
{
- gfs3_dirlist *prev = NULL;
- gfs3_dirlist *trav = NULL;
+ gfs3_dirlist *prev = NULL;
+ gfs3_dirlist *trav = NULL;
trav = rsp->reply;
prev = trav;
@@ -1338,6 +840,7 @@ readdir_rsp_cleanup (gfs3_readdir_rsp *rsp)
return 0;
}
+
int
readdirp_rsp_cleanup (gfs3_readdirp_rsp *rsp)
{
@@ -1356,6 +859,7 @@ readdirp_rsp_cleanup (gfs3_readdirp_rsp *rsp)
return 0;
}
+
int
gf_server_check_getxattr_cmd (call_frame_t *frame, const char *key)
{
@@ -1384,29 +888,22 @@ gf_server_check_getxattr_cmd (call_frame_t *frame, const char *key)
return 0;
}
+
int
gf_server_check_setxattr_cmd (call_frame_t *frame, dict_t *dict)
{
- server_conf_t *conf = NULL;
- rpc_transport_t *xprt = NULL;
- uint64_t total_read = 0;
+ server_conf_t *conf = NULL;
+ rpc_transport_t *xprt = NULL;
+ uint64_t total_read = 0;
uint64_t total_write = 0;
conf = frame->this->private;
if (!conf || !dict)
return 0;
- /* this exact key is used in 'io-stats' too.
- * But this is better place for this information dump.
- */
- int _handle_keyvalue_pair (dict_t *d, char *k,
- data_t *v, void *tmp)
- {
- return 0;
- }
if (dict_foreach_fnmatch (dict, "*io*stat*dump",
- _handle_keyvalue_pair, NULL ) > 0) {
+ dict_null_foreach_fn, NULL ) > 0) {
list_for_each_entry (xprt, &conf->xprt_list, list) {
total_read += xprt->total_bytes_read;
total_write += xprt->total_bytes_write;
@@ -1419,32 +916,504 @@ gf_server_check_setxattr_cmd (call_frame_t *frame, dict_t *dict)
return 0;
}
+
gf_boolean_t
-server_cancel_conn_timer (xlator_t *this, server_connection_t *conn)
+server_cancel_grace_timer (xlator_t *this, client_t *client)
{
- gf_timer_t *timer = NULL;
- gf_boolean_t cancelled = _gf_false;
+ server_ctx_t *serv_ctx = NULL;
+ gf_timer_t *timer = NULL;
+ gf_boolean_t cancelled = _gf_false;
- if (!this || !conn) {
- gf_log (THIS->name, GF_LOG_ERROR, "Invalid arguments to "
- "cancel connection timer");
+ if (!this || !client) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "Invalid arguments to cancel connection timer");
return cancelled;
}
- pthread_mutex_lock (&conn->lock);
- {
- if (!conn->timer)
- goto unlock;
+ serv_ctx = server_ctx_get (client, client->this);
- timer = conn->timer;
- conn->timer = NULL;
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto out;
}
-unlock:
- pthread_mutex_unlock (&conn->lock);
+
+ LOCK (&serv_ctx->fdtable_lock);
+ {
+ if (serv_ctx->grace_timer) {
+ timer = serv_ctx->grace_timer;
+ serv_ctx->grace_timer = NULL;
+ }
+ }
+ UNLOCK (&serv_ctx->fdtable_lock);
if (timer) {
gf_timer_call_cancel (this->ctx, timer);
cancelled = _gf_true;
}
+out:
return cancelled;
}
+
+server_ctx_t*
+server_ctx_get (client_t *client, xlator_t *xlator)
+{
+ void *tmp = NULL;
+ server_ctx_t *ctx = NULL;
+
+ client_ctx_get (client, xlator, &tmp);
+
+ ctx = tmp;
+
+ if (ctx != NULL)
+ goto out;
+
+ ctx = GF_CALLOC (1, sizeof (server_ctx_t), gf_server_mt_server_conf_t);
+
+ if (ctx == NULL)
+ goto out;
+
+ /* ctx->lk_version = 0; redundant */
+ ctx->fdtable = gf_fd_fdtable_alloc ();
+
+ if (ctx->fdtable == NULL) {
+ GF_FREE (ctx);
+ ctx = NULL;
+ goto out;
+ }
+
+ LOCK_INIT (&ctx->fdtable_lock);
+
+ if (client_ctx_set (client, xlator, ctx) != 0) {
+ LOCK_DESTROY (&ctx->fdtable_lock);
+ GF_FREE (ctx);
+ ctx = NULL;
+ }
+
+out:
+ return ctx;
+}
+
+int
+auth_set_username_passwd (dict_t *input_params, dict_t *config_params,
+ client_t *client)
+{
+ int ret = 0;
+ data_t *allow_user = NULL;
+ data_t *passwd_data = NULL;
+ char *username = NULL;
+ char *password = NULL;
+ char *brick_name = NULL;
+ char *searchstr = NULL;
+ char *username_str = NULL;
+ char *tmp = NULL;
+ char *username_cpy = NULL;
+
+ ret = dict_get_str (input_params, "username", &username);
+ if (ret) {
+ gf_log ("auth/login", GF_LOG_DEBUG,
+ "username not found, returning DONT-CARE");
+ /* For non trusted clients username and password
+ will not be there. So dont reject the client.
+ */
+ ret = 0;
+ goto out;
+ }
+
+ ret = dict_get_str (input_params, "password", &password);
+ if (ret) {
+ gf_log ("auth/login", GF_LOG_WARNING,
+ "password not found, returning DONT-CARE");
+ goto out;
+ }
+
+ ret = dict_get_str (input_params, "remote-subvolume", &brick_name);
+ if (ret) {
+ gf_log ("auth/login", GF_LOG_ERROR,
+ "remote-subvolume not specified");
+ ret = -1;
+ goto out;
+ }
+
+ ret = gf_asprintf (&searchstr, "auth.login.%s.allow", brick_name);
+ if (-1 == ret) {
+ ret = 0;
+ goto out;
+ }
+
+ allow_user = dict_get (config_params, searchstr);
+ GF_FREE (searchstr);
+
+ if (allow_user) {
+ username_cpy = gf_strdup (allow_user->data);
+ if (!username_cpy)
+ goto out;
+
+ username_str = strtok_r (username_cpy, " ,", &tmp);
+
+ while (username_str) {
+ if (!fnmatch (username_str, username, 0)) {
+ ret = gf_asprintf (&searchstr,
+ "auth.login.%s.password",
+ username);
+ if (-1 == ret)
+ goto out;
+
+ passwd_data = dict_get (config_params,
+ searchstr);
+ GF_FREE (searchstr);
+
+ if (!passwd_data) {
+ gf_log ("auth/login", GF_LOG_ERROR,
+ "wrong username/password "
+ "combination");
+ ret = -1;
+ goto out;
+ }
+
+ ret = !((strcmp (data_to_str (passwd_data),
+ password))?0: -1);
+ if (!ret) {
+ client->auth.username =
+ gf_strdup (username);
+ client->auth.passwd =
+ gf_strdup (password);
+ }
+ if (ret == -1)
+ gf_log ("auth/login", GF_LOG_ERROR,
+ "wrong password for user %s",
+ username);
+ break;
+ }
+ username_str = strtok_r (NULL, " ,", &tmp);
+ }
+ }
+
+out:
+ GF_FREE (username_cpy);
+
+ return ret;
+}
+
+int32_t
+gf_barrier_transmit (server_conf_t *conf, gf_barrier_payload_t *payload)
+{
+ gf_barrier_t *barrier = NULL;
+ int32_t ret = -1;
+ client_t *client = NULL;
+ gf_boolean_t lk_heal = _gf_false;
+ call_frame_t *frame = NULL;
+ server_state_t *state = NULL;
+
+ GF_VALIDATE_OR_GOTO ("barrier", conf, out);
+ GF_VALIDATE_OR_GOTO ("barrier", conf->barrier, out);
+ GF_VALIDATE_OR_GOTO ("barrier", payload, out);
+
+ barrier = conf->barrier;
+
+ frame = payload->frame;
+ if (frame) {
+ state = CALL_STATE (frame);
+ frame->local = NULL;
+ client = frame->root->client;
+ }
+ /* currently lk fops are not barrier'ed. This is reflecting code in
+ * server_submit_reply */
+ if (client)
+ lk_heal = ((server_conf_t *) client->this->private)->lk_heal;
+
+ ret = rpcsvc_submit_generic (payload->req, &payload->rsp, 1,
+ payload->payload, payload->payload_count,
+ payload->iobref);
+ iobuf_unref (payload->iob);
+ if (ret == -1) {
+ gf_log_callingfn ("", GF_LOG_ERROR, "Reply submission failed");
+ if (frame && client && !lk_heal) {
+ server_connection_cleanup (frame->this, client,
+ INTERNAL_LOCKS | POSIX_LOCKS);
+ } else {
+ /* TODO: Failure of open(dir), create, inodelk, entrylk
+ or lk fops send failure must be handled specially. */
+ }
+ goto ret;
+ }
+
+ ret = 0;
+ret:
+ if (state) {
+ free_state (state);
+ }
+
+ if (frame) {
+ gf_client_unref (client);
+ STACK_DESTROY (frame->root);
+ }
+
+ if (payload->free_iobref) {
+ iobref_unref (payload->iobref);
+ }
+out:
+ return ret;
+}
+
+gf_barrier_payload_t *
+gf_barrier_dequeue (gf_barrier_t *barrier)
+{
+ gf_barrier_payload_t *payload = NULL;
+
+ if (!barrier || list_empty (&barrier->queue))
+ return NULL;
+
+ payload = list_entry (barrier->queue.next,
+ gf_barrier_payload_t, list);
+ if (payload) {
+ list_del_init (&payload->list);
+ barrier->cur_size--;
+ }
+
+ return payload;
+}
+
+
+void*
+gf_barrier_dequeue_start (void *data)
+{
+ server_conf_t *conf = NULL;
+ gf_barrier_t *barrier = NULL;
+ gf_barrier_payload_t *payload = NULL;
+
+ conf = (server_conf_t *)data;
+ if (!conf || !conf->barrier)
+ return NULL;
+ barrier = conf->barrier;
+
+ LOCK (&barrier->lock);
+ {
+ while (barrier->cur_size) {
+ payload = gf_barrier_dequeue (barrier);
+ if (payload) {
+ if (gf_barrier_transmit (conf, payload)) {
+ gf_log ("server", GF_LOG_WARNING,
+ "Failed to transmit");
+ }
+ GF_FREE (payload);
+ }
+ }
+ }
+ UNLOCK (&barrier->lock);
+ return NULL;
+}
+
+void
+gf_barrier_timeout (void *data)
+{
+ server_conf_t *conf = NULL;
+ gf_barrier_t *barrier = NULL;
+ gf_boolean_t need_dequeue = _gf_false;
+
+ conf = (server_conf_t *)data;
+ if (!conf || !conf->barrier)
+ goto out;
+ barrier = conf->barrier;
+
+ gf_log ("", GF_LOG_INFO, "barrier timed-out");
+ LOCK (&barrier->lock);
+ {
+ need_dequeue = barrier->on;
+ barrier->on = _gf_false;
+ }
+ UNLOCK (&barrier->lock);
+
+ if (need_dequeue == _gf_true)
+ gf_barrier_dequeue_start (data);
+out:
+ return;
+}
+
+
+int32_t
+gf_barrier_start (xlator_t *this)
+{
+ server_conf_t *conf = NULL;
+ gf_barrier_t *barrier = NULL;
+ int32_t ret = -1;
+ struct timespec time = {0,};
+
+ conf = this->private;
+
+ GF_VALIDATE_OR_GOTO ("server", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, conf, out);
+ GF_VALIDATE_OR_GOTO (this->name, conf->barrier, out);
+
+ barrier = conf->barrier;
+
+ gf_log (this->name, GF_LOG_INFO, "barrier start called");
+ LOCK (&barrier->lock);
+ {
+ /* if barrier is on, reset timer */
+ if (barrier->on == _gf_true) {
+ ret = gf_timer_call_cancel (this->ctx, barrier->timer);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to "
+ "unset timer, failing barrier start");
+ goto unlock;
+ }
+ }
+
+ barrier->on = _gf_true;
+ time.tv_sec = barrier->time_out;
+ time.tv_nsec = 0;
+
+ barrier->timer = gf_timer_call_after (this->ctx, time,
+ gf_barrier_timeout,
+ (void *)conf);
+ if (!barrier->timer) {
+ gf_log (this->name, GF_LOG_ERROR, "Failed to set "
+ "timer, failing barrier start");
+ barrier->on = _gf_false;
+ }
+ }
+unlock:
+ UNLOCK (&barrier->lock);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int32_t
+gf_barrier_stop (xlator_t *this)
+{
+ server_conf_t *conf = NULL;
+ gf_barrier_t *barrier = NULL;
+ int32_t ret = -1;
+ gf_boolean_t need_dequeue = _gf_false;
+
+ conf = this->private;
+
+ GF_VALIDATE_OR_GOTO ("server", this, out);
+ GF_VALIDATE_OR_GOTO (this->name, conf, out);
+ GF_VALIDATE_OR_GOTO (this->name, conf->barrier, out);
+
+ barrier = conf->barrier;
+
+ gf_log (this->name, GF_LOG_INFO, "barrier stop called");
+ LOCK (&barrier->lock);
+ {
+ need_dequeue = barrier->on;
+ barrier->on = _gf_false;
+ }
+ UNLOCK (&barrier->lock);
+
+ if (need_dequeue == _gf_true) {
+ gf_timer_call_cancel (this->ctx, barrier->timer);
+ ret = gf_thread_create (&conf->barrier_th, NULL,
+ gf_barrier_dequeue_start,
+ conf);
+ if (ret) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "Failed to start un-barriering");
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+int32_t
+gf_barrier_fops_configure (xlator_t *this, gf_barrier_t *barrier, char *str)
+{
+ int32_t ret = -1;
+ char *dup_str = NULL;
+ char *str_tok = NULL;
+ char *save_ptr = NULL;
+ uint64_t fops = 0;
+
+ /* by defaul fsync & flush needs to be barriered */
+
+ fops |= 1 << GFS3_OP_FSYNC;
+ fops |= 1 << GFS3_OP_FLUSH;
+
+ if (!str)
+ goto done;
+
+ dup_str = gf_strdup (str);
+ if (!dup_str)
+ goto done;
+
+ str_tok = strtok_r (dup_str, ",", &save_ptr);
+ if (!str_tok)
+ goto done;
+
+ fops = 0;
+ while (str_tok) {
+ if (!strcmp(str_tok, "writev")) {
+ fops |= ((uint64_t)1 << GFS3_OP_WRITE);
+ } else if (!strcmp(str_tok, "fsync")) {
+ fops |= ((uint64_t)1 << GFS3_OP_FSYNC);
+ } else if (!strcmp(str_tok, "read")) {
+ fops |= ((uint64_t)1 << GFS3_OP_READ);
+ } else if (!strcmp(str_tok, "rename")) {
+ fops |= ((uint64_t)1 << GFS3_OP_RENAME);
+ } else if (!strcmp(str_tok, "flush")) {
+ fops |= ((uint64_t)1 << GFS3_OP_FLUSH);
+ } else if (!strcmp(str_tok, "ftruncate")) {
+ fops |= ((uint64_t)1 << GFS3_OP_FTRUNCATE);
+ } else if (!strcmp(str_tok, "fallocate")) {
+ fops |= ((uint64_t)1 << GFS3_OP_FALLOCATE);
+ } else if (!strcmp(str_tok, "rmdir")) {
+ fops |= ((uint64_t)1 << GFS3_OP_RMDIR);
+ } else {
+ gf_log ("barrier", GF_LOG_ERROR,
+ "Invalid barrier fop %s", str_tok);
+ }
+
+ str_tok = strtok_r (NULL, ",", &save_ptr);
+ }
+done:
+ LOCK (&barrier->lock);
+ {
+ barrier->fops = fops;
+ }
+ UNLOCK (&barrier->lock);
+ ret = 0;
+
+ GF_FREE (dup_str);
+ return ret;
+}
+
+void
+gf_barrier_enqueue (gf_barrier_t *barrier, gf_barrier_payload_t *payload)
+{
+ list_add_tail (&payload->list, &barrier->queue);
+ barrier->cur_size++;
+}
+
+gf_barrier_payload_t *
+gf_barrier_payload (rpcsvc_request_t *req, struct iovec *rsp,
+ call_frame_t *frame, struct iovec *payload_orig,
+ int payloadcount, struct iobref *iobref,
+ struct iobuf *iob, gf_boolean_t free_iobref)
+{
+ gf_barrier_payload_t *payload = NULL;
+
+ if (!rsp)
+ return NULL;
+
+ payload = GF_CALLOC (1, sizeof (*payload),1);
+ if (!payload)
+ return NULL;
+
+ INIT_LIST_HEAD (&payload->list);
+
+ payload->req = req;
+ memcpy (&payload->rsp, rsp, sizeof (struct iovec));
+ payload->frame = frame;
+ payload->payload = payload_orig;
+ payload->payload_count = payloadcount;
+ payload->iobref = iobref;
+ payload->iob = iob;
+ payload->free_iobref = free_iobref;
+
+ return payload;
+}
diff --git a/xlators/protocol/server/src/server-helpers.h b/xlators/protocol/server/src/server-helpers.h
index 5ce5e36ac..486048b8a 100644
--- a/xlators/protocol/server/src/server-helpers.h
+++ b/xlators/protocol/server/src/server-helpers.h
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2010-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _SERVER_HELPERS_H
@@ -24,13 +15,8 @@
#define CALL_STATE(frame) ((server_state_t *)frame->root->state)
-#define BOUND_XL(frame) ((xlator_t *) CALL_STATE(frame)->conn->bound_xl)
-
#define XPRT_FROM_FRAME(frame) ((rpc_transport_t *) CALL_STATE(frame)->xprt)
-#define SERVER_CONNECTION(frame) \
- ((server_connection_t *) CALL_STATE(frame)->conn)
-
#define SERVER_CONF(frame) \
((server_conf_t *)XPRT_FROM_FRAME(frame)->this->private)
@@ -43,45 +29,26 @@
#define IS_NOT_ROOT(pathlen) ((pathlen > 2)? 1 : 0)
+#define is_fop_barriered(fops, procnum) (fops & ((uint64_t)1 << procnum))
+
+#define barrier_add_to_queue(barrier) (barrier->on || barrier->cur_size)
+
void free_state (server_state_t *state);
void server_loc_wipe (loc_t *loc);
-int32_t
-gf_add_locker (server_connection_t *conn, const char *volume,
- loc_t *loc,
- fd_t *fd,
- pid_t pid,
- gf_lkowner_t *owner,
- glusterfs_fop_t type);
-
-int32_t
-gf_del_locker (server_connection_t *conn, const char *volume,
- loc_t *loc,
- fd_t *fd,
- gf_lkowner_t *owner,
- glusterfs_fop_t type);
-
void
server_print_request (call_frame_t *frame);
call_frame_t *
get_frame_from_request (rpcsvc_request_t *req);
-gf_boolean_t
-server_cancel_conn_timer (xlator_t *this, server_connection_t *conn);
-
-void
-put_server_conn_state (xlator_t *this, rpc_transport_t *xprt);
-
-server_connection_t *
-get_server_conn_state (xlator_t *this, rpc_transport_t *xptr);
-
-server_connection_t *
-create_server_conn_state (xlator_t *this, rpc_transport_t *xptr);
+int
+server_connection_cleanup (xlator_t *this, struct _client_t *client,
+ int32_t flags);
-void
-destroy_server_conn_state (server_connection_t *conn);
+gf_boolean_t
+server_cancel_grace_timer (xlator_t *this, struct _client_t *client);
int
server_build_config (xlator_t *this, server_conf_t *conf);
@@ -90,5 +57,20 @@ int serialize_rsp_dirent (gf_dirent_t *entries, gfs3_readdir_rsp *rsp);
int serialize_rsp_direntp (gf_dirent_t *entries, gfs3_readdirp_rsp *rsp);
int readdirp_rsp_cleanup (gfs3_readdirp_rsp *rsp);
int readdir_rsp_cleanup (gfs3_readdir_rsp *rsp);
+int auth_set_username_passwd (dict_t *input_params, dict_t *config_params,
+ struct _client_t *client);
+
+server_ctx_t *server_ctx_get (client_t *client, xlator_t *xlator);
+
+int32_t gf_barrier_start (xlator_t *this);
+int32_t gf_barrier_stop (xlator_t *this);
+int32_t gf_barrier_fops_configure (xlator_t *this, gf_barrier_t *barrier,
+ char *str);
+void gf_barrier_enqueue (gf_barrier_t *barrier, gf_barrier_payload_t *stub);
+gf_barrier_payload_t *
+gf_barrier_payload (rpcsvc_request_t *req, struct iovec *rsp,
+ call_frame_t *frame, struct iovec *payload,
+ int payloadcount, struct iobref *iobref,
+ struct iobuf *iob, gf_boolean_t free_iobref);
#endif /* !_SERVER_HELPERS_H */
diff --git a/xlators/protocol/server/src/server-mem-types.h b/xlators/protocol/server/src/server-mem-types.h
index 5438ed6db..19c3466d3 100644
--- a/xlators/protocol/server/src/server-mem-types.h
+++ b/xlators/protocol/server/src/server-mem-types.h
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2010-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
diff --git a/xlators/protocol/server/src/server-resolve.c b/xlators/protocol/server/src/server-resolve.c
index 11b488187..b2bff5c53 100644
--- a/xlators/protocol/server/src/server-resolve.c
+++ b/xlators/protocol/server/src/server-resolve.c
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2010-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
@@ -156,7 +147,8 @@ resolve_gfid_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
(char **) &resolve_loc->path);
STACK_WIND (frame, resolve_gfid_entry_cbk,
- BOUND_XL (frame), BOUND_XL (frame)->fops->lookup,
+ frame->root->client->bound_xl,
+ frame->root->client->bound_xl->fops->lookup,
&resolve->resolve_loc, NULL);
return 0;
out:
@@ -188,7 +180,8 @@ resolve_gfid (call_frame_t *frame)
ret = loc_path (resolve_loc, NULL);
STACK_WIND (frame, resolve_gfid_cbk,
- BOUND_XL (frame), BOUND_XL (frame)->fops->lookup,
+ frame->root->client->bound_xl,
+ frame->root->client->bound_xl->fops->lookup,
&resolve->resolve_loc, NULL);
return 0;
}
@@ -253,7 +246,7 @@ resolve_entry_simple (call_frame_t *frame)
/* simple resolution is indecisive. need to perform
deep resolution */
resolve->op_ret = -1;
- resolve->op_errno = ENOENT;
+ resolve->op_errno = ESTALE;
ret = 1;
goto out;
}
@@ -350,7 +343,7 @@ resolve_inode_simple (call_frame_t *frame)
if (!inode) {
resolve->op_ret = -1;
- resolve->op_errno = ENOENT;
+ resolve->op_errno = ESTALE;
ret = 1;
goto out;
}
@@ -458,23 +451,34 @@ server_resolve_anonfd (call_frame_t *frame)
int
server_resolve_fd (call_frame_t *frame)
{
- server_state_t *state = NULL;
- server_resolve_t *resolve = NULL;
- server_connection_t *conn = NULL;
- uint64_t fd_no = -1;
+ server_ctx_t *serv_ctx = NULL;
+ server_state_t *state = NULL;
+ client_t *client = NULL;
+ server_resolve_t *resolve = NULL;
+ uint64_t fd_no = -1;
state = CALL_STATE (frame);
resolve = state->resolve_now;
- conn = SERVER_CONNECTION (frame);
fd_no = resolve->fd_no;
- if (fd_no == -2) {
+ if (fd_no == GF_ANON_FD_NO) {
server_resolve_anonfd (frame);
return 0;
}
- state->fd = gf_fd_fdptr_get (conn->fdtable, fd_no);
+ client = frame->root->client;
+
+ serv_ctx = server_ctx_get (client, client->this);
+
+ if (serv_ctx == NULL) {
+ gf_log ("", GF_LOG_INFO, "server_ctx_get() failed");
+ resolve->op_ret = -1;
+ resolve->op_errno = ENOMEM;
+ return 0;
+ }
+
+ state->fd = gf_fd_fdptr_get (serv_ctx->fdtable, fd_no);
if (!state->fd) {
gf_log ("", GF_LOG_INFO, "fd not found in context");
@@ -529,14 +533,12 @@ int
server_resolve_done (call_frame_t *frame)
{
server_state_t *state = NULL;
- xlator_t *bound_xl = NULL;
state = CALL_STATE (frame);
- bound_xl = BOUND_XL (frame);
server_print_request (frame);
- state->resume_fn (frame, bound_xl);
+ state->resume_fn (frame, frame->root->client->bound_xl);
return 0;
}
diff --git a/xlators/protocol/server/src/server-rpc-fops.c b/xlators/protocol/server/src/server-rpc-fops.c
index 422ed3188..70b8ab3a6 100644
--- a/xlators/protocol/server/src/server-rpc-fops.c
+++ b/xlators/protocol/server/src/server-rpc-fops.c
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2010-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -33,6 +24,11 @@
#include "xdr-nfs3.h"
+#define SERVER_REQ_SET_ERROR(req, ret) \
+ do { \
+ rpcsvc_request_seterr (req, GARBAGE_ARGS); \
+ ret = RPCSVC_ACTOR_ERROR; \
+ } while (0)
/* Callback function section */
int
@@ -40,17 +36,14 @@ server_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct statvfs *buf,
dict_t *xdata)
{
- gfs3_statfs_rsp rsp = {0,};
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
+ gfs3_statfs_rsp rsp = {0,};
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
- gf_log (this->name, GF_LOG_WARNING,
- "%"PRId64": STATFS (%s)",
+ gf_log (this->name, GF_LOG_WARNING, "%"PRId64": STATFS (%s)",
frame->root->unique, strerror (op_errno));
goto out;
}
@@ -61,7 +54,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
-
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_statfs_rsp);
@@ -76,16 +69,15 @@ server_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_t *inode, struct iatt *stbuf, dict_t *xdata,
struct iatt *postparent)
{
- rpcsvc_request_t *req = NULL;
- server_state_t *state = NULL;
- inode_t *root_inode = NULL;
- inode_t *link_inode = NULL;
- loc_t fresh_loc = {0,};
- gfs3_lookup_rsp rsp = {0,};
- uuid_t rootgfid = {0,};
+ rpcsvc_request_t *req = NULL;
+ server_state_t *state = NULL;
+ inode_t *root_inode = NULL;
+ inode_t *link_inode = NULL;
+ loc_t fresh_loc = {0,};
+ gfs3_lookup_rsp rsp = {0,};
+ uuid_t rootgfid = {0,};
- req = frame->local;
- state = CALL_STATE(frame);
+ state = CALL_STATE (frame);
if (state->is_revalidate == 1 && op_ret == -1) {
state->is_revalidate = 2;
@@ -93,8 +85,9 @@ server_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_unref (fresh_loc.inode);
fresh_loc.inode = inode_new (state->itable);
- STACK_WIND (frame, server_lookup_cbk, BOUND_XL (frame),
- BOUND_XL (frame)->fops->lookup,
+ STACK_WIND (frame, server_lookup_cbk,
+ frame->root->client->bound_xl,
+ frame->root->client->bound_xl->fops->lookup,
&fresh_loc, state->xdata);
loc_wipe (&fresh_loc);
@@ -103,7 +96,7 @@ server_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
gf_stat_from_iatt (&rsp.postparent, postparent);
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
@@ -117,7 +110,7 @@ server_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- root_inode = BOUND_XL(frame)->itable->root;
+ root_inode = frame->root->client->bound_xl->itable->root;
if (inode == root_inode) {
/* we just looked up root ("/") */
stbuf->ia_ino = 1;
@@ -146,8 +139,9 @@ out:
if (state->resolve.bname) {
gf_log (this->name, ((op_errno == ENOENT) ?
GF_LOG_TRACE : GF_LOG_INFO),
- "%"PRId64": LOOKUP %s (%s/%s) ==> (%s)",
- frame->root->unique, state->loc.path,
+ "%"PRId64": LOOKUP %s (%s/%s) ==> "
+ "(%s)", frame->root->unique,
+ state->loc.path,
uuid_utoa (state->resolve.pargfid),
state->resolve.bname,
strerror (op_errno));
@@ -161,6 +155,7 @@ out:
}
}
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_lookup_rsp);
@@ -175,21 +170,20 @@ server_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
dict_t *xdata)
{
- gfs3_lk_rsp rsp = {0,};
- rpcsvc_request_t *req = NULL;
- server_state_t *state = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_lk_rsp rsp = {0,};
+ rpcsvc_request_t *req = NULL;
+ server_state_t *state = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": LK %"PRId64" (%s) ==> (%s)",
- frame->root->unique, state->resolve.fd_no,
+ "%"PRId64": LK %"PRId64" (%s) ==> "
+ "(%s)", frame->root->unique,
+ state->resolve.fd_no,
uuid_utoa (state->resolve.gfid),
strerror (op_errno));
}
@@ -218,6 +212,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_lk_rsp);
@@ -231,21 +226,19 @@ int
server_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- server_connection_t *conn = NULL;
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
+ gf_common_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- req = frame->local;
- conn = SERVER_CONNECTION(frame);
- state = CALL_STATE(frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) {
- gf_log (this->name, GF_LOG_INFO,
+ gf_log (this->name, (op_errno == ENOENT)?
+ GF_LOG_DEBUG:GF_LOG_ERROR,
"%"PRId64": INODELK %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
uuid_utoa (state->resolve.gfid),
@@ -254,20 +247,11 @@ server_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- if (state->flock.l_type == F_UNLCK)
- gf_del_locker (conn, state->volume,
- &state->loc, NULL, &frame->root->lk_owner,
- GF_FOP_INODELK);
- else
- gf_add_locker (conn, state->volume,
- &state->loc, NULL, frame->root->pid,
- &frame->root->lk_owner,
- GF_FOP_INODELK);
-
out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -281,43 +265,32 @@ int
server_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- server_state_t *state = NULL;
- server_connection_t *conn = NULL;
- rpcsvc_request_t *req = NULL;
+ gf_common_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- req = frame->local;
- conn = SERVER_CONNECTION(frame);
- state = CALL_STATE(frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) {
gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": FINODELK %"PRId64" (%s) ==> (%s)",
- frame->root->unique, state->resolve.fd_no,
+ "%"PRId64": FINODELK %"PRId64" (%s) "
+ "==> (%s)", frame->root->unique,
+ state->resolve.fd_no,
uuid_utoa (state->resolve.gfid),
strerror (op_errno));
}
goto out;
}
- if (state->flock.l_type == F_UNLCK)
- gf_del_locker (conn, state->volume,
- NULL, state->fd,
- &frame->root->lk_owner, GF_FOP_INODELK);
- else
- gf_add_locker (conn, state->volume,
- NULL, state->fd,
- frame->root->pid,
- &frame->root->lk_owner, GF_FOP_INODELK);
-
out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -330,18 +303,15 @@ int
server_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- server_connection_t *conn = NULL;
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
- gf_common_rsp rsp = {0,};
+ gf_common_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- req = frame->local;
- conn = SERVER_CONNECTION(frame);
- state = CALL_STATE(frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) {
gf_log (this->name, GF_LOG_INFO,
@@ -353,20 +323,11 @@ server_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- if (state->cmd == ENTRYLK_UNLOCK)
- gf_del_locker (conn, state->volume,
- &state->loc, NULL, &frame->root->lk_owner,
- GF_FOP_ENTRYLK);
- else
- gf_add_locker (conn, state->volume,
- &state->loc, NULL, frame->root->pid,
- &frame->root->lk_owner,
- GF_FOP_ENTRYLK);
-
out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -380,18 +341,15 @@ int
server_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- server_connection_t *conn = NULL;
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
+ gf_common_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- req = frame->local;
- conn = SERVER_CONNECTION(frame);
- state = CALL_STATE(frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
if ((op_errno != ENOSYS) && (op_errno != EAGAIN)) {
gf_log (this->name, GF_LOG_INFO,
@@ -403,19 +361,11 @@ server_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- if (state->cmd == ENTRYLK_UNLOCK)
- gf_del_locker (conn, state->volume,
- NULL, state->fd, &frame->root->lk_owner,
- GF_FOP_ENTRYLK);
- else
- gf_add_locker (conn, state->volume,
- NULL, state->fd, frame->root->pid,
- &frame->root->lk_owner, GF_FOP_ENTRYLK);
-
out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -429,17 +379,15 @@ int
server_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- rpcsvc_request_t *req = NULL;
- server_state_t *state = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gf_common_rsp rsp = {0,};
+ rpcsvc_request_t *req = NULL;
+ server_state_t *state = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": ACCESS %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
@@ -452,6 +400,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -465,17 +414,16 @@ server_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- gfs3_rmdir_rsp rsp = {0,};
- server_state_t *state = NULL;
- inode_t *parent = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_rmdir_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ inode_t *parent = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret) {
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": RMDIR %s (%s/%s) ==> (%s)",
@@ -504,6 +452,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_rmdir_rsp);
@@ -518,17 +467,16 @@ server_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- gfs3_mkdir_rsp rsp = {0,};
- server_state_t *state = NULL;
- inode_t *link_inode = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_mkdir_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ inode_t *link_inode = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": MKDIR %s (%s/%s) ==> (%s)",
@@ -551,6 +499,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_mkdir_rsp);
@@ -565,17 +514,16 @@ server_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_t *inode, struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- gfs3_mknod_rsp rsp = {0,};
- server_state_t *state = NULL;
- inode_t *link_inode = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_mknod_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ inode_t *link_inode = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": MKNOD %s (%s/%s) ==> (%s)",
@@ -598,6 +546,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_mknod_rsp);
@@ -610,17 +559,15 @@ int
server_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gf_common_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": FSYNCDIR %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -633,6 +580,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -646,18 +594,16 @@ server_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
dict_t *xdata)
{
- gfs3_readdir_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
- int ret = 0;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_readdir_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
+ int ret = 0;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": READDIR %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -680,6 +626,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_readdir_rsp);
@@ -694,29 +641,33 @@ int
server_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- server_connection_t *conn = NULL;
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
- gfs3_opendir_rsp rsp = {0,};
- uint64_t fd_no = 0;
-
- req = frame->local;
- conn = SERVER_CONNECTION (frame);
- state = CALL_STATE (frame);
+ server_state_t *state = NULL;
+ server_ctx_t *serv_ctx = NULL;
+ rpcsvc_request_t *req = NULL;
+ gfs3_opendir_rsp rsp = {0,};
+ uint64_t fd_no = 0;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
+ state = CALL_STATE (frame);
+ gf_log (this->name, (op_errno == ENOENT)?
+ GF_LOG_DEBUG:GF_LOG_ERROR,
"%"PRId64": OPENDIR %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
uuid_utoa (state->resolve.gfid), strerror (op_errno));
goto out;
}
+ serv_ctx = server_ctx_get (frame->root->client, this);
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto out;
+ }
+
fd_bind (fd);
- fd_no = gf_fd_unused_get (conn->fdtable, fd);
+ fd_no = gf_fd_unused_get (serv_ctx->fdtable, fd);
fd_ref (fd); // on behalf of the client
out:
@@ -724,6 +675,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_opendir_rsp);
@@ -736,17 +688,15 @@ int
server_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- rpcsvc_request_t *req = NULL;
- server_state_t *state = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gf_common_rsp rsp = {0,};
+ rpcsvc_request_t *req = NULL;
+ server_state_t *state = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret == -1) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": REMOVEXATTR %s (%s) of key %s ==> (%s)",
frame->root->unique, state->loc.path,
@@ -759,6 +709,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -771,17 +722,15 @@ int
server_fremovexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- rpcsvc_request_t *req = NULL;
+ gf_common_rsp rsp = {0,};
+ rpcsvc_request_t *req = NULL;
server_state_t *state = NULL;
- req = frame->local;
- state = CALL_STATE(frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret == -1) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": FREMOVEXATTR %"PRId64" (%s) (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -794,6 +743,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -807,19 +757,18 @@ server_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict,
dict_t *xdata)
{
- gfs3_getxattr_rsp rsp = {0,};
- rpcsvc_request_t *req = NULL;
- server_state_t *state = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_getxattr_rsp rsp = {0,};
+ rpcsvc_request_t *req = NULL;
+ server_state_t *state = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret == -1) {
+ state = CALL_STATE (frame);
gf_log (this->name, (((op_errno == ENOTSUP) ||
- (op_errno == ENODATA)) ?
+ (op_errno == ENODATA) ||
+ (op_errno == ENOENT)) ?
GF_LOG_DEBUG : GF_LOG_INFO),
"%"PRId64": GETXATTR %s (%s) (%s) ==> (%s)",
frame->root->unique, state->loc.path,
@@ -828,13 +777,14 @@ server_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- GF_PROTOCOL_DICT_SERIALIZE (this, dict, (&rsp.dict.dict_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, dict, &rsp.dict.dict_val,
rsp.dict.dict_len, op_errno, out);
out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_getxattr_rsp);
@@ -851,18 +801,18 @@ server_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict,
dict_t *xdata)
{
- gfs3_fgetxattr_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_fgetxattr_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret == -1) {
- gf_log (this->name, ((op_errno == ENOTSUP) ?
+ state = CALL_STATE (frame);
+ gf_log (this->name, (((op_errno == ENOTSUP) ||
+ (op_errno == ENODATA) ||
+ (op_errno == ENOENT)) ?
GF_LOG_DEBUG : GF_LOG_INFO),
"%"PRId64": FGETXATTR %"PRId64" (%s) (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -871,7 +821,7 @@ server_fgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- GF_PROTOCOL_DICT_SERIALIZE (this, dict, (&rsp.dict.dict_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, dict, &rsp.dict.dict_val,
rsp.dict.dict_len, op_errno, out);
out:
@@ -879,6 +829,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_fgetxattr_rsp);
@@ -889,6 +840,24 @@ out:
return 0;
}
+/* print every key */
+static int
+_gf_server_log_setxattr_failure (dict_t *d, char *k, data_t *v,
+ void *tmp)
+{
+ server_state_t *state = NULL;
+ call_frame_t *frame = NULL;
+
+ frame = tmp;
+ state = CALL_STATE (frame);
+
+ gf_log (THIS->name, GF_LOG_INFO,
+ "%"PRId64": SETXATTR %s (%s) ==> %s",
+ frame->root->unique, state->loc.path,
+ uuid_utoa (state->resolve.gfid), k);
+ return 0;
+}
+
int
server_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
@@ -897,26 +866,19 @@ server_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
rpcsvc_request_t *req = NULL;
server_state_t *state = NULL;
- req = frame->local;
- state = CALL_STATE(frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret == -1) {
- /* print every key */
- int _log_setxattr_failure (dict_t *d, char *k, data_t *v,
- void *tmp)
- {
- gf_log (this->name, ((op_errno == ENOTSUP) ?
- GF_LOG_DEBUG : GF_LOG_INFO),
- "%"PRId64": SETXATTR %s (%s) ==> %s (%s)",
- frame->root->unique, state->loc.path,
- uuid_utoa (state->resolve.gfid), k,
- strerror (op_errno));
- return 0;
- }
- dict_foreach (state->dict, _log_setxattr_failure, NULL);
+ state = CALL_STATE (frame);
+ if (op_errno != ENOTSUP)
+ dict_foreach (state->dict,
+ _gf_server_log_setxattr_failure,
+ frame);
+
+ gf_log (THIS->name, ((op_errno == ENOTSUP) ?
+ GF_LOG_DEBUG : GF_LOG_INFO),
+ "%s", strerror (op_errno));
goto out;
}
@@ -924,6 +886,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -932,6 +895,24 @@ out:
return 0;
}
+/* print every key here */
+static int
+_gf_server_log_fsetxattr_failure (dict_t *d, char *k, data_t *v,
+ void *tmp)
+{
+ call_frame_t *frame = NULL;
+ server_state_t *state = NULL;
+
+ frame = tmp;
+ state = CALL_STATE (frame);
+
+ gf_log (THIS->name, GF_LOG_INFO,
+ "%"PRId64": FSETXATTR %"PRId64" (%s) ==> %s",
+ frame->root->unique, state->resolve.fd_no,
+ uuid_utoa (state->resolve.gfid), k);
+
+ return 0;
+}
int
server_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
@@ -941,26 +922,19 @@ server_fsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
rpcsvc_request_t *req = NULL;
server_state_t *state = NULL;
- req = frame->local;
- state = CALL_STATE(frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret == -1) {
- /* print every key here */
- int _log_setxattr_failure (dict_t *d, char *k, data_t *v,
- void *tmp)
- {
- gf_log (this->name, ((op_errno == ENOTSUP) ?
- GF_LOG_DEBUG : GF_LOG_INFO),
- "%"PRId64": FSETXATTR %"PRId64" (%s) ==> %s (%s)",
- frame->root->unique, state->resolve.fd_no,
- uuid_utoa (state->resolve.gfid), k,
- strerror (op_errno));
- return 0;
+ state = CALL_STATE (frame);
+ if (op_errno != ENOTSUP) {
+ dict_foreach (state->dict,
+ _gf_server_log_fsetxattr_failure,
+ frame);
}
- dict_foreach (state->dict, _log_setxattr_failure, NULL);
+ gf_log (THIS->name, ((op_errno == ENOTSUP) ?
+ GF_LOG_DEBUG : GF_LOG_INFO),
+ "%s", strerror (op_errno));
goto out;
}
@@ -968,6 +942,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -983,20 +958,19 @@ server_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *prenewparent, struct iatt *postnewparent,
dict_t *xdata)
{
- gfs3_rename_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
- inode_t *tmp_inode = NULL;
- inode_t *tmp_parent = NULL;
- char oldpar_str[50] = {0,};
- char newpar_str[50] = {0,};
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_rename_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
+ inode_t *tmp_inode = NULL;
+ inode_t *tmp_parent = NULL;
+ char oldpar_str[50] = {0,};
+ char newpar_str[50] = {0,};
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret == -1) {
uuid_utoa_r (state->resolve.gfid, oldpar_str);
uuid_utoa_r (state->resolve2.gfid, newpar_str);
@@ -1011,7 +985,7 @@ server_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
stbuf->ia_type = state->loc.inode->ia_type;
/* TODO: log gfid of the inodes */
- gf_log (state->conn->bound_xl->name, GF_LOG_TRACE,
+ gf_log (frame->root->client->bound_xl->name, GF_LOG_TRACE,
"%"PRId64": RENAME_CBK %s ==> %s",
frame->root->unique, state->loc.name, state->loc2.name);
@@ -1053,6 +1027,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_rename_rsp);
@@ -1066,17 +1041,16 @@ server_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- gfs3_unlink_rsp rsp = {0,};
- server_state_t *state = NULL;
- inode_t *parent = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_unlink_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ inode_t *parent = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret) {
gf_log (this->name, (op_errno == ENOENT)?
GF_LOG_DEBUG:GF_LOG_ERROR,
@@ -1088,7 +1062,7 @@ server_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
/* TODO: log gfid of the inodes */
- gf_log (state->conn->bound_xl->name, GF_LOG_TRACE,
+ gf_log (frame->root->client->bound_xl->name, GF_LOG_TRACE,
"%"PRId64": UNLINK_CBK %s",
frame->root->unique, state->loc.name);
@@ -1108,6 +1082,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_unlink_rsp);
@@ -1122,17 +1097,16 @@ server_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- gfs3_symlink_rsp rsp = {0,};
- server_state_t *state = NULL;
- inode_t *link_inode = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_symlink_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ inode_t *link_inode = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": SYMLINK %s (%s/%s) ==> (%s)",
@@ -1155,6 +1129,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_symlink_rsp);
@@ -1170,27 +1145,27 @@ server_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- gfs3_link_rsp rsp = {0,};
- server_state_t *state = NULL;
- inode_t *link_inode = NULL;
- rpcsvc_request_t *req = NULL;
+ gfs3_link_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ inode_t *link_inode = NULL;
+ rpcsvc_request_t *req = NULL;
char gfid_str[50] = {0,};
char newpar_str[50] = {0,};
- req = frame->local;
- state = CALL_STATE(frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret) {
uuid_utoa_r (state->resolve.gfid, gfid_str);
uuid_utoa_r (state->resolve2.pargfid, newpar_str);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": LINK %s (%s) -> %s/%s ==> (%s)",
- frame->root->unique, state->loc.path, gfid_str,
- newpar_str, state->resolve2.bname, strerror (op_errno));
+ frame->root->unique, state->loc.path,
+ gfid_str, newpar_str, state->resolve2.bname,
+ strerror (op_errno));
goto out;
}
@@ -1206,6 +1181,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_link_rsp);
@@ -1219,17 +1195,15 @@ server_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- gfs3_truncate_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_truncate_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": TRUNCATE %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
@@ -1244,6 +1218,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_truncate_rsp);
@@ -1257,17 +1232,15 @@ server_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
dict_t *xdata)
{
- gfs3_fstat_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_fstat_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": FSTAT %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1281,6 +1254,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_fstat_rsp);
@@ -1294,17 +1268,15 @@ server_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- gfs3_ftruncate_rsp rsp = {0};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_ftruncate_rsp rsp = {0};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": FTRUNCATE %"PRId64" (%s)==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1319,6 +1291,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_ftruncate_rsp);
@@ -1331,21 +1304,20 @@ int
server_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- gf_common_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gf_common_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
- "%"PRId64": FLUSH %"PRId64" (%s) ==> (%s)",
+ state = CALL_STATE (frame);
+ gf_log (this->name, (op_errno == ENOENT)?
+ GF_LOG_DEBUG:GF_LOG_ERROR,
+ "%"PRId64": FLUSH %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
- uuid_utoa (state->resolve.gfid), strerror (op_errno));
+ uuid_utoa (state->resolve.gfid), strerror (op_errno));
goto out;
}
@@ -1353,6 +1325,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -1366,17 +1339,15 @@ server_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- gfs3_fsync_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_fsync_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": FSYNC %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1391,6 +1362,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_fsync_rsp);
@@ -1404,17 +1376,15 @@ server_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- gfs3_write_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_write_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": WRITEV %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1429,6 +1399,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_write_rsp);
@@ -1444,12 +1415,9 @@ server_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iovec *vector, int32_t count,
struct iatt *stbuf, struct iobref *iobref, dict_t *xdata)
{
- gfs3_read_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_read_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
#ifdef GF_TESTING_IO_XDATA
{
@@ -1461,10 +1429,11 @@ server_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
"testing-xdata-value");
}
#endif
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": READV %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1479,6 +1448,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, vector, count, iobref,
(xdrproc_t)xdr_gfs3_read_rsp);
@@ -1493,17 +1463,15 @@ server_rchecksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
uint32_t weak_checksum, uint8_t *strong_checksum,
dict_t *xdata)
{
- gfs3_rchecksum_rsp rsp = {0,};
- rpcsvc_request_t *req = NULL;
+ gfs3_rchecksum_rsp rsp = {0,};
+ rpcsvc_request_t *req = NULL;
server_state_t *state = NULL;
- req = frame->local;
- state = CALL_STATE(frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": RCHECKSUM %"PRId64" (%s)==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1520,6 +1488,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_rchecksum_rsp);
@@ -1533,21 +1502,19 @@ int
server_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- server_connection_t *conn = NULL;
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
- uint64_t fd_no = 0;
- gfs3_open_rsp rsp = {0,};
-
- req = frame->local;
- conn = SERVER_CONNECTION (frame);
- state = CALL_STATE (frame);
+ server_state_t *state = NULL;
+ server_ctx_t *serv_ctx = NULL;
+ rpcsvc_request_t *req = NULL;
+ uint64_t fd_no = 0;
+ gfs3_open_rsp rsp = {0,};
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
- gf_log (this->name, GF_LOG_INFO,
+ state = CALL_STATE (frame);
+ gf_log (this->name, (op_errno == ENOENT)?
+ GF_LOG_DEBUG:GF_LOG_ERROR,
"%"PRId64": OPEN %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
uuid_utoa (state->resolve.gfid),
@@ -1555,8 +1522,14 @@ server_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
+ serv_ctx = server_ctx_get (frame->root->client, this);
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto out;
+ }
+
fd_bind (fd);
- fd_no = gf_fd_unused_get (conn->fdtable, fd);
+ fd_no = gf_fd_unused_get (serv_ctx->fdtable, fd);
fd_ref (fd);
rsp.fd = fd_no;
@@ -1564,6 +1537,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_open_rsp);
GF_FREE (rsp.xdata.xdata_val);
@@ -1578,20 +1552,18 @@ server_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
struct iatt *stbuf, struct iatt *preparent,
struct iatt *postparent, dict_t *xdata)
{
- server_connection_t *conn = NULL;
server_state_t *state = NULL;
+ server_ctx_t *serv_ctx = NULL;
inode_t *link_inode = NULL;
rpcsvc_request_t *req = NULL;
uint64_t fd_no = 0;
gfs3_create_rsp rsp = {0,};
- req = frame->local;
- conn = SERVER_CONNECTION (frame);
- state = CALL_STATE (frame);
-
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
+ state = CALL_STATE (frame);
+
if (op_ret < 0) {
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": CREATE %s (%s/%s) ==> (%s)",
@@ -1602,7 +1574,7 @@ server_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
}
/* TODO: log gfid too */
- gf_log (state->conn->bound_xl->name, GF_LOG_TRACE,
+ gf_log (frame->root->client->bound_xl->name, GF_LOG_TRACE,
"%"PRId64": CREATE %s (%s)",
frame->root->unique, state->loc.name,
uuid_utoa (stbuf->ia_gfid));
@@ -1629,13 +1601,18 @@ server_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
inode_lookup (link_inode);
inode_unref (link_inode);
- fd_bind (fd);
+ serv_ctx = server_ctx_get (frame->root->client, this);
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto out;
+ }
- fd_no = gf_fd_unused_get (conn->fdtable, fd);
+ fd_bind (fd);
+ fd_no = gf_fd_unused_get (serv_ctx->fdtable, fd);
fd_ref (fd);
- if ((fd_no < 0) || (fd == 0)) {
- op_ret = fd_no;
+ if ((fd_no > UINT64_MAX) || (fd == 0)) {
+ op_ret = -1;
op_errno = errno;
}
@@ -1648,6 +1625,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_create_rsp);
@@ -1661,17 +1639,15 @@ server_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, const char *buf,
struct iatt *stbuf, dict_t *xdata)
{
- gfs3_readlink_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_readlink_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": READLINK %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
@@ -1690,6 +1666,7 @@ out:
if (!rsp.path)
rsp.path = "";
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_readlink_rsp);
@@ -1703,18 +1680,17 @@ server_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
dict_t *xdata)
{
- gfs3_stat_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_stat_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
- gf_log (this->name, GF_LOG_INFO,
+ state = CALL_STATE (frame);
+ gf_log (this->name, (op_errno == ENOENT)?
+ GF_LOG_DEBUG:GF_LOG_ERROR,
"%"PRId64": STAT %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
uuid_utoa (state->resolve.gfid),
@@ -1728,6 +1704,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_stat_rsp);
@@ -1742,17 +1719,15 @@ server_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *statpre, struct iatt *statpost, dict_t *xdata)
{
- gfs3_setattr_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_setattr_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": SETATTR %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
@@ -1768,6 +1743,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_setattr_rsp);
@@ -1781,17 +1757,15 @@ server_fsetattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno,
struct iatt *statpre, struct iatt *statpost, dict_t *xdata)
{
- gfs3_fsetattr_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_fsetattr_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": FSETATTR %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1807,6 +1781,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_fsetattr_rsp);
@@ -1821,17 +1796,15 @@ server_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict,
dict_t *xdata)
{
- gfs3_xattrop_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE (frame);
+ gfs3_xattrop_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": XATTROP %s (%s) ==> (%s)",
frame->root->unique, state->loc.path,
@@ -1840,13 +1813,14 @@ server_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- GF_PROTOCOL_DICT_SERIALIZE (this, dict, (&rsp.dict.dict_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, dict, &rsp.dict.dict_val,
rsp.dict.dict_len, op_errno, out);
out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_xattrop_rsp);
@@ -1863,17 +1837,15 @@ server_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *dict,
dict_t *xdata)
{
- gfs3_xattrop_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_xattrop_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": FXATTROP %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1882,13 +1854,14 @@ server_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
goto out;
}
- GF_PROTOCOL_DICT_SERIALIZE (this, dict, (&rsp.dict.dict_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, dict, &rsp.dict.dict_val,
rsp.dict.dict_len, op_errno, out);
out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_fxattrop_rsp);
@@ -1902,20 +1875,19 @@ out:
int
server_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *entries, dict_t *xdata)
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
{
- gfs3_readdirp_rsp rsp = {0,};
- server_state_t *state = NULL;
- rpcsvc_request_t *req = NULL;
- int ret = 0;
-
- req = frame->local;
- state = CALL_STATE(frame);
+ gfs3_readdirp_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
+ int ret = 0;
- GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
rsp.xdata.xdata_len, op_errno, out);
if (op_ret < 0) {
+ state = CALL_STATE (frame);
gf_log (this->name, GF_LOG_INFO,
"%"PRId64": READDIRP %"PRId64" (%s) ==> (%s)",
frame->root->unique, state->resolve.fd_no,
@@ -1941,6 +1913,7 @@ out:
rsp.op_ret = op_ret;
rsp.op_errno = gf_errno_to_error (op_errno);
+ req = frame->local;
server_submit_reply (frame, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gfs3_readdirp_rsp);
@@ -1951,6 +1924,158 @@ out:
return 0;
}
+int
+server_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *statpre, struct iatt *statpost, dict_t *xdata)
+{
+ gfs3_fallocate_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
+ rsp.xdata.xdata_len, op_errno, out);
+
+ if (op_ret) {
+ state = CALL_STATE (frame);
+ gf_log (this->name, GF_LOG_INFO,
+ "%"PRId64": FALLOCATE %"PRId64" (%s) ==> (%s)",
+ frame->root->unique, state->resolve.fd_no,
+ uuid_utoa (state->resolve.gfid),
+ strerror (op_errno));
+ goto out;
+ }
+
+ gf_stat_from_iatt (&rsp.statpre, statpre);
+ gf_stat_from_iatt (&rsp.statpost, statpost);
+
+out:
+ rsp.op_ret = op_ret;
+ rsp.op_errno = gf_errno_to_error (op_errno);
+
+ req = frame->local;
+ server_submit_reply(frame, req, &rsp, NULL, 0, NULL,
+ (xdrproc_t) xdr_gfs3_fallocate_rsp);
+
+ GF_FREE (rsp.xdata.xdata_val);
+
+ return 0;
+}
+
+int
+server_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *statpre, struct iatt *statpost, dict_t *xdata)
+{
+ gfs3_discard_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, &rsp.xdata.xdata_val,
+ rsp.xdata.xdata_len, op_errno, out);
+
+ if (op_ret) {
+ state = CALL_STATE (frame);
+ gf_log (this->name, GF_LOG_INFO,
+ "%"PRId64": DISCARD %"PRId64" (%s) ==> (%s)",
+ frame->root->unique, state->resolve.fd_no,
+ uuid_utoa (state->resolve.gfid),
+ strerror (op_errno));
+ goto out;
+ }
+
+ gf_stat_from_iatt (&rsp.statpre, statpre);
+ gf_stat_from_iatt (&rsp.statpost, statpost);
+
+out:
+ rsp.op_ret = op_ret;
+ rsp.op_errno = gf_errno_to_error (op_errno);
+
+ req = frame->local;
+ server_submit_reply(frame, req, &rsp, NULL, 0, NULL,
+ (xdrproc_t) xdr_gfs3_discard_rsp);
+
+ GF_FREE (rsp.xdata.xdata_val);
+
+ return 0;
+}
+
+int
+server_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *statpre, struct iatt *statpost, dict_t *xdata)
+{
+ gfs3_zerofill_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
+
+ req = frame->local;
+ state = CALL_STATE (frame);
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ rsp.xdata.xdata_len, op_errno, out);
+
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_INFO,
+ "%"PRId64": ZEROFILL%"PRId64" (%s) ==> (%s)",
+ frame->root->unique, state->resolve.fd_no,
+ uuid_utoa (state->resolve.gfid),
+ strerror (op_errno));
+ goto out;
+ }
+
+ gf_stat_from_iatt (&rsp.statpre, statpre);
+ gf_stat_from_iatt (&rsp.statpost, statpost);
+
+out:
+ rsp.op_ret = op_ret;
+ rsp.op_errno = gf_errno_to_error (op_errno);
+
+ server_submit_reply(frame, req, &rsp, NULL, 0, NULL,
+ (xdrproc_t) xdr_gfs3_zerofill_rsp);
+
+ GF_FREE (rsp.xdata.xdata_val);
+
+ return 0;
+}
+
+
+int
+server_ipc_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ gf_common_rsp rsp = {0,};
+ server_state_t *state = NULL;
+ rpcsvc_request_t *req = NULL;
+
+ req = frame->local;
+ state = CALL_STATE (frame);
+
+ GF_PROTOCOL_DICT_SERIALIZE (this, xdata, (&rsp.xdata.xdata_val),
+ rsp.xdata.xdata_len, op_errno, out);
+
+ if (op_ret) {
+ gf_log (this->name, GF_LOG_INFO,
+ "%"PRId64": IPC%"PRId64" (%s) ==> (%s)",
+ frame->root->unique, state->resolve.fd_no,
+ uuid_utoa (state->resolve.gfid),
+ strerror (op_errno));
+ goto out;
+ }
+
+out:
+ rsp.op_ret = op_ret;
+ rsp.op_errno = gf_errno_to_error (op_errno);
+
+ server_submit_reply(frame, req, &rsp, NULL, 0, NULL,
+ (xdrproc_t) xdr_gf_common_rsp);
+
+ GF_FREE (rsp.xdata.xdata_val);
+
+ return 0;
+}
+
+
/* Resume function section */
int
@@ -2114,6 +2239,7 @@ err:
int
server_fentrylk_resume (call_frame_t *frame, xlator_t *bound_xl)
{
+ GF_UNUSED int ret = -1;
server_state_t *state = NULL;
state = CALL_STATE (frame);
@@ -2121,6 +2247,13 @@ server_fentrylk_resume (call_frame_t *frame, xlator_t *bound_xl)
if (state->resolve.op_ret != 0)
goto err;
+ if (!state->xdata)
+ state->xdata = dict_new ();
+
+ if (state->xdata)
+ ret = dict_set_str (state->xdata, "connection-id",
+ frame->root->client->client_uid);
+
STACK_WIND (frame, server_fentrylk_cbk, bound_xl,
bound_xl->fops->fentrylk,
state->volume, state->fd, state->name,
@@ -2137,6 +2270,7 @@ err:
int
server_entrylk_resume (call_frame_t *frame, xlator_t *bound_xl)
{
+ GF_UNUSED int ret = -1;
server_state_t *state = NULL;
state = CALL_STATE (frame);
@@ -2144,6 +2278,13 @@ server_entrylk_resume (call_frame_t *frame, xlator_t *bound_xl)
if (state->resolve.op_ret != 0)
goto err;
+ if (!state->xdata)
+ state->xdata = dict_new ();
+
+ if (state->xdata)
+ ret = dict_set_str (state->xdata, "connection-id",
+ frame->root->client->client_uid);
+
STACK_WIND (frame, server_entrylk_cbk,
bound_xl, bound_xl->fops->entrylk,
state->volume, &state->loc, state->name,
@@ -2159,13 +2300,24 @@ err:
int
server_finodelk_resume (call_frame_t *frame, xlator_t *bound_xl)
{
+ GF_UNUSED int ret = -1;
server_state_t *state = NULL;
+ gf_log (bound_xl->name, GF_LOG_DEBUG, "frame %p, xlator %p",
+ frame, bound_xl);
+
state = CALL_STATE (frame);
if (state->resolve.op_ret != 0)
goto err;
+ if (!state->xdata)
+ state->xdata = dict_new ();
+
+ if (state->xdata)
+ ret = dict_set_str (state->xdata, "connection-id",
+ frame->root->client->client_uid);
+
STACK_WIND (frame, server_finodelk_cbk, bound_xl,
bound_xl->fops->finodelk, state->volume, state->fd,
state->cmd, &state->flock, state->xdata);
@@ -2181,13 +2333,24 @@ err:
int
server_inodelk_resume (call_frame_t *frame, xlator_t *bound_xl)
{
+ GF_UNUSED int ret = -1;
server_state_t *state = NULL;
+ gf_log (bound_xl->name, GF_LOG_DEBUG, "frame %p, xlator %p",
+ frame, bound_xl);
+
state = CALL_STATE (frame);
if (state->resolve.op_ret != 0)
goto err;
+ if (!state->xdata)
+ state->xdata = dict_new ();
+
+ if (state->xdata)
+ ret = dict_set_str (state->xdata, "connection-id",
+ frame->root->client->client_uid);
+
STACK_WIND (frame, server_inodelk_cbk, bound_xl,
bound_xl->fops->inodelk, state->volume, &state->loc,
state->cmd, &state->flock, state->xdata);
@@ -2896,7 +3059,69 @@ err:
return 0;
}
+int
+server_fallocate_resume (call_frame_t *frame, xlator_t *bound_xl)
+{
+ server_state_t *state = NULL;
+
+ state = CALL_STATE (frame);
+
+ if (state->resolve.op_ret != 0)
+ goto err;
+
+ STACK_WIND (frame, server_fallocate_cbk,
+ bound_xl, bound_xl->fops->fallocate,
+ state->fd, state->flags, state->offset, state->size,
+ state->xdata);
+ return 0;
+err:
+ server_fallocate_cbk(frame, NULL, frame->this, state->resolve.op_ret,
+ state->resolve.op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+server_discard_resume (call_frame_t *frame, xlator_t *bound_xl)
+{
+ server_state_t *state = NULL;
+
+ state = CALL_STATE (frame);
+
+ if (state->resolve.op_ret != 0)
+ goto err;
+
+ STACK_WIND (frame, server_discard_cbk,
+ bound_xl, bound_xl->fops->discard,
+ state->fd, state->offset, state->size, state->xdata);
+ return 0;
+err:
+ server_discard_cbk(frame, NULL, frame->this, state->resolve.op_ret,
+ state->resolve.op_errno, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+server_zerofill_resume (call_frame_t *frame, xlator_t *bound_xl)
+{
+ server_state_t *state = NULL;
+
+ state = CALL_STATE (frame);
+
+ if (state->resolve.op_ret != 0)
+ goto err;
+
+ STACK_WIND (frame, server_zerofill_cbk,
+ bound_xl, bound_xl->fops->zerofill,
+ state->fd, state->offset, state->size, state->xdata);
+ return 0;
+err:
+ server_zerofill_cbk(frame, NULL, frame->this, state->resolve.op_ret,
+ state->resolve.op_errno, NULL, NULL, NULL);
+ return 0;
+}
/* Fop section */
@@ -2915,33 +3140,34 @@ server3_3_stat (rpcsvc_request_t *req)
/* Initialize args first, then decode */
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_stat_req)) {
+ ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_stat_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
- // something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_STAT;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
state->resolve.type = RESOLVE_MUST;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
@@ -2952,7 +3178,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -2970,24 +3196,26 @@ server3_3_setattr (rpcsvc_request_t *req)
if (!req)
return 0;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_setattr_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_setattr_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_SETATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -2997,9 +3225,10 @@ server3_3_setattr (rpcsvc_request_t *req)
gf_stat_to_iatt (&args.stbuf, &state->stbuf);
state->valid = args.valid;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3007,7 +3236,7 @@ server3_3_setattr (rpcsvc_request_t *req)
out:
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
free (args.xdata.xdata_val);
@@ -3027,24 +3256,25 @@ server3_3_fsetattr (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_fsetattr_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_fsetattr_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
- // something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FSETATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3054,13 +3284,196 @@ server3_3_fsetattr (rpcsvc_request_t *req)
gf_stat_to_iatt (&args.stbuf, &state->stbuf);
state->valid = args.valid;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
+ op_errno, out);
+
+ ret = 0;
+ resolve_and_resume (frame, server_fsetattr_resume);
+
+out:
+ free (args.xdata.xdata_val);
+
+ if (op_errno)
+ SERVER_REQ_SET_ERROR (req, ret);
+
+ return ret;
+}
+
+int
+server3_3_fallocate(rpcsvc_request_t *req)
+{
+ server_state_t *state = NULL;
+ call_frame_t *frame = NULL;
+ gfs3_fallocate_req args = {{0},};
+ int ret = -1;
+ int op_errno = 0;
+
+ if (!req)
+ return ret;
+
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_fallocate_req);
+ if (ret < 0) {
+ //failed to decode msg;
+ SERVER_REQ_SET_ERROR (req, ret);
+ goto out;
+ }
+
+ frame = get_frame_from_request (req);
+ if (!frame) {
+ // something wrong, mostly insufficient memory
+ SERVER_REQ_SET_ERROR (req, ret);
+ goto out;
+ }
+ frame->root->op = GF_FOP_FALLOCATE;
+
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
+ /* auth failure, request on subvolume without setvolume */
+ SERVER_REQ_SET_ERROR (req, ret);
+ goto out;
+ }
+
+ state->resolve.type = RESOLVE_MUST;
+ state->resolve.fd_no = args.fd;
+
+ state->flags = args.flags;
+ state->offset = args.offset;
+ state->size = args.size;
+ memcpy(state->resolve.gfid, args.gfid, 16);
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
+ op_errno, out);
+
+ ret = 0;
+ resolve_and_resume (frame, server_fallocate_resume);
+
+out:
+ free (args.xdata.xdata_val);
+
+ if (op_errno)
+ SERVER_REQ_SET_ERROR (req, ret);
+
+ return ret;
+}
+
+
+int
+server3_3_discard(rpcsvc_request_t *req)
+{
+ server_state_t *state = NULL;
+ call_frame_t *frame = NULL;
+ gfs3_discard_req args = {{0},};
+ int ret = -1;
+ int op_errno = 0;
+
+ if (!req)
+ return ret;
+
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_discard_req);
+ if (ret < 0) {
+ //failed to decode msg;
+ SERVER_REQ_SET_ERROR (req, ret);
+ goto out;
+ }
+
+ frame = get_frame_from_request (req);
+ if (!frame) {
+ // something wrong, mostly insufficient memory
+ SERVER_REQ_SET_ERROR (req, ret);
+ goto out;
+ }
+ frame->root->op = GF_FOP_DISCARD;
+
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
+ /* auth failure, request on subvolume without setvolume */
+ SERVER_REQ_SET_ERROR (req, ret);
+ goto out;
+ }
+
+ state->resolve.type = RESOLVE_MUST;
+ state->resolve.fd_no = args.fd;
+
+ state->offset = args.offset;
+ state->size = args.size;
+ memcpy(state->resolve.gfid, args.gfid, 16);
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
+ op_errno, out);
+
+ ret = 0;
+ resolve_and_resume (frame, server_discard_resume);
+
+out:
+ free (args.xdata.xdata_val);
+
+ if (op_errno)
+ SERVER_REQ_SET_ERROR (req, ret);
+
+ return ret;
+}
+
+
+int
+server3_3_zerofill(rpcsvc_request_t *req)
+{
+ server_state_t *state = NULL;
+ call_frame_t *frame = NULL;
+ gfs3_zerofill_req args = {{0},};
+ int ret = -1;
+ int op_errno = 0;
+
+ if (!req)
+ return ret;
+
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_zerofill_req);
+ if (ret < 0) {
+ /*failed to decode msg*/;
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ frame = get_frame_from_request (req);
+ if (!frame) {
+ /* something wrong, mostly insufficient memory*/
+ req->rpc_err = GARBAGE_ARGS; /* TODO */
+ goto out;
+ }
+ frame->root->op = GF_FOP_ZEROFILL;
+
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
+ /* auth failure, request on subvolume without setvolume */
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ state->resolve.type = RESOLVE_MUST;
+ state->resolve.fd_no = args.fd;
+
+ state->offset = args.offset;
+ state->size = args.size;
+ memcpy(state->resolve.gfid, args.gfid, 16);
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl, state->xdata,
(args.xdata.xdata_val),
(args.xdata.xdata_len), ret,
op_errno, out);
ret = 0;
- resolve_and_resume (frame, server_fsetattr_resume);
+ resolve_and_resume (frame, server_zerofill_resume);
out:
free (args.xdata.xdata_val);
@@ -3071,6 +3484,60 @@ out:
return ret;
}
+int
+server3_3_ipc (rpcsvc_request_t *req)
+{
+ call_frame_t *frame = NULL;
+ gfs3_ipc_req args = {0,};
+ int ret = -1;
+ int op_errno = 0;
+ dict_t *xdata = NULL;
+ xlator_t *bound_xl = NULL;
+
+ if (!req)
+ return ret;
+
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_ipc_req);
+ if (ret < 0) {
+ /*failed to decode msg*/;
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ frame = get_frame_from_request (req);
+ if (!frame) {
+ /* something wrong, mostly insufficient memory*/
+ req->rpc_err = GARBAGE_ARGS; /* TODO */
+ goto out;
+ }
+ frame->root->op = GF_FOP_IPC;
+
+ bound_xl = frame->root->client->bound_xl;
+ if (!bound_xl) {
+ /* auth failure, request on subvolume without setvolume */
+ req->rpc_err = GARBAGE_ARGS;
+ goto out;
+ }
+
+ GF_PROTOCOL_DICT_UNSERIALIZE (bound_xl, xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len,
+ ret, op_errno, out);
+
+ ret = 0;
+ STACK_WIND (frame, server_ipc_cbk, bound_xl, bound_xl->fops->ipc,
+ args.op, xdata);
+ free(xdata);
+
+out:
+ free (args.xdata.xdata_val);
+
+ if (op_errno)
+ req->rpc_err = GARBAGE_ARGS;
+
+ return ret;
+}
int
server3_3_readlink (rpcsvc_request_t *req)
@@ -3084,24 +3551,26 @@ server3_3_readlink (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_readlink_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_readlink_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_READLINK;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3110,9 +3579,10 @@ server3_3_readlink (rpcsvc_request_t *req)
state->size = args.size;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3122,7 +3592,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3142,24 +3612,26 @@ server3_3_create (rpcsvc_request_t *req)
args.bname = alloca (req->msg[0].iov_len);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_create_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_create_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_CREATE;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3177,9 +3649,10 @@ server3_3_create (rpcsvc_request_t *req)
}
/* TODO: can do alloca for xdata field instead of stdalloc */
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3190,7 +3663,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3208,24 +3681,25 @@ server3_3_open (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_open_req)) {
+ ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_open_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_OPEN;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3234,16 +3708,17 @@ server3_3_open (rpcsvc_request_t *req)
state->flags = gf_flags_to_flags (args.flags);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
resolve_and_resume (frame, server_open_resume);
out:
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
free (args.xdata.xdata_val);
@@ -3263,24 +3738,25 @@ server3_3_readv (rpcsvc_request_t *req)
if (!req)
goto out;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_read_req)) {
+ ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_read_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_READ;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3292,9 +3768,10 @@ server3_3_readv (rpcsvc_request_t *req)
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3304,7 +3781,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3324,31 +3801,33 @@ server3_3_writev (rpcsvc_request_t *req)
if (!req)
return ret;
- len = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_write_req);
- if (len == 0) {
+ len = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_write_req);
+ if (len < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_WRITE;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
state->resolve.type = RESOLVE_MUST;
state->resolve.fd_no = args.fd;
state->offset = args.offset;
+ state->size = args.size;
state->flags = args.flag;
state->iobref = iobref_ref (req->iobref);
memcpy (state->resolve.gfid, args.gfid, 16);
@@ -3370,9 +3849,10 @@ server3_3_writev (rpcsvc_request_t *req)
state->size += state->payload_vector[i].iov_len;
}
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
#ifdef GF_TESTING_IO_XDATA
@@ -3385,7 +3865,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3451,24 +3931,36 @@ server3_3_writev_vecsizer (int state, ssize_t *readsize, char *base_addr,
int
server3_3_release (rpcsvc_request_t *req)
{
- server_connection_t *conn = NULL;
- gfs3_release_req args = {{0,},};
- gf_common_rsp rsp = {0,};
- int ret = -1;
-
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_release_req)) {
+ client_t *client = NULL;
+ server_ctx_t *serv_ctx = NULL;
+ gfs3_release_req args = {{0,},};
+ gf_common_rsp rsp = {0,};
+ int ret = -1;
+
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_release_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
- conn = req->trans->xl_private;
- if (!conn) {
+ client = req->trans->xl_private;
+ if (!client) {
/* Handshake is not complete yet. */
req->rpc_err = SYSTEM_ERR;
goto out;
}
- gf_fd_put (conn->fdtable, args.fd);
+
+ serv_ctx = server_ctx_get (client, client->this);
+ if (serv_ctx == NULL) {
+ gf_log (req->trans->name, GF_LOG_INFO,
+ "server_ctx_get() failed");
+ req->rpc_err = SYSTEM_ERR;
+ goto out;
+ }
+
+ gf_fd_put (serv_ctx->fdtable, args.fd);
server_submit_reply (NULL, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -3481,24 +3973,35 @@ out:
int
server3_3_releasedir (rpcsvc_request_t *req)
{
- server_connection_t *conn = NULL;
- gfs3_releasedir_req args = {{0,},};
- gf_common_rsp rsp = {0,};
- int ret = -1;
+ client_t *client = NULL;
+ server_ctx_t *serv_ctx = NULL;
+ gfs3_releasedir_req args = {{0,},};
+ gf_common_rsp rsp = {0,};
+ int ret = -1;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_release_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_release_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
- conn = req->trans->xl_private;
- if (!conn) {
- req->rpc_err = GARBAGE_ARGS;
+ client = req->trans->xl_private;
+ if (!client) {
+ SERVER_REQ_SET_ERROR (req, ret);
+ goto out;
+ }
+
+ serv_ctx = server_ctx_get (client, client->this);
+ if (serv_ctx == NULL) {
+ gf_log (req->trans->name, GF_LOG_INFO,
+ "server_ctx_get() failed");
+ req->rpc_err = SYSTEM_ERR;
goto out;
}
- gf_fd_put (conn->fdtable, args.fd);
+ gf_fd_put (serv_ctx->fdtable, args.fd);
server_submit_reply (NULL, req, &rsp, NULL, 0, NULL,
(xdrproc_t)xdr_gf_common_rsp);
@@ -3521,24 +4024,26 @@ server3_3_fsync (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_fsync_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_fsync_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FSYNC;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3547,9 +4052,10 @@ server3_3_fsync (rpcsvc_request_t *req)
state->flags = args.data;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3558,7 +4064,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3577,24 +4083,26 @@ server3_3_flush (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_flush_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_flush_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FLUSH;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3602,9 +4110,10 @@ server3_3_flush (rpcsvc_request_t *req)
state->resolve.fd_no = args.fd;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3613,7 +4122,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3632,24 +4141,26 @@ server3_3_ftruncate (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_ftruncate_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_ftruncate_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FTRUNCATE;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3658,9 +4169,10 @@ server3_3_ftruncate (rpcsvc_request_t *req)
state->offset = args.offset;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3669,7 +4181,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3687,24 +4199,26 @@ server3_3_fstat (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_fstat_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_fstat_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FSTAT;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3712,9 +4226,10 @@ server3_3_fstat (rpcsvc_request_t *req)
state->resolve.fd_no = args.fd;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3723,7 +4238,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3741,24 +4256,26 @@ server3_3_truncate (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_truncate_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_truncate_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_TRUNCATE;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3766,9 +4283,10 @@ server3_3_truncate (rpcsvc_request_t *req)
memcpy (state->resolve.gfid, args.gfid, 16);
state->offset = args.offset;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3777,7 +4295,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3798,24 +4316,26 @@ server3_3_unlink (rpcsvc_request_t *req)
args.bname = alloca (req->msg[0].iov_len);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_unlink_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_unlink_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_UNLINK;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3825,9 +4345,10 @@ server3_3_unlink (rpcsvc_request_t *req)
state->flags = args.xflags;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3836,7 +4357,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -3857,24 +4378,26 @@ server3_3_setxattr (rpcsvc_request_t *req)
args.dict.dict_val = alloca (req->msg[0].iov_len);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_setxattr_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_setxattr_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_SETXATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3882,7 +4405,8 @@ server3_3_setxattr (rpcsvc_request_t *req)
state->flags = args.flags;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, dict,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ dict,
(args.dict.dict_val),
(args.dict.dict_len), ret,
op_errno, out);
@@ -3892,9 +4416,10 @@ server3_3_setxattr (rpcsvc_request_t *req)
/* There can be some commands hidden in key, check and proceed */
gf_server_check_setxattr_cmd (frame, dict);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3905,7 +4430,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
if (dict)
dict_unref (dict);
@@ -3929,24 +4454,26 @@ server3_3_fsetxattr (rpcsvc_request_t *req)
return ret;
args.dict.dict_val = alloca (req->msg[0].iov_len);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_fsetxattr_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_fsetxattr_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FSETXATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -3955,16 +4482,18 @@ server3_3_fsetxattr (rpcsvc_request_t *req)
state->flags = args.flags;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, dict,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ dict,
(args.dict.dict_val),
(args.dict.dict_len), ret,
op_errno, out);
state->dict = dict;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -3975,7 +4504,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
if (dict)
dict_unref (dict);
@@ -3999,24 +4528,26 @@ server3_3_fxattrop (rpcsvc_request_t *req)
return ret;
args.dict.dict_val = alloca (req->msg[0].iov_len);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_fxattrop_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_fxattrop_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FXATTROP;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4025,16 +4556,18 @@ server3_3_fxattrop (rpcsvc_request_t *req)
state->flags = args.flags;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, dict,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ dict,
(args.dict.dict_val),
(args.dict.dict_len), ret,
op_errno, out);
state->dict = dict;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4046,7 +4579,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
if (dict)
dict_unref (dict);
@@ -4071,24 +4604,26 @@ server3_3_xattrop (rpcsvc_request_t *req)
args.dict.dict_val = alloca (req->msg[0].iov_len);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_xattrop_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_xattrop_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_XATTROP;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4096,16 +4631,18 @@ server3_3_xattrop (rpcsvc_request_t *req)
state->flags = args.flags;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, dict,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ dict,
(args.dict.dict_val),
(args.dict.dict_len), ret,
op_errno, out);
state->dict = dict;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4116,7 +4653,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
if (dict)
dict_unref (dict);
@@ -4139,24 +4676,26 @@ server3_3_getxattr (rpcsvc_request_t *req)
args.name = alloca (256);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_getxattr_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_getxattr_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_GETXATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4169,9 +4708,10 @@ server3_3_getxattr (rpcsvc_request_t *req)
gf_server_check_getxattr_cmd (frame, state->name);
}
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4180,7 +4720,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4199,24 +4739,26 @@ server3_3_fgetxattr (rpcsvc_request_t *req)
return ret;
args.name = alloca (256);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_fgetxattr_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_fgetxattr_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FGETXATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4227,9 +4769,10 @@ server3_3_fgetxattr (rpcsvc_request_t *req)
if (args.namelen)
state->name = gf_strdup (args.name);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4238,7 +4781,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4259,24 +4802,26 @@ server3_3_removexattr (rpcsvc_request_t *req)
args.name = alloca (256);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_removexattr_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_removexattr_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_REMOVEXATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4284,9 +4829,10 @@ server3_3_removexattr (rpcsvc_request_t *req)
memcpy (state->resolve.gfid, args.gfid, 16);
state->name = gf_strdup (args.name);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4295,7 +4841,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4314,25 +4860,26 @@ server3_3_fremovexattr (rpcsvc_request_t *req)
args.name = alloca (4096);
- if (!xdr_to_generic (req->msg[0], &args,
- (xdrproc_t)xdr_gfs3_fremovexattr_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_fremovexattr_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FREMOVEXATTR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4341,9 +4888,10 @@ server3_3_fremovexattr (rpcsvc_request_t *req)
memcpy (state->resolve.gfid, args.gfid, 16);
state->name = gf_strdup (args.name);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4352,7 +4900,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4372,33 +4920,36 @@ server3_3_opendir (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_opendir_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_opendir_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_OPENDIR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
state->resolve.type = RESOLVE_MUST;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4407,7 +4958,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4426,24 +4977,26 @@ server3_3_readdirp (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_readdirp_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_readdirp_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_READDIRP;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4464,7 +5017,8 @@ server3_3_readdirp (rpcsvc_request_t *req)
memcpy (state->resolve.gfid, args.gfid, 16);
/* here, dict itself works as xdata */
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->dict,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->dict,
(args.dict.dict_val),
(args.dict.dict_len), ret,
op_errno, out);
@@ -4474,7 +5028,7 @@ server3_3_readdirp (rpcsvc_request_t *req)
resolve_and_resume (frame, server_readdirp_resume);
out:
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
free (args.dict.dict_val);
@@ -4494,24 +5048,26 @@ server3_3_readdir (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_readdir_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_readdir_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_READDIR;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4531,9 +5087,10 @@ server3_3_readdir (rpcsvc_request_t *req)
state->offset = args.offset;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4542,7 +5099,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4559,24 +5116,26 @@ server3_3_fsyncdir (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_fsyncdir_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_fsyncdir_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FSYNCDIR;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4585,9 +5144,10 @@ server3_3_fsyncdir (rpcsvc_request_t *req)
state->flags = args.data;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4596,7 +5156,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4617,24 +5177,26 @@ server3_3_mknod (rpcsvc_request_t *req)
args.bname = alloca (req->msg[0].iov_len);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_mknod_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_mknod_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_MKNOD;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4646,9 +5208,10 @@ server3_3_mknod (rpcsvc_request_t *req)
state->dev = args.dev;
state->umask = args.umask;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4656,7 +5219,7 @@ server3_3_mknod (rpcsvc_request_t *req)
out:
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
/* memory allocated by libc, don't use GF_FREE */
free (args.xdata.xdata_val);
@@ -4680,24 +5243,26 @@ server3_3_mkdir (rpcsvc_request_t *req)
args.bname = alloca (req->msg[0].iov_len);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_mkdir_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_mkdir_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_MKDIR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4709,9 +5274,10 @@ server3_3_mkdir (rpcsvc_request_t *req)
state->umask = args.umask;
/* TODO: can do alloca for xdata field instead of stdalloc */
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4719,7 +5285,7 @@ server3_3_mkdir (rpcsvc_request_t *req)
out:
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
free (args.xdata.xdata_val);
@@ -4741,24 +5307,26 @@ server3_3_rmdir (rpcsvc_request_t *req)
args.bname = alloca (req->msg[0].iov_len);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_rmdir_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_rmdir_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_RMDIR;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4768,9 +5336,10 @@ server3_3_rmdir (rpcsvc_request_t *req)
state->flags = args.xflags;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4779,7 +5348,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4801,24 +5370,26 @@ server3_3_inodelk (rpcsvc_request_t *req)
args.volume = alloca (256);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_inodelk_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_inodelk_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_INODELK;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4855,9 +5426,10 @@ server3_3_inodelk (rpcsvc_request_t *req)
break;
}
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4868,7 +5440,7 @@ out:
free (args.flock.lk_owner.lk_owner_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4886,24 +5458,26 @@ server3_3_finodelk (rpcsvc_request_t *req)
return ret;
args.volume = alloca (256);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_finodelk_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_finodelk_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FINODELK;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -4941,9 +5515,10 @@ server3_3_finodelk (rpcsvc_request_t *req)
break;
}
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -4954,7 +5529,7 @@ out:
free (args.flock.lk_owner.lk_owner_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -4975,24 +5550,26 @@ server3_3_entrylk (rpcsvc_request_t *req)
args.volume = alloca (256);
args.name = alloca (256);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_entrylk_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_entrylk_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_ENTRYLK;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5006,9 +5583,10 @@ server3_3_entrylk (rpcsvc_request_t *req)
state->cmd = args.cmd;
state->type = args.type;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5017,7 +5595,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5037,24 +5615,26 @@ server3_3_fentrylk (rpcsvc_request_t *req)
args.name = alloca (256);
args.volume = alloca (256);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_fentrylk_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_fentrylk_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_FENTRYLK;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5068,9 +5648,10 @@ server3_3_fentrylk (rpcsvc_request_t *req)
state->name = gf_strdup (args.name);
state->volume = gf_strdup (args.volume);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5079,7 +5660,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5096,24 +5677,26 @@ server3_3_access (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_access_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_access_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_ACCESS;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5121,9 +5704,10 @@ server3_3_access (rpcsvc_request_t *req)
memcpy (state->resolve.gfid, args.gfid, 16);
state->mask = args.mask;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5132,7 +5716,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5154,24 +5738,26 @@ server3_3_symlink (rpcsvc_request_t *req)
args.bname = alloca (req->msg[0].iov_len);
args.linkname = alloca (4096);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_symlink_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_symlink_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_SYMLINK;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5181,9 +5767,10 @@ server3_3_symlink (rpcsvc_request_t *req)
state->name = gf_strdup (args.linkname);
state->umask = args.umask;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5191,7 +5778,7 @@ server3_3_symlink (rpcsvc_request_t *req)
out:
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
/* memory allocated by libc, don't use GF_FREE */
free (args.xdata.xdata_val);
@@ -5215,24 +5802,25 @@ server3_3_link (rpcsvc_request_t *req)
args.newbname = alloca (req->msg[0].iov_len);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_link_req)) {
+ ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_link_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_LINK;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5243,9 +5831,10 @@ server3_3_link (rpcsvc_request_t *req)
state->resolve2.bname = gf_strdup (args.newbname);
memcpy (state->resolve2.pargfid, args.newgfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5254,7 +5843,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5275,24 +5864,26 @@ server3_3_rename (rpcsvc_request_t *req)
args.oldbname = alloca (req->msg[0].iov_len);
args.newbname = alloca (req->msg[0].iov_len);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_rename_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_rename_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_RENAME;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5304,9 +5895,10 @@ server3_3_rename (rpcsvc_request_t *req)
state->resolve2.bname = gf_strdup (args.newbname);
memcpy (state->resolve2.pargfid, args.newgfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5315,7 +5907,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5332,24 +5924,25 @@ server3_3_lk (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_lk_req)) {
+ ret = xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_lk_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_LK;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5397,7 +5990,7 @@ server3_3_lk (rpcsvc_request_t *req)
state->flock.l_type = F_UNLCK;
break;
default:
- gf_log (state->conn->bound_xl->name, GF_LOG_ERROR,
+ gf_log (frame->root->client->bound_xl->name, GF_LOG_ERROR,
"fd - %"PRId64" (%s): Unknown lock type: %"PRId32"!",
state->resolve.fd_no,
uuid_utoa (state->fd->inode->gfid), state->type);
@@ -5405,9 +5998,10 @@ server3_3_lk (rpcsvc_request_t *req)
}
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5418,7 +6012,7 @@ out:
free (args.flock.lk_owner.lk_owner_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5436,24 +6030,26 @@ server3_3_rchecksum (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_rchecksum_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_rchecksum_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_RCHECKSUM;
- state = CALL_STATE(frame);
- if (!state->conn->bound_xl) {
+ state = CALL_STATE (frame);
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5462,9 +6058,10 @@ server3_3_rchecksum (rpcsvc_request_t *req)
state->offset = args.offset;
state->size = args.len;
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5473,7 +6070,7 @@ out:
free (args.xdata.xdata_val);
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
@@ -5506,16 +6103,18 @@ server3_3_lookup (rpcsvc_request_t *req)
args.bname = alloca (req->msg[0].iov_len);
args.xdata.xdata_val = alloca (req->msg[0].iov_len);
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_lookup_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_lookup_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto err;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto err;
}
frame->root->op = GF_FOP_LOOKUP;
@@ -5525,9 +6124,9 @@ server3_3_lookup (rpcsvc_request_t *req)
*/
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
@@ -5540,9 +6139,10 @@ server3_3_lookup (rpcsvc_request_t *req)
memcpy (state->resolve.gfid, args.gfid, 16);
}
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
@@ -5570,90 +6170,97 @@ server3_3_statfs (rpcsvc_request_t *req)
if (!req)
return ret;
- if (!xdr_to_generic (req->msg[0], &args, (xdrproc_t)xdr_gfs3_statfs_req)) {
+ ret = xdr_to_generic (req->msg[0], &args,
+ (xdrproc_t)xdr_gfs3_statfs_req);
+ if (ret < 0) {
//failed to decode msg;
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame = get_frame_from_request (req);
if (!frame) {
// something wrong, mostly insufficient memory
- req->rpc_err = GARBAGE_ARGS; /* TODO */
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
frame->root->op = GF_FOP_STATFS;
state = CALL_STATE (frame);
- if (!state->conn->bound_xl) {
+ if (!frame->root->client->bound_xl) {
/* auth failure, request on subvolume without setvolume */
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
goto out;
}
state->resolve.type = RESOLVE_MUST;
memcpy (state->resolve.gfid, args.gfid, 16);
- GF_PROTOCOL_DICT_UNSERIALIZE (state->conn->bound_xl, state->xdata,
- (args.xdata.xdata_val),
- (args.xdata.xdata_len), ret,
+ GF_PROTOCOL_DICT_UNSERIALIZE (frame->root->client->bound_xl,
+ state->xdata,
+ args.xdata.xdata_val,
+ args.xdata.xdata_len, ret,
op_errno, out);
ret = 0;
resolve_and_resume (frame, server_statfs_resume);
out:
if (op_errno)
- req->rpc_err = GARBAGE_ARGS;
+ SERVER_REQ_SET_ERROR (req, ret);
return ret;
}
rpcsvc_actor_t glusterfs3_3_fop_actors[] = {
- [GFS3_OP_NULL] = { "NULL", GFS3_OP_NULL, server_null, NULL, 0},
- [GFS3_OP_STAT] = { "STAT", GFS3_OP_STAT, server3_3_stat, NULL, 0},
- [GFS3_OP_READLINK] = { "READLINK", GFS3_OP_READLINK, server3_3_readlink, NULL, 0},
- [GFS3_OP_MKNOD] = { "MKNOD", GFS3_OP_MKNOD, server3_3_mknod, NULL, 0},
- [GFS3_OP_MKDIR] = { "MKDIR", GFS3_OP_MKDIR, server3_3_mkdir, NULL, 0},
- [GFS3_OP_UNLINK] = { "UNLINK", GFS3_OP_UNLINK, server3_3_unlink, NULL, 0},
- [GFS3_OP_RMDIR] = { "RMDIR", GFS3_OP_RMDIR, server3_3_rmdir, NULL, 0},
- [GFS3_OP_SYMLINK] = { "SYMLINK", GFS3_OP_SYMLINK, server3_3_symlink, NULL, 0},
- [GFS3_OP_RENAME] = { "RENAME", GFS3_OP_RENAME, server3_3_rename, NULL, 0},
- [GFS3_OP_LINK] = { "LINK", GFS3_OP_LINK, server3_3_link, NULL, 0},
- [GFS3_OP_TRUNCATE] = { "TRUNCATE", GFS3_OP_TRUNCATE, server3_3_truncate, NULL, 0},
- [GFS3_OP_OPEN] = { "OPEN", GFS3_OP_OPEN, server3_3_open, NULL, 0},
- [GFS3_OP_READ] = { "READ", GFS3_OP_READ, server3_3_readv, NULL, 0},
- [GFS3_OP_WRITE] = { "WRITE", GFS3_OP_WRITE, server3_3_writev, server3_3_writev_vecsizer, 0},
- [GFS3_OP_STATFS] = { "STATFS", GFS3_OP_STATFS, server3_3_statfs, NULL, 0},
- [GFS3_OP_FLUSH] = { "FLUSH", GFS3_OP_FLUSH, server3_3_flush, NULL, 0},
- [GFS3_OP_FSYNC] = { "FSYNC", GFS3_OP_FSYNC, server3_3_fsync, NULL, 0},
- [GFS3_OP_SETXATTR] = { "SETXATTR", GFS3_OP_SETXATTR, server3_3_setxattr, NULL, 0},
- [GFS3_OP_GETXATTR] = { "GETXATTR", GFS3_OP_GETXATTR, server3_3_getxattr, NULL, 0},
- [GFS3_OP_REMOVEXATTR] = { "REMOVEXATTR", GFS3_OP_REMOVEXATTR, server3_3_removexattr, NULL, 0},
- [GFS3_OP_OPENDIR] = { "OPENDIR", GFS3_OP_OPENDIR, server3_3_opendir, NULL, 0},
- [GFS3_OP_FSYNCDIR] = { "FSYNCDIR", GFS3_OP_FSYNCDIR, server3_3_fsyncdir, NULL, 0},
- [GFS3_OP_ACCESS] = { "ACCESS", GFS3_OP_ACCESS, server3_3_access, NULL, 0},
- [GFS3_OP_CREATE] = { "CREATE", GFS3_OP_CREATE, server3_3_create, NULL, 0},
- [GFS3_OP_FTRUNCATE] = { "FTRUNCATE", GFS3_OP_FTRUNCATE, server3_3_ftruncate, NULL, 0},
- [GFS3_OP_FSTAT] = { "FSTAT", GFS3_OP_FSTAT, server3_3_fstat, NULL, 0},
- [GFS3_OP_LK] = { "LK", GFS3_OP_LK, server3_3_lk, NULL, 0},
- [GFS3_OP_LOOKUP] = { "LOOKUP", GFS3_OP_LOOKUP, server3_3_lookup, NULL, 0},
- [GFS3_OP_READDIR] = { "READDIR", GFS3_OP_READDIR, server3_3_readdir, NULL, 0},
- [GFS3_OP_INODELK] = { "INODELK", GFS3_OP_INODELK, server3_3_inodelk, NULL, 0},
- [GFS3_OP_FINODELK] = { "FINODELK", GFS3_OP_FINODELK, server3_3_finodelk, NULL, 0},
- [GFS3_OP_ENTRYLK] = { "ENTRYLK", GFS3_OP_ENTRYLK, server3_3_entrylk, NULL, 0},
- [GFS3_OP_FENTRYLK] = { "FENTRYLK", GFS3_OP_FENTRYLK, server3_3_fentrylk, NULL, 0},
- [GFS3_OP_XATTROP] = { "XATTROP", GFS3_OP_XATTROP, server3_3_xattrop, NULL, 0},
- [GFS3_OP_FXATTROP] = { "FXATTROP", GFS3_OP_FXATTROP, server3_3_fxattrop, NULL, 0},
- [GFS3_OP_FGETXATTR] = { "FGETXATTR", GFS3_OP_FGETXATTR, server3_3_fgetxattr, NULL, 0},
- [GFS3_OP_FSETXATTR] = { "FSETXATTR", GFS3_OP_FSETXATTR, server3_3_fsetxattr, NULL, 0},
- [GFS3_OP_RCHECKSUM] = { "RCHECKSUM", GFS3_OP_RCHECKSUM, server3_3_rchecksum, NULL, 0},
- [GFS3_OP_SETATTR] = { "SETATTR", GFS3_OP_SETATTR, server3_3_setattr, NULL, 0},
- [GFS3_OP_FSETATTR] = { "FSETATTR", GFS3_OP_FSETATTR, server3_3_fsetattr, NULL, 0},
- [GFS3_OP_READDIRP] = { "READDIRP", GFS3_OP_READDIRP, server3_3_readdirp, NULL, 0},
- [GFS3_OP_RELEASE] = { "RELEASE", GFS3_OP_RELEASE, server3_3_release, NULL, 0},
- [GFS3_OP_RELEASEDIR] = { "RELEASEDIR", GFS3_OP_RELEASEDIR, server3_3_releasedir, NULL, 0},
- [GFS3_OP_FREMOVEXATTR] = { "FREMOVEXATTR", GFS3_OP_FREMOVEXATTR, server3_3_fremovexattr, NULL, 0},
+ [GFS3_OP_NULL] = {"NULL", GFS3_OP_NULL, server_null, NULL, 0, DRC_NA},
+ [GFS3_OP_STAT] = {"STAT", GFS3_OP_STAT, server3_3_stat, NULL, 0, DRC_NA},
+ [GFS3_OP_READLINK] = {"READLINK", GFS3_OP_READLINK, server3_3_readlink, NULL, 0, DRC_NA},
+ [GFS3_OP_MKNOD] = {"MKNOD", GFS3_OP_MKNOD, server3_3_mknod, NULL, 0, DRC_NA},
+ [GFS3_OP_MKDIR] = {"MKDIR", GFS3_OP_MKDIR, server3_3_mkdir, NULL, 0, DRC_NA},
+ [GFS3_OP_UNLINK] = {"UNLINK", GFS3_OP_UNLINK, server3_3_unlink, NULL, 0, DRC_NA},
+ [GFS3_OP_RMDIR] = {"RMDIR", GFS3_OP_RMDIR, server3_3_rmdir, NULL, 0, DRC_NA},
+ [GFS3_OP_SYMLINK] = {"SYMLINK", GFS3_OP_SYMLINK, server3_3_symlink, NULL, 0, DRC_NA},
+ [GFS3_OP_RENAME] = {"RENAME", GFS3_OP_RENAME, server3_3_rename, NULL, 0, DRC_NA},
+ [GFS3_OP_LINK] = {"LINK", GFS3_OP_LINK, server3_3_link, NULL, 0, DRC_NA},
+ [GFS3_OP_TRUNCATE] = {"TRUNCATE", GFS3_OP_TRUNCATE, server3_3_truncate, NULL, 0, DRC_NA},
+ [GFS3_OP_OPEN] = {"OPEN", GFS3_OP_OPEN, server3_3_open, NULL, 0, DRC_NA},
+ [GFS3_OP_READ] = {"READ", GFS3_OP_READ, server3_3_readv, NULL, 0, DRC_NA},
+ [GFS3_OP_WRITE] = {"WRITE", GFS3_OP_WRITE, server3_3_writev, server3_3_writev_vecsizer, 0, DRC_NA},
+ [GFS3_OP_STATFS] = {"STATFS", GFS3_OP_STATFS, server3_3_statfs, NULL, 0, DRC_NA},
+ [GFS3_OP_FLUSH] = {"FLUSH", GFS3_OP_FLUSH, server3_3_flush, NULL, 0, DRC_NA},
+ [GFS3_OP_FSYNC] = {"FSYNC", GFS3_OP_FSYNC, server3_3_fsync, NULL, 0, DRC_NA},
+ [GFS3_OP_SETXATTR] = {"SETXATTR", GFS3_OP_SETXATTR, server3_3_setxattr, NULL, 0, DRC_NA},
+ [GFS3_OP_GETXATTR] = {"GETXATTR", GFS3_OP_GETXATTR, server3_3_getxattr, NULL, 0, DRC_NA},
+ [GFS3_OP_REMOVEXATTR] = {"REMOVEXATTR", GFS3_OP_REMOVEXATTR, server3_3_removexattr, NULL, 0, DRC_NA},
+ [GFS3_OP_OPENDIR] = {"OPENDIR", GFS3_OP_OPENDIR, server3_3_opendir, NULL, 0, DRC_NA},
+ [GFS3_OP_FSYNCDIR] = {"FSYNCDIR", GFS3_OP_FSYNCDIR, server3_3_fsyncdir, NULL, 0, DRC_NA},
+ [GFS3_OP_ACCESS] = {"ACCESS", GFS3_OP_ACCESS, server3_3_access, NULL, 0, DRC_NA},
+ [GFS3_OP_CREATE] = {"CREATE", GFS3_OP_CREATE, server3_3_create, NULL, 0, DRC_NA},
+ [GFS3_OP_FTRUNCATE] = {"FTRUNCATE", GFS3_OP_FTRUNCATE, server3_3_ftruncate, NULL, 0, DRC_NA},
+ [GFS3_OP_FSTAT] = {"FSTAT", GFS3_OP_FSTAT, server3_3_fstat, NULL, 0, DRC_NA},
+ [GFS3_OP_LK] = {"LK", GFS3_OP_LK, server3_3_lk, NULL, 0, DRC_NA},
+ [GFS3_OP_LOOKUP] = {"LOOKUP", GFS3_OP_LOOKUP, server3_3_lookup, NULL, 0, DRC_NA},
+ [GFS3_OP_READDIR] = {"READDIR", GFS3_OP_READDIR, server3_3_readdir, NULL, 0, DRC_NA},
+ [GFS3_OP_INODELK] = {"INODELK", GFS3_OP_INODELK, server3_3_inodelk, NULL, 0, DRC_NA},
+ [GFS3_OP_FINODELK] = {"FINODELK", GFS3_OP_FINODELK, server3_3_finodelk, NULL, 0, DRC_NA},
+ [GFS3_OP_ENTRYLK] = {"ENTRYLK", GFS3_OP_ENTRYLK, server3_3_entrylk, NULL, 0, DRC_NA},
+ [GFS3_OP_FENTRYLK] = {"FENTRYLK", GFS3_OP_FENTRYLK, server3_3_fentrylk, NULL, 0, DRC_NA},
+ [GFS3_OP_XATTROP] = {"XATTROP", GFS3_OP_XATTROP, server3_3_xattrop, NULL, 0, DRC_NA},
+ [GFS3_OP_FXATTROP] = {"FXATTROP", GFS3_OP_FXATTROP, server3_3_fxattrop, NULL, 0, DRC_NA},
+ [GFS3_OP_FGETXATTR] = {"FGETXATTR", GFS3_OP_FGETXATTR, server3_3_fgetxattr, NULL, 0, DRC_NA},
+ [GFS3_OP_FSETXATTR] = {"FSETXATTR", GFS3_OP_FSETXATTR, server3_3_fsetxattr, NULL, 0, DRC_NA},
+ [GFS3_OP_RCHECKSUM] = {"RCHECKSUM", GFS3_OP_RCHECKSUM, server3_3_rchecksum, NULL, 0, DRC_NA},
+ [GFS3_OP_SETATTR] = {"SETATTR", GFS3_OP_SETATTR, server3_3_setattr, NULL, 0, DRC_NA},
+ [GFS3_OP_FSETATTR] = {"FSETATTR", GFS3_OP_FSETATTR, server3_3_fsetattr, NULL, 0, DRC_NA},
+ [GFS3_OP_READDIRP] = {"READDIRP", GFS3_OP_READDIRP, server3_3_readdirp, NULL, 0, DRC_NA},
+ [GFS3_OP_RELEASE] = {"RELEASE", GFS3_OP_RELEASE, server3_3_release, NULL, 0, DRC_NA},
+ [GFS3_OP_RELEASEDIR] = {"RELEASEDIR", GFS3_OP_RELEASEDIR, server3_3_releasedir, NULL, 0, DRC_NA},
+ [GFS3_OP_FREMOVEXATTR] = {"FREMOVEXATTR", GFS3_OP_FREMOVEXATTR, server3_3_fremovexattr, NULL, 0, DRC_NA},
+ [GFS3_OP_FALLOCATE] = {"FALLOCATE", GFS3_OP_FALLOCATE, server3_3_fallocate, NULL, 0, DRC_NA},
+ [GFS3_OP_DISCARD] = {"DISCARD", GFS3_OP_DISCARD, server3_3_discard, NULL, 0, DRC_NA},
+ [GFS3_OP_ZEROFILL] = {"ZEROFILL", GFS3_OP_ZEROFILL, server3_3_zerofill, NULL, 0, DRC_NA},
+ [GFS3_OP_IPC] = {"IPC", GFS3_OP_IPC, server3_3_ipc, NULL, 0, DRC_NA},
};
diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c
index 908f62a7b..6bd00cac0 100644
--- a/xlators/protocol/server/src/server.c
+++ b/xlators/protocol/server/src/server.c
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2010-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
@@ -34,33 +25,56 @@
#include "statedump.h"
#include "defaults.h"
#include "authenticate.h"
-#include "rpcsvc.h"
void
grace_time_handler (void *data)
{
- server_connection_t *conn = NULL;
- xlator_t *this = NULL;
- gf_boolean_t cancelled = _gf_false;
- gf_boolean_t detached = _gf_false;
+ client_t *client = NULL;
+ xlator_t *this = NULL;
+ gf_timer_t *timer = NULL;
+ server_ctx_t *serv_ctx = NULL;
+ gf_boolean_t cancelled = _gf_false;
+ gf_boolean_t detached = _gf_false;
- conn = data;
- this = conn->this;
+ client = data;
+ this = client->this;
- GF_VALIDATE_OR_GOTO (THIS->name, conn, out);
GF_VALIDATE_OR_GOTO (THIS->name, this, out);
- gf_log (this->name, GF_LOG_INFO, "grace timer expired for %s", conn->id);
+ gf_log (this->name, GF_LOG_INFO, "grace timer expired for %s",
+ client->client_uid);
+
+ serv_ctx = server_ctx_get (client, this);
+
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO, "server_ctx_get() failed");
+ goto out;
+ }
- cancelled = server_cancel_conn_timer (this, conn);
+ LOCK (&serv_ctx->fdtable_lock);
+ {
+ if (serv_ctx->grace_timer) {
+ timer = serv_ctx->grace_timer;
+ serv_ctx->grace_timer = NULL;
+ }
+ }
+ UNLOCK (&serv_ctx->fdtable_lock);
+ if (timer) {
+ gf_timer_call_cancel (this->ctx, timer);
+ cancelled = _gf_true;
+ }
if (cancelled) {
- //conn should not be destroyed in conn_put, so take a ref.
- server_conn_ref (conn);
- server_connection_put (this, conn, &detached);
+
+ /*
+ * client must not be destroyed in gf_client_put(),
+ * so take a ref.
+ */
+ gf_client_ref (client);
+ gf_client_put (client, &detached);
if (detached)//reconnection did not happen :-(
- server_connection_cleanup (this, conn,
- INTERNAL_LOCKS | POSIX_LOCKS);
- server_conn_unref (conn);
+ server_connection_cleanup (this, client,
+ INTERNAL_LOCKS | POSIX_LOCKS);
+ gf_client_unref (client);
}
out:
return;
@@ -116,8 +130,6 @@ ret:
return iob;
}
-
-
int
server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg,
struct iovec *payload, int payloadcount,
@@ -128,19 +140,24 @@ server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg,
struct iovec rsp = {0,};
server_state_t *state = NULL;
char new_iobref = 0;
- server_connection_t *conn = NULL;
+ client_t *client = NULL;
gf_boolean_t lk_heal = _gf_false;
+ server_conf_t *conf = NULL;
+ gf_barrier_t *barrier = NULL;
+ gf_barrier_payload_t *stub = NULL;
+ gf_boolean_t barriered = _gf_false;
GF_VALIDATE_OR_GOTO ("server", req, ret);
if (frame) {
state = CALL_STATE (frame);
frame->local = NULL;
- conn = SERVER_CONNECTION(frame);
+ client = frame->root->client;
+ conf = (server_conf_t *) client->this->private;
}
- if (conn)
- lk_heal = ((server_conf_t *) conn->this->private)->lk_heal;
+ if (client)
+ lk_heal = ((server_conf_t *) client->this->private)->lk_heal;
if (!iobref) {
iobref = iobref_new ();
@@ -159,6 +176,32 @@ server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg,
iobref_add (iobref, iob);
+ if (conf)
+ barrier = conf->barrier;
+ if (barrier) {
+ /* todo: write's with fd flags set to O_SYNC and O_DIRECT */
+ LOCK (&barrier->lock);
+ {
+ if (is_fop_barriered (barrier->fops, req->procnum) &&
+ (barrier_add_to_queue (barrier))) {
+ stub = gf_barrier_payload (req, &rsp, frame,
+ payload,
+ payloadcount, iobref,
+ iob, new_iobref);
+ if (stub) {
+ gf_barrier_enqueue (barrier, stub);
+ barriered = _gf_true;
+ } else {
+ gf_log ("", GF_LOG_ERROR, "Failed to "
+ " barrier fop %"PRIu64,
+ ((uint64_t)1 << req->procnum));
+ }
+ }
+ }
+ UNLOCK (&barrier->lock);
+ if (barriered == _gf_true)
+ goto out;
+ }
/* Then, submit the message for transmission. */
ret = rpcsvc_submit_generic (req, &rsp, 1, payload, payloadcount,
iobref);
@@ -174,13 +217,14 @@ server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg,
iobuf_unref (iob);
if (ret == -1) {
gf_log_callingfn ("", GF_LOG_ERROR, "Reply submission failed");
- if (frame && conn && !lk_heal) {
- server_connection_cleanup (frame->this, conn,
+ if (frame && client && !lk_heal) {
+ server_connection_cleanup (frame->this, client,
INTERNAL_LOCKS | POSIX_LOCKS);
} else {
+ gf_log_callingfn ("", GF_LOG_ERROR,
+ "Reply submission failed");
/* TODO: Failure of open(dir), create, inodelk, entrylk
or lk fops send failure must be handled specially. */
- ;
}
goto ret;
}
@@ -192,185 +236,17 @@ ret:
}
if (frame) {
- if (frame->root->trans)
- server_conn_unref (frame->root->trans);
+ gf_client_unref (client);
STACK_DESTROY (frame->root);
}
if (new_iobref) {
iobref_unref (iobref);
}
-
- return ret;
-}
-
-/* */
-int
-server_fd_to_dict (xlator_t *this, dict_t *dict)
-{
- server_conf_t *conf = NULL;
- server_connection_t *trav = NULL;
- char key[GF_DUMP_MAX_BUF_LEN] = {0,};
- int count = 0;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO (THIS->name, this, out);
- GF_VALIDATE_OR_GOTO (this->name, dict, out);
-
- conf = this->private;
- if (!conf)
- return -1;
-
- ret = pthread_mutex_trylock (&conf->mutex);
- if (ret)
- return -1;
-
- list_for_each_entry (trav, &conf->conns, list) {
- memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "conn%d", count++);
- fdtable_dump_to_dict (trav->fdtable, key, dict);
- }
- pthread_mutex_unlock (&conf->mutex);
-
- ret = dict_set_int32 (dict, "conncount", count);
-out:
- return ret;
-}
-
-int
-server_fd (xlator_t *this)
-{
- server_conf_t *conf = NULL;
- server_connection_t *trav = NULL;
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 1;
- int ret = -1;
- gf_boolean_t section_added = _gf_false;
-
- GF_VALIDATE_OR_GOTO ("server", this, out);
-
- conf = this->private;
- if (!conf) {
- gf_log (this->name, GF_LOG_WARNING,
- "conf null in xlator");
- return -1;
- }
-
- gf_proc_dump_add_section("xlator.protocol.server.conn");
- section_added = _gf_true;
-
- ret = pthread_mutex_trylock (&conf->mutex);
- if (ret)
- goto out;
-
- list_for_each_entry (trav, &conf->conns, list) {
- if (trav->id) {
- gf_proc_dump_build_key(key,
- "conn","%d.id", i);
- gf_proc_dump_write(key, "%s", trav->id);
- }
-
- gf_proc_dump_build_key(key,"conn","%d.ref",i)
- gf_proc_dump_write(key, "%d", trav->ref);
- if (trav->bound_xl) {
- gf_proc_dump_build_key(key,
- "conn","%d.bound_xl", i);
- gf_proc_dump_write(key, "%s", trav->bound_xl->name);
- }
-
- gf_proc_dump_build_key(key,
- "conn","%d.id", i);
- fdtable_dump(trav->fdtable,key);
- i++;
- }
- pthread_mutex_unlock (&conf->mutex);
-
- ret = 0;
out:
- if (ret) {
- if (section_added == _gf_false)
- gf_proc_dump_add_section("xlator.protocol.server.conn");
- gf_proc_dump_write ("Unable to dump the list of connections",
- "(Lock acquisition failed) %s",
- this?this->name:"server");
- }
return ret;
}
-void
-ltable_dump (server_connection_t *trav)
-{
- char key[GF_DUMP_MAX_BUF_LEN] = {0,};
- struct _locker *locker = NULL;
- char locker_data[GF_MAX_LOCK_OWNER_LEN] = {0,};
- int count = 0;
-
- gf_proc_dump_build_key(key,
- "conn","bound_xl.ltable.inodelk.%s",
- trav->bound_xl?trav->bound_xl->name:"");
- gf_proc_dump_add_section(key);
-
- list_for_each_entry (locker, &trav->ltable->inodelk_lockers, lockers) {
- count++;
- gf_proc_dump_write("volume", "%s", locker->volume);
- if (locker->fd) {
- gf_proc_dump_write("fd", "%p", locker->fd);
- gf_proc_dump_write("gfid", "%s",
- uuid_utoa (locker->fd->inode->gfid));
- } else {
- gf_proc_dump_write("fd", "%s", locker->loc.path);
- gf_proc_dump_write("gfid", "%s",
- uuid_utoa (locker->loc.inode->gfid));
- }
- gf_proc_dump_write("pid", "%d", locker->pid);
- gf_proc_dump_write("lock length", "%d", locker->owner.len);
- lkowner_unparse (&locker->owner, locker_data,
- locker->owner.len);
- gf_proc_dump_write("lock owner", "%s", locker_data);
- memset (locker_data, 0, sizeof (locker_data));
-
- gf_proc_dump_build_key (key, "inode", "%d", count);
- gf_proc_dump_add_section (key);
- if (locker->fd)
- inode_dump (locker->fd->inode, key);
- else
- inode_dump (locker->loc.inode, key);
- }
-
- count = 0;
- locker = NULL;
- gf_proc_dump_build_key(key,
- "conn","bound_xl.ltable.entrylk.%s",
- trav->bound_xl?trav->bound_xl->name:"");
- gf_proc_dump_add_section(key);
-
- list_for_each_entry (locker, &trav->ltable->entrylk_lockers,
- lockers) {
- count++;
- gf_proc_dump_write("volume", "%s", locker->volume);
- if (locker->fd) {
- gf_proc_dump_write("fd", "%p", locker->fd);
- gf_proc_dump_write("gfid", "%s",
- uuid_utoa (locker->fd->inode->gfid));
- } else {
- gf_proc_dump_write("fd", "%s", locker->loc.path);
- gf_proc_dump_write("gfid", "%s",
- uuid_utoa (locker->loc.inode->gfid));
- }
- gf_proc_dump_write("pid", "%d", locker->pid);
- gf_proc_dump_write("lock length", "%d", locker->owner.len);
- lkowner_unparse (&locker->owner, locker_data, locker->owner.len);
- gf_proc_dump_write("lock data", "%s", locker_data);
- memset (locker_data, 0, sizeof (locker_data));
-
- gf_proc_dump_build_key (key, "inode", "%d", count);
- gf_proc_dump_add_section (key);
- if (locker->fd)
- inode_dump (locker->fd->inode, key);
- else
- inode_dump (locker->loc.inode, key);
- }
-}
int
server_priv_to_dict (xlator_t *this, dict_t *dict)
@@ -477,104 +353,6 @@ out:
return ret;
}
-int
-server_inode_to_dict (xlator_t *this, dict_t *dict)
-{
- server_conf_t *conf = NULL;
- server_connection_t *trav = NULL;
- char key[32] = {0,};
- int count = 0;
- int ret = -1;
- xlator_t *prev_bound_xl = NULL;
-
- GF_VALIDATE_OR_GOTO (THIS->name, this, out);
- GF_VALIDATE_OR_GOTO (this->name, dict, out);
-
- conf = this->private;
- if (!conf)
- return -1;
-
- ret = pthread_mutex_trylock (&conf->mutex);
- if (ret)
- return -1;
-
- list_for_each_entry (trav, &conf->conns, list) {
- if (trav->bound_xl && trav->bound_xl->itable) {
- /* Presently every brick contains only one
- * bound_xl for all connections. This will lead
- * to duplicating of the inode lists, if listing
- * is done for every connection. This simple check
- * prevents duplication in the present case. If
- * need arises the check can be improved.
- */
- if (trav->bound_xl == prev_bound_xl)
- continue;
- prev_bound_xl = trav->bound_xl;
-
- memset (key, 0, sizeof (key));
- snprintf (key, sizeof (key), "conn%d", count);
- inode_table_dump_to_dict (trav->bound_xl->itable,
- key, dict);
- count++;
- }
- }
- pthread_mutex_unlock (&conf->mutex);
-
- ret = dict_set_int32 (dict, "conncount", count);
-
-out:
- if (prev_bound_xl)
- prev_bound_xl = NULL;
- return ret;
-}
-
-int
-server_inode (xlator_t *this)
-{
- server_conf_t *conf = NULL;
- server_connection_t *trav = NULL;
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 1;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO ("server", this, out);
-
- conf = this->private;
- if (!conf) {
- gf_log (this->name, GF_LOG_WARNING,
- "conf null in xlator");
- return -1;
- }
-
- ret = pthread_mutex_trylock (&conf->mutex);
- if (ret)
- goto out;
-
- list_for_each_entry (trav, &conf->conns, list) {
- ret = pthread_mutex_trylock (&trav->lock);
- if (!ret)
- {
- gf_proc_dump_build_key(key,
- "conn","%d.ltable", i);
- gf_proc_dump_add_section(key);
- ltable_dump (trav);
- i++;
- pthread_mutex_unlock (&trav->lock);
- }else
- continue;
- }
- pthread_mutex_unlock (&conf->mutex);
-
- ret = 0;
-out:
- if (ret)
- gf_proc_dump_write ("Unable to dump the lock table",
- "(Lock acquisition failed) %s",
- this?this->name:"server");
-
- return ret;
-}
-
static int
get_auth_types (dict_t *this, char *key, data_t *value, void *data)
@@ -616,84 +394,85 @@ out:
return 0;
}
+int
+_check_for_auth_option (dict_t *d, char *k, data_t *v,
+ void *tmp)
+{
+ int ret = 0;
+ xlator_t *xl = NULL;
+ char *tail = NULL;
+ char *tmp_addr_list = NULL;
+ char *addr = NULL;
+ char *tmp_str = NULL;
+
+ xl = tmp;
+
+ tail = strtail (k, "auth.");
+ if (!tail)
+ goto out;
+
+ /* fast fwd thru module type */
+ tail = strchr (tail, '.');
+ if (!tail)
+ goto out;
+ tail++;
+
+ tail = strtail (tail, xl->name);
+ if (!tail)
+ goto out;
+
+ if (*tail == '.') {
+ /* when we are here, the key is checked for
+ * valid auth.allow.<xlator>
+ * Now we verify the ip address
+ */
+ if (!strcmp (v->data, "*")) {
+ ret = 0;
+ goto out;
+ }
+
+ tmp_addr_list = gf_strdup (v->data);
+ addr = strtok_r (tmp_addr_list, ",", &tmp_str);
+ if (!addr)
+ addr = v->data;
+
+ while (addr) {
+ if (valid_internet_address (addr, _gf_true)) {
+ ret = 0;
+ } else {
+ ret = -1;
+ gf_log (xl->name, GF_LOG_ERROR,
+ "internet address '%s'"
+ " does not conform to"
+ " standards.", addr);
+ goto out;
+ }
+ if (tmp_str)
+ addr = strtok_r (NULL, ",", &tmp_str);
+ else
+ addr = NULL;
+ }
+
+ GF_FREE (tmp_addr_list);
+ tmp_addr_list = NULL;
+ }
+out:
+ return ret;
+}
int
validate_auth_options (xlator_t *this, dict_t *dict)
{
int error = -1;
xlator_list_t *trav = NULL;
- char *tail = NULL;
- char *tmp_addr_list = NULL;
- char *addr = NULL;
- char *tmp_str = NULL;
GF_VALIDATE_OR_GOTO ("server", this, out);
GF_VALIDATE_OR_GOTO ("server", dict, out);
trav = this->children;
while (trav) {
- int _check_for_auth_option (dict_t *d, char *k, data_t *v,
- void *tmp)
- {
- int ret = 0;
- tail = strtail (k, "auth.");
- if (!tail)
- goto internal_out;
-
- /* fast fwd thru module type */
- tail = strchr (tail, '.');
- if (!tail)
- goto internal_out;
- tail++;
-
- tail = strtail (tail, trav->xlator->name);
- if (!tail)
- goto internal_out;
-
- if (*tail == '.') {
-
- /* when we are here, the key is checked for
- * valid auth.allow.<xlator>
- * Now we verify the ip address
- */
- if (!strcmp (v->data, "*")) {
- ret = 0;
- goto internal_out;
- }
-
- tmp_addr_list = gf_strdup (v->data);
- addr = strtok_r (tmp_addr_list, ",", &tmp_str);
- if (!addr)
- addr = v->data;
-
- while (addr) {
-
- if (valid_internet_address (addr,
- _gf_true)) {
- ret = 0;
- } else {
- ret = -1;
- gf_log (this->name, GF_LOG_ERROR,
- "internet address '%s'"
- " does not conform to"
- " standards.", addr);
- goto internal_out;
-
- }
- if (tmp_str)
- addr = strtok_r (NULL, ",",
- &tmp_str);
- else
- addr = NULL;
- }
-
- GF_FREE (tmp_addr_list);
- tmp_addr_list = NULL;
- }
- internal_out:
- return ret;
- }
- error = dict_foreach (dict, _check_for_auth_option, NULL);
+ error = dict_foreach (dict, _check_for_auth_option,
+ trav->xlator);
if (-1 == error) {
gf_log (this->name, GF_LOG_ERROR,
@@ -706,7 +485,6 @@ validate_auth_options (xlator_t *this, dict_t *dict)
}
out:
- GF_FREE (tmp_addr_list);
return error;
}
@@ -717,9 +495,10 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
{
gf_boolean_t detached = _gf_false;
xlator_t *this = NULL;
- rpc_transport_t *xprt = NULL;
- server_connection_t *conn = NULL;
+ rpc_transport_t *trans = NULL;
server_conf_t *conf = NULL;
+ client_t *client = NULL;
+ server_ctx_t *serv_ctx = NULL;
if (!xl || !data) {
gf_log_callingfn ("server", GF_LOG_WARNING,
@@ -728,7 +507,7 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
}
this = xl;
- xprt = data;
+ trans = data;
conf = this->private;
switch (event) {
@@ -736,17 +515,17 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
{
/* Have a structure per new connection */
/* TODO: Should we create anything here at all ? * /
- conn = create_server_conn_state (this, xprt);
- if (!conn)
+ client->conn = create_server_conn_state (this, trans);
+ if (!client->conn)
goto out;
- xprt->protocol_private = conn;
+ trans->protocol_private = client->conn;
*/
- INIT_LIST_HEAD (&xprt->list);
+ INIT_LIST_HEAD (&trans->list);
pthread_mutex_lock (&conf->mutex);
{
- list_add_tail (&xprt->list, &conf->xprt_list);
+ list_add_tail (&trans->list, &conf->xprt_list);
}
pthread_mutex_unlock (&conf->mutex);
@@ -759,50 +538,58 @@ server_rpc_notify (rpcsvc_t *rpc, void *xl, rpcsvc_event_t event,
*/
pthread_mutex_lock (&conf->mutex);
{
- list_del_init (&xprt->list);
+ list_del_init (&trans->list);
}
pthread_mutex_unlock (&conf->mutex);
- conn = get_server_conn_state (this, xprt);
- if (!conn)
+ client = trans->xl_private;
+ if (!client)
break;
gf_log (this->name, GF_LOG_INFO, "disconnecting connection"
- "from %s", conn->id);
+ "from %s", client->client_uid);
/* If lock self heal is off, then destroy the
conn object, else register a grace timer event */
if (!conf->lk_heal) {
- server_conn_ref (conn);
- server_connection_put (this, conn, &detached);
+ gf_client_ref (client);
+ gf_client_put (client, &detached);
if (detached)
- server_connection_cleanup (this, conn,
- INTERNAL_LOCKS |
- POSIX_LOCKS);
- server_conn_unref (conn);
- } else {
- put_server_conn_state (this, xprt);
- server_connection_cleanup (this, conn, INTERNAL_LOCKS);
+ server_connection_cleanup (this, client,
+ INTERNAL_LOCKS | POSIX_LOCKS);
+ gf_client_unref (client);
+ break;
+ }
+ trans->xl_private = NULL;
+ server_connection_cleanup (this, client, INTERNAL_LOCKS);
- pthread_mutex_lock (&conn->lock);
- {
- if (conn->timer)
- goto unlock;
+ serv_ctx = server_ctx_get (client, this);
- gf_log (this->name, GF_LOG_INFO, "starting a grace "
- "timer for %s", conn->id);
+ if (serv_ctx == NULL) {
+ gf_log (this->name, GF_LOG_INFO,
+ "server_ctx_get() failed");
+ goto out;
+ }
- conn->timer = gf_timer_call_after (this->ctx,
- conf->grace_tv,
- grace_time_handler,
- conn);
+ LOCK (&serv_ctx->fdtable_lock);
+ {
+ if (!serv_ctx->grace_timer) {
+
+ gf_log (this->name, GF_LOG_INFO,
+ "starting a grace timer for %s",
+ client->client_uid);
+
+ serv_ctx->grace_timer =
+ gf_timer_call_after (this->ctx,
+ conf->grace_ts,
+ grace_time_handler,
+ client);
}
- unlock:
- pthread_mutex_unlock (&conn->lock);
}
+ UNLOCK (&serv_ctx->fdtable_lock);
break;
case RPCSVC_EVENT_TRANSPORT_DESTROY:
- /*- conn obj has been disassociated from xprt on first
+ /*- conn obj has been disassociated from trans on first
* disconnect.
* conn cleanup and destruction is handed over to
* grace_time_handler or the subsequent handler that 'owns'
@@ -839,13 +626,16 @@ static int
_delete_auth_opt (dict_t *this, char *key, data_t *value, void *data)
{
char *auth_option_pattern[] = { "auth.addr.*.allow",
- "auth.addr.*.reject"};
-
- if (fnmatch ( auth_option_pattern[0], key, 0) != 0)
- dict_del (this, key);
+ "auth.addr.*.reject",
+ NULL};
+ int i = 0;
- if (fnmatch ( auth_option_pattern[1], key, 0) != 0)
- dict_del (this, key);
+ for (i = 0; auth_option_pattern[i]; i++) {
+ if (fnmatch (auth_option_pattern[i], key, 0) == 0) {
+ dict_del (this, key);
+ break;
+ }
+ }
return 0;
}
@@ -855,12 +645,16 @@ static int
_copy_auth_opt (dict_t *unused, char *key, data_t *value, void *xl_dict)
{
char *auth_option_pattern[] = { "auth.addr.*.allow",
- "auth.addr.*.reject"};
- if (fnmatch ( auth_option_pattern[0], key, 0) != 0)
- dict_set ((dict_t *)xl_dict, key, (value));
+ "auth.addr.*.reject",
+ NULL};
+ int i = 0;
- if (fnmatch ( auth_option_pattern[1], key, 0) != 0)
- dict_set ((dict_t *)xl_dict, key, (value));
+ for (i = 0; auth_option_pattern [i]; i++) {
+ if (fnmatch (auth_option_pattern[i], key, 0) == 0) {
+ dict_set ((dict_t *)xl_dict, key, value);
+ break;
+ }
+ }
return 0;
}
@@ -889,14 +683,14 @@ server_init_grace_timer (xlator_t *this, dict_t *options,
ret = dict_get_int32 (options, "grace-timeout", &grace_timeout);
if (!ret)
- conf->grace_tv.tv_sec = grace_timeout;
+ conf->grace_ts.tv_sec = grace_timeout;
else
- conf->grace_tv.tv_sec = 10;
+ conf->grace_ts.tv_sec = 10;
gf_log (this->name, GF_LOG_DEBUG, "Server grace timeout "
- "value = %"PRIu64, conf->grace_tv.tv_sec);
+ "value = %"GF_PRI_SECOND, conf->grace_ts.tv_sec);
- conf->grace_tv.tv_usec = 0;
+ conf->grace_ts.tv_nsec = 0;
ret = 0;
out:
@@ -934,21 +728,15 @@ reconfigure (xlator_t *this, dict_t *options)
gf_log (this->name, GF_LOG_WARNING,
"'trace' takes on only boolean values. "
"Neglecting option");
- ret = -1;
+ ret = -1;
goto out;
- }
- conf->trace = trace;
- gf_log (this->name, GF_LOG_TRACE, "Reconfigured trace"
- " to %d", conf->trace);
+ }
+ conf->trace = trace;
+ gf_log (this->name, GF_LOG_TRACE, "Reconfigured trace"
+ " to %d", conf->trace);
- }
+ }
- /*ret = dict_get_str (options, "statedump-path", &statedump_path);
- if (!ret) {
- gf_path_strip_trailing_slashes (statedump_path);
- GF_FREE (this->ctx->statedump_path);
- this->ctx->statedump_path = gf_strdup (statedump_path);
- }*/
GF_OPTION_RECONF ("statedump-path", statedump_path,
options, path, out);
if (!statedump_path) {
@@ -986,6 +774,16 @@ reconfigure (xlator_t *this, dict_t *options)
}
(void) rpcsvc_set_allow_insecure (rpc_conf, options);
+ (void) rpcsvc_set_root_squash (rpc_conf, options);
+
+ ret = rpcsvc_set_outstanding_rpc_limit (rpc_conf, options,
+ RPCSVC_DEFAULT_OUTSTANDING_RPC_LIMIT);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to reconfigure outstanding-rpc-limit");
+ goto out;
+ }
+
list_for_each_entry (listeners, &(rpc_conf->listeners), list) {
if (listeners->trans != NULL) {
if (listeners->trans->reconfigure )
@@ -1002,6 +800,26 @@ out:
return ret;
}
+static int32_t
+client_destroy_cbk (xlator_t *this, client_t *client)
+{
+ void *tmp = NULL;
+ server_ctx_t *ctx = NULL;
+
+ client_ctx_del (client, this, &tmp);
+
+ ctx = tmp;
+
+ if (ctx == NULL)
+ return 0;
+
+ gf_fd_fdtable_destroy (ctx->fdtable);
+ LOCK_DESTROY (&ctx->fdtable_lock);
+ GF_FREE (ctx);
+
+ return 0;
+}
+
int
init (xlator_t *this)
{
@@ -1009,6 +827,8 @@ init (xlator_t *this)
server_conf_t *conf = NULL;
rpcsvc_listener_t *listener = NULL;
char *statedump_path = NULL;
+ gf_barrier_t *barrier = NULL;
+ char *str = NULL;
GF_VALIDATE_OR_GOTO ("init", this, out);
if (this->children == NULL) {
@@ -1028,7 +848,6 @@ init (xlator_t *this)
GF_VALIDATE_OR_GOTO(this->name, conf, out);
- INIT_LIST_HEAD (&conf->conns);
INIT_LIST_HEAD (&conf->xprt_list);
pthread_mutex_init (&conf->mutex, NULL);
@@ -1080,12 +899,20 @@ init (xlator_t *this)
/* RPC related */
conf->rpc = rpcsvc_init (this, this->ctx, this->options, 0);
if (conf->rpc == NULL) {
- gf_log (this->name, GF_LOG_WARNING,
+ gf_log (this->name, GF_LOG_ERROR,
"creation of rpcsvc failed");
ret = -1;
goto out;
}
+ ret = rpcsvc_set_outstanding_rpc_limit (conf->rpc, this->options,
+ RPCSVC_DEFAULT_OUTSTANDING_RPC_LIMIT);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to configure outstanding-rpc-limit");
+ goto out;
+ }
+
ret = rpcsvc_create_listeners (conf->rpc, this->options,
this->name);
if (ret < 1) {
@@ -1150,6 +977,37 @@ init (xlator_t *this)
}
}
#endif
+ /* barrier related */
+ barrier = GF_CALLOC (1, sizeof (*barrier),1);
+ if (!barrier) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "WARNING: Failed to allocate barrier");
+ ret = -1;
+ goto out;
+ }
+
+ LOCK_INIT (&barrier->lock);
+ INIT_LIST_HEAD (&barrier->queue);
+ barrier->on = _gf_false;
+
+ GF_OPTION_INIT ("barrier-queue-length", barrier->max_size,
+ int64, out);
+ GF_OPTION_INIT ("barrier-timeout", barrier->time_out,
+ uint64, out);
+
+ ret = dict_get_str (this->options, "barrier-fops", &str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "setting barrier fops to default value");
+ }
+ ret = gf_barrier_fops_configure (this, barrier, str);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "invalid barrier fops specified");
+ goto out;
+ }
+
+ conf->barrier = barrier;
this->private = conf;
ret = 0;
@@ -1203,29 +1061,65 @@ int
notify (xlator_t *this, int32_t event, void *data, ...)
{
int ret = 0;
+ int32_t val = 0;
+ dict_t *dict = NULL;
+ dict_t *output = NULL;
+ va_list ap;
+
+ dict = data;
+ va_start (ap, data);
+ output = va_arg (ap, dict_t*);
+ va_end (ap);
+
switch (event) {
+ case GF_EVENT_VOLUME_BARRIER_OP:
+ ret = dict_get_int32 (dict, "barrier", &val);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Wrong BARRIER event");
+ goto out;
+ }
+ /* !val un-barrier, if val, barrier */
+ if (val) {
+ ret = gf_barrier_start (this);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Barrier start failed");
+ } else {
+ ret = gf_barrier_stop (this);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Barrier stop failed");
+ }
+ ret = dict_set_int32 (output, "barrier-status", ret);
+ if (ret)
+ gf_log (this->name, GF_LOG_ERROR,
+ "Failed to set barrier-status in dict");
+ break;
+
+ /* todo: call default_notify to make other xlators handle it.*/
default:
default_notify (this, event, data);
break;
}
-
+out:
return ret;
}
-struct xlator_fops fops = {
-};
+struct xlator_fops fops;
struct xlator_cbks cbks = {
+ .client_destroy = client_destroy_cbk,
};
struct xlator_dumpops dumpops = {
.priv = server_priv,
- .fd = server_fd,
- .inode = server_inode,
+ .fd = gf_client_dump_fdtables,
+ .inode = gf_client_dump_inodes,
.priv_to_dict = server_priv_to_dict,
- .fd_to_dict = server_fd_to_dict,
- .inode_to_dict = server_inode_to_dict,
+ .fd_to_dict = gf_client_dump_fdtables_to_dict,
+ .inode_to_dict = gf_client_dump_inodes_to_dict,
};
@@ -1253,6 +1147,8 @@ struct volume_options options[] = {
.min = 0,
.max = (1 * GF_UNIT_MB),
.default_value = "16384",
+ .description = "Specifies the maximum megabytes of memory to be "
+ "used in the inode cache."
},
{ .key = {"verify-volfile-checksum"},
.type = GF_OPTION_TYPE_BOOL
@@ -1267,9 +1163,33 @@ struct volume_options options[] = {
{ .key = {"rpc-auth-allow-insecure"},
.type = GF_OPTION_TYPE_BOOL,
},
+ { .key = {"root-squash"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Map requests from uid/gid 0 to the anonymous "
+ "uid/gid. Note that this does not apply to any other "
+ "uids or gids that might be equally sensitive, such "
+ "as user bin or group staff."
+ },
+ { .key = {"anonuid"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "65534", /* RPC_NOBODY_UID */
+ .min = 0,
+ .max = (uint32_t) -1,
+ .description = "value of the uid used for the anonymous "
+ "user/nfsnobody when root-squash is enabled."
+ },
+ { .key = {"anongid"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "65534", /* RPC_NOBODY_GID */
+ .min = 0,
+ .max = (uint32_t) -1,
+ .description = "value of the gid used for the anonymous "
+ "user/nfsnobody when root-squash is enabled."
+ },
{ .key = {"statedump-path"},
.type = GF_OPTION_TYPE_PATH,
- .default_value = "/tmp",
+ .default_value = DEFAULT_VAR_RUN_DIRECTORY,
.description = "Specifies directory in which gluster should save its"
" statedumps. By default it is the /tmp directory"
},
@@ -1285,7 +1205,8 @@ struct volume_options options[] = {
{.key = {"tcp-window-size"},
.type = GF_OPTION_TYPE_SIZET,
.min = GF_MIN_SOCKET_WINDOW_SIZE,
- .max = GF_MAX_SOCKET_WINDOW_SIZE
+ .max = GF_MAX_SOCKET_WINDOW_SIZE,
+ .description = "Specifies the window size for tcp socket."
},
/* The following two options are defined in addr.c, redifined here *
@@ -1294,15 +1215,34 @@ struct volume_options options[] = {
{ .key = {"auth.addr.*.allow"},
.type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
.description = "Allow a comma separated list of addresses and/or "
- "hostnames to connect to the server. By default, all"
- " connections are allowed."
+ "hostnames to connect to the server. Option "
+ "auth.reject overrides this option. By default, all "
+ "connections are allowed."
},
{ .key = {"auth.addr.*.reject"},
.type = GF_OPTION_TYPE_INTERNET_ADDRESS_LIST,
.description = "Reject a comma separated list of addresses and/or "
- "hostnames to connect to the server. By default, all"
+ "hostnames to connect to the server. This option "
+ "overrides the auth.allow option. By default, all"
" connections are allowed."
},
-
+ {.key = {"barrier-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "60",
+ .min = 0,
+ .max = 360,
+ .description = "Barrier timeout in seconds",
+ },
+ {.key = {"barrier-queue-length"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "4096",
+ .min = 0,
+ .max = 16384,
+ .description = "Barrier queue length",
+ },
+ {.key = {"barrier-fops"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Allow a comma seperated fop lists",
+ },
{ .key = {NULL} },
};
diff --git a/xlators/protocol/server/src/server.h b/xlators/protocol/server/src/server.h
index 750196e1b..782327d77 100644
--- a/xlators/protocol/server/src/server.h
+++ b/xlators/protocol/server/src/server.h
@@ -1,20 +1,11 @@
/*
- Copyright (c) 2010-2011 Gluster, Inc. <http://www.gluster.com>
+ Copyright (c) 2010-2013 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
- GlusterFS is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 3 of the License,
- or (at your option) any later version.
-
- GlusterFS is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see
- <http://www.gnu.org/licenses/>.
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
*/
#ifndef _SERVER_H
@@ -22,6 +13,7 @@
#include <pthread.h>
+#include "fd.h"
#include "rpcsvc.h"
#include "fd.h"
@@ -29,69 +21,47 @@
#include "server-mem-types.h"
#include "glusterfs3.h"
#include "timer.h"
+#include "client_t.h"
#define DEFAULT_BLOCK_SIZE 4194304 /* 4MB */
#define DEFAULT_VOLUME_FILE_PATH CONFDIR "/glusterfs.vol"
#define GF_MAX_SOCKET_WINDOW_SIZE (1 * GF_UNIT_MB)
#define GF_MIN_SOCKET_WINDOW_SIZE (0)
-typedef enum {
- INTERNAL_LOCKS = 1,
- POSIX_LOCKS = 2,
-} server_lock_flags_t;
-
-typedef struct _server_state server_state_t;
-
-struct _locker {
- struct list_head lockers;
- char *volume;
- loc_t loc;
- fd_t *fd;
- gf_lkowner_t owner;
- pid_t pid;
+struct _gf_barrier_payload {
+ rpcsvc_request_t *req;
+ struct iovec rsp;
+ call_frame_t *frame;
+ struct iovec *payload;
+ struct iobref *iobref;
+ struct iobuf *iob;
+ int payload_count;
+ gf_boolean_t free_iobref;
+ struct list_head list;
};
-struct _lock_table {
- struct list_head inodelk_lockers;
- struct list_head entrylk_lockers;
+typedef struct _gf_barrier_payload gf_barrier_payload_t;
+
+struct _gf_barrier {
+ gf_lock_t lock;
+ gf_boolean_t on;
+ gf_boolean_t force;
+ size_t cur_size;
+ int64_t max_size;
+ uint64_t fops;
+ gf_timer_t *timer;
+ uint64_t time_out;
+ struct list_head queue;
};
-/* private structure per connection (transport object)
- * used as transport_t->xl_private
- */
-struct _server_connection {
- struct list_head list;
- char *id;
- int ref;
- int bind_ref;
- pthread_mutex_t lock;
- fdtable_t *fdtable;
- struct _lock_table *ltable;
- gf_timer_t *timer;
- xlator_t *bound_xl;
- xlator_t *this;
- uint32_t lk_version;
-};
-
-typedef struct _server_connection server_connection_t;
-
+typedef struct _gf_barrier gf_barrier_t;
-server_connection_t *
-server_connection_get (xlator_t *this, const char *id);
-
-server_connection_t *
-server_connection_put (xlator_t *this, server_connection_t *conn,
- gf_boolean_t *detached);
-
-server_connection_t*
-server_conn_unref (server_connection_t *conn);
-
-server_connection_t*
-server_conn_ref (server_connection_t *conn);
+typedef enum {
+ INTERNAL_LOCKS = 1,
+ POSIX_LOCKS = 2,
+} server_lock_flags_t;
-int
-server_connection_cleanup (xlator_t *this, server_connection_t *conn,
- int32_t flags);
+typedef struct _server_state server_state_t;
int server_null (rpcsvc_request_t *req);
@@ -111,11 +81,12 @@ struct server_conf {
heal is on else off. */
char *conf_dir;
struct _volfile_ctx *volfile;
- struct timeval grace_tv;
+ struct timespec grace_ts;
dict_t *auth_modules;
pthread_mutex_t mutex;
- struct list_head conns;
+ gf_barrier_t *barrier;
struct list_head xprt_list;
+ pthread_t barrier_th;
};
typedef struct server_conf server_conf_t;
@@ -153,11 +124,10 @@ int
resolve_and_resume (call_frame_t *frame, server_resume_fn_t fn);
struct _server_state {
- server_connection_t *conn;
- rpc_transport_t *xprt;
- inode_table_t *itable;
+ rpc_transport_t *xprt;
+ inode_table_t *itable;
- server_resume_fn_t resume_fn;
+ server_resume_fn_t resume_fn;
loc_t loc;
loc_t loc2;
@@ -193,7 +163,7 @@ struct _server_state {
int mask;
char is_revalidate;
dict_t *dict;
- struct gf_flock flock;
+ struct gf_flock flock;
const char *volume;
dir_entry_t *entry;
@@ -201,10 +171,20 @@ struct _server_state {
mode_t umask;
};
+
extern struct rpcsvc_program gluster_handshake_prog;
extern struct rpcsvc_program glusterfs3_3_fop_prog;
extern struct rpcsvc_program gluster_ping_prog;
+
+typedef struct _server_ctx {
+ gf_lock_t fdtable_lock;
+ fdtable_t *fdtable;
+ struct _gf_timer *grace_timer;
+ uint32_t lk_version;
+} server_ctx_t;
+
+
int
server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg,
struct iovec *payload, int payloadcount,
@@ -213,6 +193,4 @@ server_submit_reply (call_frame_t *frame, rpcsvc_request_t *req, void *arg,
int gf_server_check_setxattr_cmd (call_frame_t *frame, dict_t *dict);
int gf_server_check_getxattr_cmd (call_frame_t *frame, const char *name);
-void ltable_dump (server_connection_t *conn);
-
#endif /* !_SERVER_H */
diff --git a/xlators/storage/Makefile.am b/xlators/storage/Makefile.am
index 9cb9ded30..c08e8e41b 100644
--- a/xlators/storage/Makefile.am
+++ b/xlators/storage/Makefile.am
@@ -1,3 +1,7 @@
SUBDIRS = posix
-CLEANFILES =
+if ENABLE_BD_XLATOR
+SUBDIRS += bd
+endif
+
+CLEANFILES =
diff --git a/xlators/storage/bd/Makefile.am b/xlators/storage/bd/Makefile.am
new file mode 100644
index 000000000..a985f42a8
--- /dev/null
+++ b/xlators/storage/bd/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/storage/bd/src/Makefile.am b/xlators/storage/bd/src/Makefile.am
new file mode 100644
index 000000000..60ceff31b
--- /dev/null
+++ b/xlators/storage/bd/src/Makefile.am
@@ -0,0 +1,20 @@
+if ENABLE_BD_XLATOR
+xlator_LTLIBRARIES = bd.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage
+
+bd_la_LDFLAGS = -module -avoid-version
+LIBBD = -llvm2app -lrt
+bd_la_SOURCES = bd.c bd-helper.c bd-aio.c
+bd_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBBD) $(LIBAIO)
+
+noinst_HEADERS = bd.h bd-aio.h bd-mem-types.h
+
+AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
+ -I$(top_srcdir)/rpc/xdr/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src
+
+AM_CFLAGS = -fno-strict-aliasing -Wall $(GF_CFLAGS)
+
+CLEANFILES =
+
+endif
diff --git a/xlators/storage/bd/src/bd-aio.c b/xlators/storage/bd/src/bd-aio.c
new file mode 100644
index 000000000..9dc13b3ec
--- /dev/null
+++ b/xlators/storage/bd/src/bd-aio.c
@@ -0,0 +1,528 @@
+/*
+ Copyright IBM, Corp. 2013
+
+ This file is part of GlusterFS.
+
+ Author: M. Mohan Kumar <mohan@in.ibm.com>
+
+ Based on posix-aio.c
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <lvm2app.h>
+#include <sys/uio.h>
+
+#include "xlator.h"
+#include "glusterfs.h"
+#include "defaults.h"
+#include "bd.h"
+#include "bd-aio.h"
+
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#include "bd-mem-types.h"
+
+struct bd_aio_cb {
+ struct iocb iocb;
+ call_frame_t *frame;
+ struct iobuf *iobuf;
+ struct iobref *iobref;
+ struct iatt prebuf;
+ int op;
+ off_t offset;
+ fd_t *fd;
+};
+
+void
+__bd_fd_set_odirect (fd_t *fd, bd_fd_t *bd_fd, int opflags,
+ off_t offset, size_t size)
+{
+ int odirect = 0;
+ int flags = 0;
+ int ret = 0;
+
+ odirect = bd_fd->odirect;
+
+ if ((fd->flags|opflags) & O_DIRECT) {
+ /* if instructed, use O_DIRECT always */
+ odirect = 1;
+ } else {
+ /* else use O_DIRECT when feasible */
+ if ((offset|size) & 0xfff)
+ odirect = 0;
+ else
+ odirect = 1;
+ }
+
+ if (!odirect && bd_fd->odirect) {
+ flags = fcntl (bd_fd->fd, F_GETFL);
+ ret = fcntl (bd_fd->fd, F_SETFL, (flags & (~O_DIRECT)));
+ bd_fd->odirect = 0;
+ }
+
+ if (odirect && !bd_fd->odirect) {
+ flags = fcntl (bd_fd->fd, F_GETFL);
+ ret = fcntl (bd_fd->fd, F_SETFL, (flags | O_DIRECT));
+ bd_fd->odirect = 1;
+ }
+
+ if (ret) {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "fcntl() failed (%s). fd=%d flags=%d pfd->odirect=%d",
+ strerror (errno), bd_fd->fd, flags, bd_fd->odirect);
+ }
+}
+
+int
+bd_aio_readv_complete (struct bd_aio_cb *paiocb, int res, int res2)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ struct iobuf *iobuf = NULL;
+ struct iatt postbuf = {0,};
+ int op_ret = -1;
+ int op_errno = 0;
+ struct iovec iov;
+ struct iobref *iobref = NULL;
+ off_t offset = 0;
+ bd_attr_t *bdatt = NULL;
+
+ frame = paiocb->frame;
+ this = frame->this;
+ iobuf = paiocb->iobuf;
+ offset = paiocb->offset;
+
+ if (res < 0) {
+ op_ret = -1;
+ op_errno = -res;
+ gf_log (this->name, GF_LOG_ERROR,
+ "readv(async) failed fd=%p,size=%lu,offset=%llu (%d/%s)",
+ paiocb->fd, paiocb->iocb.u.c.nbytes,
+ (unsigned long long) paiocb->offset,
+ res, strerror (op_errno));
+ goto out;
+ }
+
+ bd_inode_ctx_get (paiocb->fd->inode, this, &bdatt);
+ memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt));
+
+ op_ret = res;
+ op_errno = 0;
+
+ iobref = iobref_new ();
+ if (!iobref) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ iobref_add (iobref, iobuf);
+
+ iov.iov_base = iobuf_ptr (iobuf);
+ iov.iov_len = op_ret;
+
+ /* Hack to notify higher layers of EOF. */
+ if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size)
+ op_errno = ENOENT;
+
+out:
+ STACK_UNWIND_STRICT (readv, frame, op_ret, op_errno, &iov, 1,
+ &postbuf, iobref, NULL);
+ if (iobuf)
+ iobuf_unref (iobuf);
+ if (iobref)
+ iobref_unref (iobref);
+
+ GF_FREE (paiocb);
+
+ return 0;
+}
+
+int
+bd_aio_readv (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ size_t size, off_t offset, uint32_t flags, dict_t *xdata)
+{
+ int32_t op_errno = EINVAL;
+ int _fd = -1;
+ struct iobuf *iobuf = NULL;
+ bd_fd_t *bd_fd = NULL;
+ int ret = -1;
+ struct bd_aio_cb *paiocb = NULL;
+ bd_priv_t *priv = NULL;
+ struct iocb *iocb = NULL;
+ bd_attr_t *bdatt = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ priv = this->private;
+
+ ret = bd_fd_ctx_get (this, fd, &bd_fd);
+ if (ret < 0 || !bd_fd) {
+ STACK_WIND (frame, default_readv_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->readv, fd, size, offset,
+ flags, xdata);
+ return 0;
+ }
+ _fd = bd_fd->fd;
+ bd_inode_ctx_get (fd->inode, this, &bdatt);
+ if (!size) {
+ op_errno = EINVAL;
+ gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size);
+ goto err;
+ }
+
+ iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+ if (!iobuf) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_bd_aio_cb);
+ if (!paiocb) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ paiocb->frame = frame;
+ paiocb->iobuf = iobuf;
+ paiocb->offset = offset;
+ paiocb->op = GF_FOP_READ;
+ paiocb->fd = fd;
+
+ paiocb->iocb.data = paiocb;
+ paiocb->iocb.aio_fildes = _fd;
+ paiocb->iocb.aio_lio_opcode = IO_CMD_PREAD;
+ paiocb->iocb.aio_reqprio = 0;
+ paiocb->iocb.u.c.buf = iobuf_ptr (iobuf);
+ paiocb->iocb.u.c.nbytes = size;
+ paiocb->iocb.u.c.offset = offset;
+
+ iocb = &paiocb->iocb;
+
+ LOCK (&fd->lock);
+ {
+ __bd_fd_set_odirect (fd, bd_fd, flags, offset, size);
+
+ ret = io_submit (priv->ctxp, 1, &iocb);
+ }
+ UNLOCK (&fd->lock);
+
+ if (ret != 1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "io_submit() returned %d", ret);
+ op_errno = -ret;
+ goto err;
+ }
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (readv, frame, -1, op_errno, 0, 0, 0, 0, 0);
+ if (iobuf)
+ iobuf_unref (iobuf);
+
+ if (paiocb)
+ GF_FREE (paiocb);
+
+ return 0;
+}
+
+int
+bd_aio_writev_complete (struct bd_aio_cb *paiocb, int res, int res2)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ struct iatt prebuf = {0,};
+ struct iatt postbuf = {0,};
+ int op_ret = -1;
+ int op_errno = 0;
+ bd_attr_t *bdatt = NULL;
+
+ frame = paiocb->frame;
+ prebuf = paiocb->prebuf;
+ this = frame->this;
+
+ if (res < 0) {
+ op_ret = -1;
+ op_errno = -res;
+ gf_log (this->name, GF_LOG_ERROR,
+ "writev(async) failed fd=%p,offset=%llu (%d/%s)",
+ paiocb->fd, (unsigned long long) paiocb->offset, res,
+ strerror (op_errno));
+
+ goto out;
+ }
+
+ bd_inode_ctx_get (paiocb->fd->inode, this, &bdatt);
+ bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+ memcpy (&postbuf, &bdatt->iatt, sizeof (struct iatt));
+
+ op_ret = res;
+ op_errno = 0;
+
+out:
+ STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &prebuf, &postbuf,
+ NULL);
+
+ if (paiocb) {
+ if (paiocb->iobref)
+ iobref_unref (paiocb->iobref);
+ GF_FREE (paiocb);
+ }
+
+ return 0;
+}
+
+int
+bd_aio_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *iov, int count, off_t offset, uint32_t flags,
+ struct iobref *iobref, dict_t *xdata)
+{
+ int32_t op_errno = EINVAL;
+ int _fd = -1;
+ bd_fd_t *bd_fd = NULL;
+ int ret = -1;
+ struct bd_aio_cb *paiocb = NULL;
+ bd_priv_t *priv = NULL;
+ struct iocb *iocb = NULL;
+ bd_attr_t *bdatt = NULL;
+
+ VALIDATE_OR_GOTO (frame, err);
+ VALIDATE_OR_GOTO (this, err);
+ VALIDATE_OR_GOTO (fd, err);
+
+ priv = this->private;
+
+ ret = bd_fd_ctx_get (this, fd, &bd_fd);
+ if (ret < 0 || !bd_fd) {
+ STACK_WIND (frame, default_writev_cbk,
+ FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev,
+ fd, iov, count, offset, flags, iobref, xdata);
+ return 0;
+ }
+
+ bd_inode_ctx_get (fd->inode, this, &bdatt);
+
+ _fd = bd_fd->fd;
+
+ paiocb = GF_CALLOC (1, sizeof (*paiocb), gf_bd_aio_cb);
+ if (!paiocb) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+
+ paiocb->frame = frame;
+ paiocb->offset = offset;
+ paiocb->op = GF_FOP_WRITE;
+ paiocb->fd = fd;
+
+ paiocb->iocb.data = paiocb;
+ paiocb->iocb.aio_fildes = _fd;
+ paiocb->iobref = iobref_ref (iobref);
+ paiocb->iocb.aio_lio_opcode = IO_CMD_PWRITEV;
+ paiocb->iocb.aio_reqprio = 0;
+ paiocb->iocb.u.v.vec = iov;
+ paiocb->iocb.u.v.nr = count;
+ paiocb->iocb.u.v.offset = offset;
+
+ iocb = &paiocb->iocb;
+
+ memcpy (&paiocb->prebuf, &bdatt->iatt, sizeof (struct iatt));
+ LOCK (&fd->lock);
+ {
+ __bd_fd_set_odirect (fd, bd_fd, flags, offset,
+ iov_length (iov, count));
+
+ ret = io_submit (priv->ctxp, 1, &iocb);
+ }
+ UNLOCK (&fd->lock);
+
+ if (ret != 1) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "io_submit() returned %d", ret);
+ op_errno = -ret;
+ goto err;
+ }
+
+ return 0;
+err:
+ STACK_UNWIND_STRICT (writev, frame, -1, op_errno, 0, 0, 0);
+
+ if (paiocb) {
+ if (paiocb->iobref)
+ iobref_unref (paiocb->iobref);
+ GF_FREE (paiocb);
+ }
+
+ return 0;
+}
+
+void *
+bd_aio_thread (void *data)
+{
+ xlator_t *this = NULL;
+ bd_priv_t *priv = NULL;
+ int ret = 0;
+ int i = 0;
+ struct io_event *event = NULL;
+ struct bd_aio_cb *paiocb = NULL;
+ struct io_event events[BD_AIO_MAX_NR_GETEVENTS];
+ struct timespec ts = {0, };
+
+ this = data;
+ THIS = this;
+ priv = this->private;
+
+ ts.tv_sec = 5;
+ for (;;) {
+ memset (&events[0], 0, sizeof (events));
+ ret = io_getevents (priv->ctxp, 1, BD_AIO_MAX_NR_GETEVENTS,
+ &events[0], &ts);
+ if (ret < 0) {
+ if (ret == -EINTR)
+ continue;
+ gf_log (this->name, GF_LOG_ERROR,
+ "io_getevents() returned %d, exiting", ret);
+ break;
+ }
+
+ for (i = 0; i < ret; i++) {
+ event = &events[i];
+
+ paiocb = event->data;
+
+ switch (paiocb->op) {
+ case GF_FOP_READ:
+ bd_aio_readv_complete (paiocb, event->res,
+ event->res2);
+ break;
+ case GF_FOP_WRITE:
+ bd_aio_writev_complete (paiocb, event->res,
+ event->res2);
+ break;
+ default:
+ gf_log (this->name, GF_LOG_ERROR,
+ "unknown op %d found in piocb",
+ paiocb->op);
+ break;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+int
+bd_aio_init (xlator_t *this)
+{
+ bd_priv_t *priv = NULL;
+ int ret = 0;
+
+ priv = this->private;
+
+ ret = io_setup (BD_AIO_MAX_NR_EVENTS, &priv->ctxp);
+ if ((ret == -1 && errno == ENOSYS) || ret == -ENOSYS) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Linux AIO not available at run-time."
+ " Continuing with synchronous IO");
+ ret = 0;
+ goto out;
+ }
+
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "io_setup() failed. ret=%d, errno=%d",
+ ret, errno);
+ goto out;
+ }
+
+ ret = pthread_create (&priv->aiothread, NULL,
+ bd_aio_thread, this);
+ if (ret != 0) {
+ io_destroy (priv->ctxp);
+ goto out;
+ }
+
+ this->fops->readv = bd_aio_readv;
+ this->fops->writev = bd_aio_writev;
+out:
+ return ret;
+}
+
+
+int
+bd_aio_on (xlator_t *this)
+{
+ bd_priv_t *priv = NULL;
+ int ret = 0;
+
+ priv = this->private;
+
+ if (!priv->aio_init_done) {
+ ret = bd_aio_init (this);
+ if (ret == 0)
+ priv->aio_capable = _gf_true;
+ else
+ priv->aio_capable = _gf_false;
+ priv->aio_init_done = _gf_true;
+ }
+
+ if (priv->aio_capable) {
+ this->fops->readv = bd_aio_readv;
+ this->fops->writev = bd_aio_writev;
+ }
+
+ return ret;
+}
+
+int
+bd_aio_off (xlator_t *this)
+{
+ this->fops->readv = bd_readv;
+ this->fops->writev = bd_writev;
+
+ return 0;
+}
+
+#else
+
+int
+bd_aio_on (xlator_t *this)
+{
+ gf_log (this->name, GF_LOG_INFO,
+ "Linux AIO not available at build-time."
+ " Continuing with synchronous IO");
+ return 0;
+}
+
+int
+bd_aio_off (xlator_t *this)
+{
+ gf_log (this->name, GF_LOG_INFO,
+ "Linux AIO not available at build-time."
+ " Continuing with synchronous IO");
+ return 0;
+}
+
+void
+__bd_fd_set_odirect (fd_t *fd, struct bd_fd *pfd, int opflags,
+ off_t offset, size_t size)
+{
+ xlator_t *this = THIS;
+ gf_log (this->name, GF_LOG_INFO,
+ "Linux AIO not available at build-time."
+ " Continuing with synchronous IO");
+ return;
+}
+#endif
diff --git a/xlators/storage/bd/src/bd-aio.h b/xlators/storage/bd/src/bd-aio.h
new file mode 100644
index 000000000..16f686a4c
--- /dev/null
+++ b/xlators/storage/bd/src/bd-aio.h
@@ -0,0 +1,41 @@
+/*
+ Copyright IBM, Corp. 2013
+
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+#ifndef _BD_AIO_H
+#define _BD_AIO_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "xlator.h"
+#include "glusterfs.h"
+
+/*
+ * Maximum number of concurrently submitted IO events. The heaviest load
+ * GlusterFS has been able to handle had 60-80 concurrent calls
+ */
+#define BD_AIO_MAX_NR_EVENTS 256
+
+/* Maximum number of completed IO operations to reap per getevents syscall */
+#define BD_AIO_MAX_NR_GETEVENTS 16
+
+int bd_aio_on (xlator_t *this);
+int bd_aio_off (xlator_t *this);
+
+int bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata);
+
+int bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+ struct iovec *vector, int32_t count, off_t offset,
+ uint32_t flags, struct iobref *iobref, dict_t *xdata);
+
+#endif /* !_BD_AIO_H */
diff --git a/xlators/storage/bd/src/bd-helper.c b/xlators/storage/bd/src/bd-helper.c
new file mode 100644
index 000000000..d598e5755
--- /dev/null
+++ b/xlators/storage/bd/src/bd-helper.c
@@ -0,0 +1,1021 @@
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+#include <lvm2app.h>
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#endif
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include "bd.h"
+#include "bd-mem-types.h"
+#include "run.h"
+#include "lvm-defaults.h"
+
+int
+bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx)
+{
+ int ret = -1;
+ uint64_t ctx_int = 0;
+
+ GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ GF_VALIDATE_OR_GOTO (this->name, ctx, out);
+
+ ctx_int = (long)ctx;
+ ret = inode_ctx_set (inode, this, &ctx_int);
+out:
+ return ret;
+}
+
+int
+bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx)
+{
+ int ret = -1;
+ uint64_t ctx_int = 0;
+
+ GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ ret = inode_ctx_get (inode, this, &ctx_int);
+ if (ret)
+ return ret;
+ if (ctx)
+ *ctx = (bd_attr_t *) ctx_int;
+out:
+ return ret;
+}
+
+void
+bd_local_free (xlator_t *this, bd_local_t *local)
+{
+ if (!local)
+ return;
+ if (local->fd)
+ fd_unref (local->fd);
+ else if (local->loc.path)
+ loc_wipe (&local->loc);
+ if (local->dict)
+ dict_unref (local->dict);
+ if (local->inode)
+ inode_unref (local->inode);
+ if (local->bdatt) {
+ GF_FREE (local->bdatt->type);
+ GF_FREE (local->bdatt);
+ }
+ mem_put (local);
+ local = NULL;
+}
+
+bd_local_t *
+bd_local_init (call_frame_t *frame, xlator_t *this)
+{
+ frame->local = mem_get0 (this->local_pool);
+ if (!frame->local)
+ return NULL;
+
+ return frame->local;
+}
+
+/*
+ * VG are set with the tag in GF_XATTR_VOL_ID_KEY:<uuid> format.
+ * This function validates this tag agains volume-uuid. Also goes
+ * through LV list to find out if a thin-pool is configured or not.
+ */
+int bd_scan_vg (xlator_t *this, bd_priv_t *priv)
+{
+ vg_t brick = NULL;
+ data_t *tmp_data = NULL;
+ struct dm_list *tags = NULL;
+ int op_ret = -1;
+ uuid_t dict_uuid = {0, };
+ uuid_t vg_uuid = {0, };
+ gf_boolean_t uuid = _gf_false;
+ lvm_str_list_t *strl = NULL;
+ struct dm_list *lv_dm_list = NULL;
+ lv_list_t *lv_list = NULL;
+ struct dm_list *dm_seglist = NULL;
+ lvseg_list_t *seglist = NULL;
+ lvm_property_value_t prop = {0, };
+ gf_boolean_t thin = _gf_false;
+ const char *lv_name = NULL;
+
+ brick = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+ if (!brick) {
+ gf_log (this->name, GF_LOG_CRITICAL, "VG %s is not found",
+ priv->vg);
+ return ENOENT;
+ }
+
+ lv_dm_list = lvm_vg_list_lvs (brick);
+ if (!lv_dm_list)
+ goto check;
+
+ dm_list_iterate_items (lv_list, lv_dm_list) {
+ dm_seglist = lvm_lv_list_lvsegs (lv_list->lv);
+ if (!dm_seglist)
+ continue;
+ dm_list_iterate_items (seglist, dm_seglist) {
+ prop = lvm_lvseg_get_property (seglist->lvseg,
+ "segtype");
+ if (!prop.is_valid || !prop.value.string)
+ continue;
+ if (!strcmp (prop.value.string, "thin-pool")) {
+ thin = _gf_true;
+ lv_name = lvm_lv_get_name (lv_list->lv);
+ priv->pool = gf_strdup (lv_name);
+ gf_log (THIS->name, GF_LOG_INFO, "Thin Pool "
+ "\"%s\" will be used for thin LVs",
+ lv_name);
+ break;
+ }
+ }
+ }
+
+check:
+ /* If there is no volume-id set in dict, we cant validate */
+ tmp_data = dict_get (this->options, "volume-id");
+ if (!tmp_data) {
+ op_ret = 0;
+ goto out;
+ }
+
+ op_ret = uuid_parse (tmp_data->data, dict_uuid);
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "wrong volume-id (%s) set in volume file",
+ tmp_data->data);
+ op_ret = -1;
+ goto out;
+ }
+
+ tags = lvm_vg_get_tags (brick);
+ if (!tags) { /* no tags in the VG */
+ gf_log (this->name, GF_LOG_ERROR,
+ "Extended attribute trusted.glusterfs."
+ "volume-id is absent");
+ op_ret = -1;
+ goto out;
+ }
+ dm_list_iterate_items (strl, tags) {
+ if (!strncmp (strl->str, GF_XATTR_VOL_ID_KEY,
+ strlen (GF_XATTR_VOL_ID_KEY))) {
+ uuid = _gf_true;
+ break;
+ }
+ }
+ /* UUID tag is not set in VG */
+ if (!uuid) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "Extended attribute trusted.glusterfs."
+ "volume-id is absent");
+ op_ret = -1;
+ goto out;
+ }
+
+ op_ret = uuid_parse (strl->str + strlen (GF_XATTR_VOL_ID_KEY) + 1,
+ vg_uuid);
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "wrong volume-id (%s) set in VG", strl->str);
+ op_ret = -1;
+ goto out;
+ }
+ if (uuid_compare (dict_uuid, vg_uuid)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "mismatching volume-id (%s) received. "
+ "already is a part of volume %s ",
+ tmp_data->data, vg_uuid);
+ op_ret = -1;
+ goto out;
+ }
+
+ op_ret = 0;
+
+out:
+ lvm_vg_close (brick);
+
+ if (!thin)
+ gf_log (THIS->name, GF_LOG_WARNING, "No thin pool found in "
+ "VG %s\n", priv->vg);
+ else
+ priv->caps |= BD_CAPS_THIN;
+
+ return op_ret;
+}
+
+/* FIXME: Move this code to common place, so posix and bd xlator can use */
+char *
+page_aligned_alloc (size_t size, char **aligned_buf)
+{
+ char *alloc_buf = NULL;
+ char *buf = NULL;
+
+ alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_common_mt_char);
+ if (!alloc_buf)
+ return NULL;
+ /* page aligned buffer */
+ buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE);
+ *aligned_buf = buf;
+
+ return alloc_buf;
+}
+
+static int
+__bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd_p)
+{
+ int ret = -1;
+ int _fd = -1;
+ char *devpath = NULL;
+ bd_fd_t *bdfd = NULL;
+ uint64_t tmp_bdfd = 0;
+ bd_priv_t *priv = this->private;
+ bd_gfid_t gfid = {0, };
+ bd_attr_t *bdatt = NULL;
+
+ /* not bd file */
+ if (fd->inode->ia_type != IA_IFREG ||
+ bd_inode_ctx_get (fd->inode, this, &bdatt))
+ return 0;
+
+ ret = __fd_ctx_get (fd, this, &tmp_bdfd);
+ if (ret == 0) {
+ bdfd = (void *)(long) tmp_bdfd;
+ *bdfd_p = bdfd;
+ return 0;
+ }
+
+ uuid_utoa_r (fd->inode->gfid, gfid);
+ gf_asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid);
+ if (!devpath)
+ goto out;
+
+ _fd = open (devpath, O_RDWR | O_LARGEFILE, 0);
+ if (_fd < 0) {
+ ret = errno;
+ gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath,
+ strerror (ret));
+ goto out;
+ }
+ bdfd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd);
+ BD_VALIDATE_MEM_ALLOC (bdfd, ret, out);
+
+ bdfd->fd = _fd;
+ bdfd->flag = O_RDWR | O_LARGEFILE;
+ if (__fd_ctx_set (fd, this, (uint64_t)(long)bdfd) < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to set the fd context fd=%p", fd);
+ goto out;
+ }
+
+ *bdfd_p = bdfd;
+
+ ret = 0;
+out:
+ GF_FREE (devpath);
+ if (ret) {
+ close (_fd);
+ GF_FREE (bdfd);
+ }
+ return ret;
+}
+
+int
+bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd)
+{
+ int ret;
+
+ /* FIXME: Is it ok to fd->lock here ? */
+ LOCK (&fd->lock);
+ {
+ ret = __bd_fd_ctx_get (this, fd, bdfd);
+ }
+ UNLOCK (&fd->lock);
+
+ return ret;
+}
+
+/*
+ * Validates if LV exists for given inode or not.
+ * Returns 0 if LV exists and size also matches.
+ * If LV does not exist -1 returned
+ * If LV size mismatches, returnes 1 also lv_size is updated with actual
+ * size
+ */
+int
+bd_validate_bd_xattr (xlator_t *this, char *bd, char **type,
+ uint64_t *lv_size, uuid_t uuid)
+{
+ char *path = NULL;
+ int ret = -1;
+ bd_gfid_t gfid = {0, };
+ bd_priv_t *priv = this->private;
+ struct stat stbuf = {0, };
+ uint64_t size = 0;
+ vg_t vg = NULL;
+ lv_t lv = NULL;
+ char *bytes = NULL;
+
+ bytes = strrchr (bd, ':');
+ if (bytes) {
+ *bytes = '\0';
+ bytes++;
+ gf_string2bytesize (bytes, &size);
+ }
+
+ if (strcmp (bd, BD_LV) && strcmp (bd, BD_THIN)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "invalid xattr %s", bd);
+ return -1;
+ }
+ *type = gf_strdup (bd);
+
+ /*
+ * Check if LV really exist, there could be a failure
+ * after setxattr and successful LV creation
+ */
+ uuid_utoa_r (uuid, gfid);
+ gf_asprintf (&path, "/dev/%s/%s", priv->vg, gfid);
+ if (!path) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "insufficient memory");
+ return 0;
+ }
+
+ /* Destination file does not exist */
+ if (stat (path, &stbuf)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "lstat failed for path %s", path);
+ return -1;
+ }
+
+ vg = lvm_vg_open (priv->handle, priv->vg, "r", 0);
+ if (!vg) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "VG %s does not exist?", priv->vg);
+ ret = -1;
+ goto out;
+ }
+
+ lv = lvm_lv_from_name (vg, gfid);
+ if (!lv) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "LV %s does not exist", gfid);
+ ret = -1;
+ goto out;
+ }
+
+ *lv_size = lvm_lv_get_size (lv);
+ if (size == *lv_size) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = 1;
+
+out:
+ if (vg)
+ lvm_vg_close (vg);
+
+ GF_FREE (path);
+ return ret;
+}
+
+static int
+create_thin_lv (char *vg, char *pool, char *lv, uint64_t extent)
+{
+ int ret = -1;
+ runner_t runner = {0, };
+ char *path = NULL;
+ struct stat stat = {0, };
+
+ runinit (&runner);
+ runner_add_args (&runner, LVM_CREATE, NULL);
+ runner_add_args (&runner, "--thin", NULL);
+ runner_argprintf (&runner, "%s/%s", vg, pool);
+ runner_add_args (&runner, "--name", NULL);
+ runner_argprintf (&runner, "%s", lv);
+ runner_add_args (&runner, "--virtualsize", NULL);
+ runner_argprintf (&runner, "%ldB", extent);
+ runner_start (&runner);
+ runner_end (&runner);
+
+ gf_asprintf (&path, "/dev/%s/%s", vg, lv);
+ if (!path) {
+ ret = ENOMEM;
+ goto out;
+ }
+ if (lstat (path, &stat) < 0)
+ ret = EAGAIN;
+ else
+ ret = 0;
+out:
+ GF_FREE (path);
+ return ret;
+}
+
+int
+bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv)
+{
+ int ret = 0;
+ vg_t vg = NULL;
+ bd_gfid_t gfid = {0, };
+
+ uuid_utoa_r (uuid, gfid);
+
+ if (!strcmp (type, BD_THIN))
+ return create_thin_lv (priv->vg, priv->pool, gfid,
+ size);
+
+ vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+ if (!vg) {
+ gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+ priv->vg);
+ return ENOENT;
+ }
+
+ if (!lvm_vg_create_lv_linear (vg, gfid, size)) {
+ gf_log (THIS->name, GF_LOG_WARNING, "lvm_vg_create_lv_linear "
+ "failed");
+ ret = errno;
+ }
+
+ lvm_vg_close (vg);
+
+ return ret;
+}
+
+int32_t
+bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size)
+{
+ uint64_t new_size = 0;
+ runner_t runner = {0, };
+ bd_gfid_t gfid = {0, };
+ int ret = 0;
+ vg_t vg = NULL;
+ lv_t lv = NULL;
+
+ uuid_utoa_r (uuid, gfid);
+
+ runinit (&runner);
+
+ runner_add_args (&runner, LVM_RESIZE, NULL);
+ runner_argprintf (&runner, "%s/%s", priv->vg, gfid);
+ runner_argprintf (&runner, "-L%ldb", size);
+ runner_add_args (&runner, "-f", NULL);
+
+ runner_start (&runner);
+ runner_end (&runner);
+
+ vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+ if (!vg) {
+ gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+ priv->vg);
+ return EAGAIN;
+ }
+
+ lv = lvm_lv_from_name (vg, gfid);
+ if (!lv) {
+ gf_log (THIS->name, GF_LOG_WARNING, "LV %s not found", gfid);
+ ret = EIO;
+ goto out;
+ }
+ new_size = lvm_lv_get_size (lv);
+
+ if (new_size != size) {
+ gf_log (THIS->name, GF_LOG_WARNING, "resized LV size %ld does "
+ "not match requested size %ld", new_size, size);
+ ret = EIO;
+ }
+
+out:
+ lvm_vg_close (vg);
+ return ret;
+}
+
+uint64_t
+bd_get_default_extent (bd_priv_t *priv)
+{
+ vg_t vg = NULL;
+ uint64_t size = 0;
+
+ vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+ if (!vg) {
+ gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+ priv->vg);
+ return 0;
+ }
+
+ size = lvm_vg_get_extent_size (vg);
+
+ lvm_vg_close (vg);
+
+ return size;
+}
+
+/*
+ * Adjusts the user specified size to VG specific extent size
+ */
+uint64_t
+bd_adjust_size (bd_priv_t *priv, uint64_t size)
+{
+ uint64_t extent = 0;
+ uint64_t nr_ex = 0;
+
+ extent = bd_get_default_extent (priv);
+ if (!extent)
+ return 0;
+
+ nr_ex = size / extent;
+ if (size % extent)
+ nr_ex++;
+
+ size = extent * nr_ex;
+
+ return size;
+}
+
+int
+bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno)
+{
+ vg_t vg = NULL;
+ lv_t lv = NULL;
+ int ret = -1;
+
+ *op_errno = 0;
+ vg = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+ if (!vg) {
+ gf_log (THIS->name, GF_LOG_WARNING, "opening VG %s failed",
+ priv->vg);
+ *op_errno = ENOENT;
+ return -1;
+ }
+ lv = lvm_lv_from_name (vg, lv_name);
+ if (!lv) {
+ gf_log (THIS->name, GF_LOG_WARNING, "No such LV %s", lv_name);
+ *op_errno = ENOENT;
+ goto out;
+ }
+ ret = lvm_vg_remove_lv (lv);
+ if (ret < 0) {
+ gf_log (THIS->name, GF_LOG_WARNING, "removing LV %s failed",
+ lv_name);
+ *op_errno = errno;
+ goto out;
+ }
+out:
+ lvm_vg_close (vg);
+
+ return ret;
+}
+
+inline void
+bd_update_amtime(struct iatt *iatt, int flag)
+{
+ struct timespec ts = {0, };
+
+ clock_gettime (CLOCK_REALTIME, &ts);
+ if (flag & GF_SET_ATTR_ATIME) {
+ iatt->ia_atime = ts.tv_sec;
+ iatt->ia_atime_nsec = ts.tv_nsec;
+ }
+ if (flag & GF_SET_ATTR_MTIME) {
+ iatt->ia_mtime = ts.tv_sec;
+ iatt->ia_mtime_nsec = ts.tv_nsec;
+ }
+}
+
+int
+bd_snapshot_create (bd_local_t *local, bd_priv_t *priv)
+{
+ char *path = NULL;
+ bd_gfid_t dest = {0, };
+ bd_gfid_t origin = {0, };
+ int ret = 0;
+ runner_t runner = {0, };
+ struct stat stat = {0, };
+
+ uuid_utoa_r (local->dloc->gfid, dest);
+ uuid_utoa_r (local->loc.gfid, origin);
+
+ gf_asprintf (&path, "/dev/%s/%s", priv->vg, dest);
+ if (!path) {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "Insufficient memory");
+ return ENOMEM;
+ }
+
+ runinit (&runner);
+ runner_add_args (&runner, LVM_CREATE, NULL);
+ runner_add_args (&runner, "--snapshot", NULL);
+ runner_argprintf (&runner, "/dev/%s/%s", priv->vg, origin);
+ runner_add_args (&runner, "--name", NULL);
+ runner_argprintf (&runner, "%s", dest);
+ if (strcmp (local->bdatt->type, BD_THIN))
+ runner_argprintf (&runner, "-L%ldB", local->size);
+ runner_start (&runner);
+ runner_end (&runner);
+
+ if (lstat (path, &stat) < 0)
+ ret = EIO;
+
+ GF_FREE (path);
+ return ret;
+}
+
+int
+bd_clone (bd_local_t *local, bd_priv_t *priv)
+{
+ int ret = ENOMEM;
+ int fd1 = -1;
+ int fd2 = -1;
+ int i = 0;
+ char *buff = NULL;
+ ssize_t bytes = 0;
+ char *spath = NULL;
+ char *dpath = NULL;
+ struct iovec *vec = NULL;
+ bd_gfid_t source = {0, };
+ bd_gfid_t dest = {0, };
+ void *bufp[IOV_NR] = {0, };
+
+ vec = GF_CALLOC (IOV_NR, sizeof (struct iovec), gf_common_mt_iovec);
+ if (!vec)
+ return ENOMEM;
+
+ for (i = 0; i < IOV_NR; i++) {
+ bufp[i] = page_aligned_alloc (IOV_SIZE, &buff);
+ if (!buff)
+ goto out;
+ vec[i].iov_base = buff;
+ vec[i].iov_len = IOV_SIZE;
+ }
+
+ uuid_utoa_r (local->loc.gfid, source);
+ uuid_utoa_r (local->dloc->gfid, dest);
+
+ gf_asprintf (&spath, "/dev/%s/%s", priv->vg, source);
+ gf_asprintf (&dpath, "/dev/%s/%s", priv->vg, dest);
+ if (!spath || !dpath)
+ goto out;
+
+ ret = bd_create (local->dloc->gfid, local->size,
+ local->bdatt->type, priv);
+ if (ret)
+ goto out;
+
+ fd1 = open (spath, O_RDONLY | O_DIRECT);
+ if (fd1 < 0) {
+ ret = errno;
+ goto out;
+ }
+ fd2 = open (dpath, O_WRONLY | O_DIRECT);
+ if (fd2 < 0) {
+ ret = errno;
+ goto out;
+ }
+
+ while (1) {
+ bytes = readv (fd1, vec, IOV_NR);
+ if (bytes < 0) {
+ ret = errno;
+ gf_log (THIS->name, GF_LOG_WARNING, "read failed: %s",
+ strerror (ret));
+ goto out;
+ }
+ if (!bytes)
+ break;
+ bytes = writev (fd2, vec, IOV_NR);
+ if (bytes < 0) {
+ ret = errno;
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "write failed: %s", strerror (ret));
+ goto out;
+ }
+ }
+ ret = 0;
+
+out:
+ for (i = 0; i < IOV_NR; i++)
+ GF_FREE (bufp[i]);
+ GF_FREE (vec);
+
+ if (fd1 != -1)
+ close (fd1);
+ if (fd2 != -1)
+ close (fd2);
+
+ GF_FREE (spath);
+ GF_FREE (dpath);
+
+ return ret;
+}
+
+/*
+ * Merges snapshot LV to origin LV and returns status
+ */
+int
+bd_merge (bd_priv_t *priv, uuid_t gfid)
+{
+ bd_gfid_t dest = {0, };
+ char *path = NULL;
+ struct stat stat = {0, };
+ runner_t runner = {0, };
+ int ret = 0;
+
+ uuid_utoa_r (gfid, dest);
+ gf_asprintf (&path, "/dev/%s/%s", priv->vg, dest);
+
+ runinit (&runner);
+ runner_add_args (&runner, LVM_CONVERT, NULL);
+ runner_add_args (&runner, "--merge", NULL);
+ runner_argprintf (&runner, "%s", path);
+ runner_start (&runner);
+ runner_end (&runner);
+
+ if (!lstat (path, &stat))
+ ret = EIO;
+
+ GF_FREE (path);
+
+ return ret;
+}
+
+int
+bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict)
+{
+ vg_t brick = NULL;
+ lvm_property_value_t prop = {0, };
+ lv_t lv = NULL;
+ int ret = -1;
+ bd_gfid_t gfid = {0, };
+ inode_t *inode = NULL;
+ char *origin = NULL;
+
+ brick = lvm_vg_open (priv->handle, priv->vg, "w", 0);
+ if (!brick) {
+ gf_log (THIS->name, GF_LOG_CRITICAL, "VG %s is not found",
+ priv->vg);
+ return ENOENT;
+ }
+
+ if (fd)
+ inode = fd->inode;
+ else
+ inode = loc->inode;
+
+ uuid_utoa_r (inode->gfid, gfid);
+ lv = lvm_lv_from_name (brick, gfid);
+ if (!lv) {
+ gf_log (THIS->name, GF_LOG_CRITICAL, "LV %s not found", gfid);
+ ret = ENOENT;
+ goto out;
+ }
+
+ prop = lvm_lv_get_property (lv, "origin");
+ if (!prop.is_valid || !prop.value.string) {
+ ret = ENODATA;
+ goto out;
+ }
+
+ origin = gf_strdup (prop.value.string);
+ ret = dict_set_dynstr (dict, BD_ORIGIN, origin);
+
+out:
+ lvm_vg_close (brick);
+ return ret;
+}
+
+#ifndef BLKZEROOUT
+
+int
+bd_do_manual_zerofill (int fd, off_t offset, off_t len, int o_direct)
+{
+ off_t num_vect = 0;
+ off_t num_loop = 1;
+ int idx = 0;
+ int op_ret = -1;
+ int vect_size = IOV_SIZE;
+ off_t remain = 0;
+ off_t extra = 0;
+ struct iovec *vector = NULL;
+ char *iov_base = NULL;
+ char *alloc_buf = NULL;
+
+ if (len == 0)
+ return 0;
+
+ if (len < IOV_SIZE)
+ vect_size = len;
+
+ num_vect = len / (vect_size);
+ remain = len % vect_size ;
+
+ if (num_vect > MAX_NO_VECT) {
+ extra = num_vect % MAX_NO_VECT;
+ num_loop = num_vect / MAX_NO_VECT;
+ num_vect = MAX_NO_VECT;
+ }
+
+ vector = GF_CALLOC (num_vect, sizeof(struct iovec),
+ gf_common_mt_iovec);
+ if (!vector)
+ return -1;
+
+ if (o_direct) {
+ alloc_buf = page_aligned_alloc (vect_size, &iov_base);
+ if (!alloc_buf) {
+ gf_log ("bd_do_manual_zerofill", GF_LOG_DEBUG,
+ "memory alloc failed, vect_size %d: %s",
+ vect_size, strerror (errno));
+ GF_FREE (vector);
+ return -1;
+ }
+ } else {
+ iov_base = GF_CALLOC (vect_size, sizeof(char),
+ gf_common_mt_char);
+ if (!iov_base) {
+ GF_FREE (vector);
+ return -1;
+ }
+ }
+
+ for (idx = 0; idx < num_vect; idx++) {
+ vector[idx].iov_base = iov_base;
+ vector[idx].iov_len = vect_size;
+ }
+
+ if (lseek (fd, offset, SEEK_SET) < 0) {
+ op_ret = -1;
+ goto err;
+ }
+
+ for (idx = 0; idx < num_loop; idx++) {
+ op_ret = writev (fd, vector, num_vect);
+ if (op_ret < 0)
+ goto err;
+ }
+ if (extra) {
+ op_ret = writev (fd, vector, extra);
+ if (op_ret < 0)
+ goto err;
+ }
+ if (remain) {
+ vector[0].iov_len = remain;
+ op_ret = writev (fd, vector , 1);
+ if (op_ret < 0)
+ goto err;
+ }
+ op_ret = 0;
+err:
+ if (o_direct)
+ GF_FREE (alloc_buf);
+ else
+ GF_FREE (iov_base);
+ GF_FREE (vector);
+ return op_ret;
+}
+
+#else
+
+/*
+ * Issue Linux ZEROOUT ioctl to write '0' to a scsi device at given offset
+ * and number of bytes. Each SCSI device's maximum write same bytes are exported
+ * in sysfs file. Sending ioctl request greater than this bytes results in slow
+ * performance. Read this file to get the maximum bytes and break down single
+ * ZEROOUT request into multiple ZEROOUT request not exceeding maximum bytes.
+ * From VG & LV name of device mapper identified and sysfs file read.
+ * /sys/block/<block-device>/queue/write_same_max_bytes
+ */
+int
+bd_do_ioctl_zerofill (bd_priv_t *priv, bd_attr_t *bdatt, int fd, char *vg,
+ off_t offset, off_t len)
+{
+ char *dm = NULL;
+ char dmname[4096] = {0, };
+ char lvname[4096] = {0, };
+ char sysfs[4096] = {0, };
+ bd_gfid_t uuid = {0, };
+ char *p = NULL;
+ off_t max_bytes = 0;
+ int sysfd = -1;
+ uint64_t param[2] = {0, 0};
+ off_t nr_loop = 0;
+ char buff[16] = {0, };
+
+ uuid_utoa_r (bdatt->iatt.ia_gfid, uuid);
+ sprintf (lvname, "/dev/%s/%s", vg, uuid);
+
+ readlink (lvname, dmname, sizeof (dmname));
+
+ p = strrchr (dmname, '/');
+ if (p)
+ dm = p + 1;
+ else
+ dm = dmname;
+
+ sprintf(sysfs, "/sys/block/%s/queue/write_same_max_bytes", dm);
+ sysfd = open (sysfs, O_RDONLY);
+ if (sysfd < 0) {
+ gf_log ("bd_do_ioctl_zerofill", GF_LOG_DEBUG,
+ "sysfs file %s does not exist", lvname);
+ goto skip;
+ }
+
+ read (sysfd, buff, sizeof (buff));
+ close (sysfd);
+
+ max_bytes = atoll (buff);
+
+skip:
+ /*
+ * If requested len is less than write_same_max_bytes,
+ * issue single ioctl to zeroout. Otherwise split the ioctls
+ */
+ if (!max_bytes || len <= max_bytes) {
+ param[0] = offset;
+ param[1] = len;
+
+ if (ioctl (fd, BLKZEROOUT, param) < 0)
+ return errno;
+ return 0;
+ }
+
+ /* Split ioctls to max write_same_max_bytes */
+ nr_loop = len / max_bytes;
+ for (; nr_loop; nr_loop--) {
+ param[0] = offset;
+ param[1] = max_bytes;
+
+ if (ioctl (fd, BLKZEROOUT, param) < 0)
+ return errno;
+
+ offset += max_bytes;
+ }
+
+ if (!(len % max_bytes))
+ return 0;
+
+ param[0] = offset;
+ param[1] = len % max_bytes;
+
+ if (ioctl (fd, BLKZEROOUT, param) < 0)
+ return errno;
+
+ return 0;
+}
+#endif
+
+int
+bd_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, off_t len, struct iatt *prebuf,
+ struct iatt *postbuf)
+{
+ int ret = -1;
+ bd_fd_t *bd_fd = NULL;
+ bd_priv_t *priv = this->private;
+ bd_attr_t *bdatt = NULL;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+ VALIDATE_OR_GOTO (priv, out);
+
+ ret = bd_fd_ctx_get (this, fd, &bd_fd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "bd_fd is NULL from fd=%p", fd);
+ goto out;
+ }
+
+ bd_inode_ctx_get (fd->inode, this, &bdatt);
+#ifndef BLKZEROOUT
+ ret = bd_do_manual_zerofill(bd_fd->fd, offset, len,
+ bd_fd->flag & O_DIRECT);
+#else
+ ret = bd_do_ioctl_zerofill(priv, bdatt, bd_fd->fd, priv->vg, offset,
+ len);
+#endif
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "zerofill failed on fd %d length %ld %s",
+ bd_fd->fd, len, strerror (ret));
+ goto out;
+ }
+
+ if (bd_fd->flag & (O_SYNC|O_DSYNC)) {
+ ret = fsync (bd_fd->fd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "fsync() in writev on fd %d failed: %s",
+ bd_fd->fd, strerror (errno));
+ return errno;
+ }
+ }
+
+ memcpy (&prebuf, &bdatt->iatt, sizeof (prebuf));
+ bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+ memcpy (&postbuf, &bdatt->iatt, sizeof (postbuf));
+
+out:
+
+ return ret;
+}
diff --git a/xlators/storage/bd/src/bd-mem-types.h b/xlators/storage/bd/src/bd-mem-types.h
new file mode 100644
index 000000000..58b448342
--- /dev/null
+++ b/xlators/storage/bd/src/bd-mem-types.h
@@ -0,0 +1,27 @@
+/*
+ Copyright (c) 2008-2014 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+
+#ifndef __BD_MEM_TYPES_H__
+#define __BD_MEM_TYPES_H__
+
+#include "mem-types.h"
+
+enum gf_bd_mem_types_ {
+ gf_bd_private = gf_common_mt_end + 1,
+ gf_bd_attr,
+ gf_bd_fd,
+ gf_bd_loc_t,
+ gf_bd_int32_t,
+ gf_bd_aio_cb,
+ gf_bd_mt_end
+};
+
+#endif
diff --git a/xlators/storage/bd/src/bd.c b/xlators/storage/bd/src/bd.c
new file mode 100644
index 000000000..750b00943
--- /dev/null
+++ b/xlators/storage/bd/src/bd.c
@@ -0,0 +1,2450 @@
+/*
+ BD translator V2 - Exports Block devices on server side as regular
+ files to client
+
+ Now only exporting Logical volumes supported.
+
+ Copyright IBM, Corp. 2013
+
+ This file is part of GlusterFS.
+
+ Author:
+ M. Mohan Kumar <mohan@in.ibm.com>
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+#include <lvm2app.h>
+#include <openssl/md5.h>
+#include <time.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#endif
+
+#include "bd.h"
+#include "bd-aio.h"
+#include "bd-mem-types.h"
+#include "defaults.h"
+#include "glusterfs3-xdr.h"
+#include "run.h"
+#include "protocol-common.h"
+#include "checksum.h"
+#include "syscall.h"
+#include "lvm-defaults.h"
+
+/*
+ * Call back function for setxattr and removexattr.
+ * does not do anything. FIXME: How to handle remove/setxattr failure
+ */
+int
+bd_null_rmsetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ STACK_DESTROY (frame->root);
+ return 0;
+}
+
+/*
+ * returns 0 if a file is mapped to BD or not.
+ */
+int
+bd_get_bd_info (call_frame_t *frame, xlator_t *this, dict_t *xattr, uuid_t gfid,
+ char **type, uint64_t *size)
+{
+ char *bd_xattr = NULL;
+ char *bd = NULL;
+ int ret = -1;
+ loc_t loc = {0, };
+ dict_t *dict = NULL;
+ char *p = NULL;
+ call_frame_t *bd_frame = NULL;
+
+ if (!xattr)
+ return 1;
+
+ if (dict_get_str (xattr, BD_XATTR, &p))
+ return 1;
+
+ bd_xattr = gf_strdup (p);
+
+ memcpy (loc.gfid, gfid, sizeof (uuid_t));
+
+ bd_frame = copy_frame (frame);
+ BD_VALIDATE_MEM_ALLOC (bd_frame, ret, out);
+
+ ret = bd_validate_bd_xattr (this, bd_xattr, type, size, gfid);
+ if (ret < 0) {/* LV does not exist */
+ STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->removexattr, &loc,
+ BD_XATTR, NULL);
+
+ gf_log (this->name, GF_LOG_WARNING,
+ "Mapped LV not available for posix file <gfid:%s>, "
+ "deleting mapping", uuid_utoa (gfid));
+ } else if (ret == 1) {
+ /* BD_XATTR size and LV size mismatch. Update BD_XATTR */
+ gf_asprintf (&bd, "%s:%ld", *type, *size);
+
+ dict = dict_new ();
+ BD_VALIDATE_MEM_ALLOC (dict, ret, out);
+
+ ret = dict_set_dynstr (dict, BD_XATTR, bd);
+ if (ret)
+ goto out;
+
+ STACK_WIND (bd_frame, bd_null_rmsetxattr_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->setxattr, &loc, dict, 0,
+ NULL);
+ }
+
+out:
+ dict_del (xattr, BD_XATTR);
+ GF_FREE (bd_xattr);
+ GF_FREE (bd);
+ return ret;
+}
+
+/*
+ * bd_lookup_cbk: Call back from posix_lookup.
+ */
+int32_t
+bd_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *buf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ int ret = -1;
+ bd_attr_t *bdatt = NULL;
+ uint64_t size = 0;
+ char *type = BD_TYPE_NONE;
+
+ /* only regular files are part of BD object */
+ if (op_ret < 0 || buf->ia_type != IA_IFREG)
+ goto out;
+
+ /* iatt already cached */
+ if (!bd_inode_ctx_get (inode, this, &bdatt))
+ goto next;
+
+ if (bd_get_bd_info (frame, this, xattr, buf->ia_gfid, &type, &size))
+ goto out;
+
+ /* BD file, update buf */
+ bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+ if (!bdatt) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ memcpy (&bdatt->iatt, buf, sizeof (struct iatt));
+ bdatt->type = type;
+
+ /* Cache LV size in inode_ctx */
+ ret = bd_inode_ctx_set (inode, this, bdatt);
+ if (ret < 0) {
+ GF_FREE (bdatt);
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ bdatt->iatt.ia_size = size;
+ bdatt->iatt.ia_blocks = size / 512;
+
+next:
+ dict_del (xattr, GF_CONTENT_KEY);
+ memcpy (buf, &bdatt->iatt, sizeof (struct iatt));
+
+out:
+ BD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf,
+ xattr, postparent);
+ return 0;
+}
+
+/*
+ * bd_lookup: Issues posix_lookup to find out if file is mapped to BD
+ * bd_lookup -> posix_lookup -> bd_lookup_cbk
+*/
+int32_t
+bd_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ dict_t *bd_xattr = NULL;
+ bd_attr_t *bdatt = NULL;
+ int op_errno = EINVAL;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (loc, out);
+ VALIDATE_OR_GOTO (loc->path, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ if (bd_inode_ctx_get (loc->inode, this, &bdatt) < 0) {
+ if (!xattr_req) {
+ bd_xattr = dict_new ();
+ BD_VALIDATE_MEM_ALLOC (bd_xattr, op_errno, out);
+ xattr_req = bd_xattr;
+ }
+ if (dict_set_int8 (xattr_req, BD_XATTR, 1) < 0)
+ goto out;
+ }
+
+ STACK_WIND (frame, bd_lookup_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->lookup, loc, xattr_req);
+
+ if (bd_xattr)
+ dict_unref (bd_xattr);
+ return 0;
+out:
+ BD_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+
+ return 0;
+}
+
+int
+bd_forget (xlator_t *this, inode_t *inode)
+{
+ int ret = -1;
+ uint64_t ctx = 0;
+ bd_attr_t *bdatt = NULL;
+
+ ret = bd_inode_ctx_get (inode, this, &bdatt);
+ if (!ret) {
+ inode_ctx_del (inode, this, &ctx);
+ GF_FREE (bdatt);
+ }
+ return 0;
+}
+
+int
+bd_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, gf_dirent_t *entries, dict_t *xdata)
+{
+ gf_dirent_t *entry = NULL;
+ uint64_t size = 0;
+ char *type = NULL;
+
+ if (op_ret < 0)
+ goto out;
+
+ list_for_each_entry (entry, &entries->list, list) {
+ if (entry->d_type != DT_REG)
+ continue;
+ if (!bd_get_bd_info (frame, this, entry->dict,
+ entry->d_stat.ia_gfid, &type, &size)) {
+ entry->d_stat.ia_size = size;
+ entry->d_stat.ia_blocks = size / 512;
+ GF_FREE (type);
+ }
+ }
+
+out:
+ BD_STACK_UNWIND (readdirp, frame, op_ret, op_errno, entries, xdata);
+ return 0;
+}
+
+/*
+ * bd_readdirp: In bd_readdirp_cbk if the file and BD_XATTR_SIZE is set
+ * ia_size is updated with the LV(BD_XATTR_SIZE) size
+ */
+int32_t
+bd_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *dict)
+{
+ int op_errno = EINVAL;
+ bd_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ if (!dict) {
+ local = bd_local_init (frame, this);
+ BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+ local->dict = dict_new ();
+ BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out);
+ dict = local->dict;
+ }
+
+ if (dict_set_int8 (dict, BD_XATTR, 0)) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to set key %s", BD_XATTR);
+ goto out;
+ }
+
+ STACK_WIND (frame, bd_readdirp_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readdirp, fd, size, off, dict);
+
+ return 0;
+out:
+ BD_STACK_UNWIND (readdirp, frame, -1, op_errno, NULL, dict);
+ return 0;
+}
+
+int
+bd_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *buf, dict_t *xdata)
+{
+ bd_local_t *local = frame->local;
+ bd_attr_t *bdatt = NULL;
+
+ /* only regular files are part of BD object */
+ if (op_ret < 0 || buf->ia_type != IA_IFREG)
+ goto out;
+
+ BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out);
+
+ /* update buf with LV size */
+ if (!bd_inode_ctx_get (local->inode, this, &bdatt))
+ memcpy (buf, bdatt, sizeof (struct iatt));
+
+out:
+ BD_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+int
+bd_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ int op_errno = EINVAL;
+ bd_local_t *local = NULL;
+ bd_attr_t *bdatt = NULL;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (loc, out);
+ VALIDATE_OR_GOTO (loc->path, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ if (!bd_inode_ctx_get (loc->inode, this, &bdatt)) {
+ BD_STACK_UNWIND (stat, frame, 0, 0, &bdatt->iatt, xdata);
+ return 0;
+ }
+
+ local = bd_local_init (frame, this);
+ BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+ local->inode = inode_ref (loc->inode);
+
+ STACK_WIND(frame, bd_stat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->stat, loc, xdata);
+ return 0;
+out:
+ BD_STACK_UNWIND (stat, frame, -1, op_errno, NULL, xdata);
+ return 0;
+}
+
+int
+bd_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct statvfs *buff, dict_t *xdata)
+{
+ uint64_t size = 0;
+ uint64_t fr_size = 0;
+ bd_priv_t *priv = NULL;
+ vg_t vg = NULL;
+
+ if (op_ret < 0)
+ goto out;
+
+ priv = this->private;
+
+ vg = lvm_vg_open (priv->handle, priv->vg, "r", 0);
+ if (!vg) {
+ gf_log (this->name, GF_LOG_WARNING, "opening VG %s failed",
+ priv->vg);
+ op_ret = -1;
+ op_errno = EAGAIN;
+ goto out;
+ }
+ size = lvm_vg_get_size (vg);
+ fr_size = lvm_vg_get_free_size (vg);
+ lvm_vg_close (vg);
+
+ buff->f_blocks += size / buff->f_frsize;
+ buff->f_bfree += fr_size / buff->f_frsize;
+ buff->f_bavail += fr_size / buff->f_frsize;
+
+out:
+ BD_STACK_UNWIND (statfs, frame, op_ret, op_errno, buff, xdata);
+ return 0;
+}
+
+/*
+ * bd_statfs: Mimics statfs by returning used/free extents in the VG
+ */
+int
+bd_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+ VALIDATE_OR_GOTO (loc, out);
+
+ STACK_WIND (frame, bd_statfs_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->statfs, loc, xdata);
+ return 0;
+out:
+ BD_STACK_UNWIND (statfs, frame, -1, EINVAL, NULL, NULL);
+ return 0;
+}
+
+int
+bd_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *buf, dict_t *xdata)
+{
+ bd_attr_t *bdatt = NULL;
+ bd_local_t *local = frame->local;
+
+ /* only regular files are part of BD object */
+ if (op_ret < 0 || buf->ia_type != IA_IFREG)
+ goto out;
+
+ BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out);
+
+ /* update buf with LV size */
+ if (!bd_inode_ctx_get (local->inode, this, &bdatt))
+ memcpy (buf, &bdatt->iatt, sizeof (struct iatt));
+
+out:
+ BD_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata);
+ return 0;
+}
+
+int
+bd_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ int op_errno = EINVAL;
+ bd_local_t *local = NULL;
+ bd_attr_t *bdatt = NULL;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ /* if its already cached return it */
+ if (!bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+ BD_STACK_UNWIND (fstat, frame, 0, 0, &bdatt->iatt, xdata);
+ return 0;
+ }
+
+ local = bd_local_init (frame, this);
+ BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+ local->inode = inode_ref (fd->inode);
+
+ STACK_WIND (frame, bd_fstat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat, fd, xdata);
+
+ return 0;
+out:
+ BD_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, xdata);
+ return 0;
+}
+
+/*
+ * bd_readv: If posix file, invokes posix_readv otherwise reads from the BD
+ * file
+ */
+int
+bd_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
+{
+ int ret = -1;
+ int _fd = -1;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ bd_fd_t *bd_fd = NULL;
+ struct iovec vec = {0, };
+ struct iobuf *iobuf = NULL;
+ struct iobref *iobref = NULL;
+ uint64_t bd_size = 0;
+ bd_attr_t *bdatt = NULL;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ ret = bd_fd_ctx_get (this, fd, &bd_fd);
+ if (ret < 0 || !bd_fd) {
+ STACK_WIND (frame, default_readv_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->readv,
+ fd, size, offset, flags, xdata);
+ return 0;
+ }
+ if (!size) {
+ op_errno = EINVAL;
+ gf_log (this->name, GF_LOG_WARNING, "size=%"GF_PRI_SIZET, size);
+ goto out;
+ }
+ iobuf = iobuf_get2 (this->ctx->iobuf_pool, size);
+ if (!iobuf) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ _fd = bd_fd->fd;
+ op_ret = pread (_fd, iobuf->ptr, size, offset);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "read failed on fd=%p: %s", fd,
+ strerror (op_errno));
+ goto out;
+ }
+
+ vec.iov_base = iobuf->ptr;
+ vec.iov_len = op_ret;
+
+ iobref = iobref_new ();
+ iobref_add (iobref, iobuf);
+
+ if (bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+ op_errno = EINVAL;
+ op_ret = -1;
+ goto out;
+ }
+ bd_size = bdatt->iatt.ia_size;
+ if (!bd_size || (offset + vec.iov_len) >= bd_size)
+ op_errno = ENOENT;
+
+ op_ret = vec.iov_len;
+ bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_ATIME);
+
+out:
+ BD_STACK_UNWIND (readv, frame, op_ret, op_errno,
+ &vec, 1, &bdatt->iatt, iobref, NULL);
+
+ if (iobref)
+ iobref_unref (iobref);
+ if (iobuf)
+ iobuf_unref (iobuf);
+
+ return 0;
+}
+
+#ifdef BLKDISCARD
+/*
+ * bd_discard: Sends BLKDISCARD ioctl to the block device
+ */
+int
+bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ int ret = -1;
+ int op_errno = EINVAL;
+ bd_fd_t *bd_fd = NULL;
+ uint64_t param[2] = {0, };
+ bd_attr_t *bdatt = NULL;
+ struct iatt prebuf = {0, };
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+ VALIDATE_OR_GOTO (fd, out);
+
+ /* posix */
+ if (bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+ STACK_WIND (frame, default_discard_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->discard,
+ fd, offset, len, xdata);
+ return 0;
+ }
+
+ ret = bd_fd_ctx_get (this, fd, &bd_fd);
+ if (ret < 0 || !bd_fd) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ param[0] = offset;
+ param[1] = len;
+ ret = ioctl (bd_fd->fd, BLKDISCARD, param);
+ if (ret < 0) {
+ if (errno == ENOTTY)
+ op_errno = ENOSYS;
+ else
+ op_errno = errno;
+ goto out;
+ }
+ memcpy (&prebuf, &bdatt->iatt, sizeof (prebuf));
+ bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+
+ BD_STACK_UNWIND (discard, frame, ret, op_errno, &prebuf,
+ &bdatt->iatt, xdata);
+ return 0;
+
+out:
+ BD_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+#else
+
+int
+bd_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ BD_STACK_UNWIND (discard, frame, -1, ENOSYS, NULL, NULL, NULL);
+ return 0;
+}
+#endif
+
+/*
+ * Call back from posix_open for opening the backing posix file
+ * If it failed, close BD fd
+ */
+int
+bd_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+{
+ bd_fd_t *bd_fd = NULL;
+ bd_attr_t *bdatt = NULL;
+
+ if (!op_ret)
+ goto out;
+
+ bd_inode_ctx_get (fd->inode, this, &bdatt);
+ if (!bdatt) /* posix file */
+ goto out;
+
+ /* posix open failed */
+ if (bd_fd_ctx_get (this, fd, &bd_fd) < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "bd_fd is NULL from fd=%p", fd);
+ goto out;
+ }
+ close (bd_fd->fd);
+ GF_FREE (bd_fd);
+
+out:
+ BD_STACK_UNWIND (open, frame, op_ret, op_errno, fd, NULL);
+
+ return 0;
+}
+
+/*
+ * bd_open: Opens BD file if given posix file is mapped to BD. Also opens
+ * posix file.
+ * fd contains both posix and BD fd
+ */
+int32_t
+bd_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
+{
+ int32_t ret = EINVAL;
+ bd_fd_t *bd_fd = NULL;
+ bd_attr_t *bdatt = NULL;
+ bd_gfid_t gfid = {0, };
+ char *devpath = NULL;
+ bd_priv_t *priv = this->private;
+ int _fd = -1;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+ VALIDATE_OR_GOTO (loc, out);
+ VALIDATE_OR_GOTO (fd, out);
+
+ /* not bd file */
+ if (fd->inode->ia_type != IA_IFREG ||
+ bd_inode_ctx_get (fd->inode, this, &bdatt))
+ goto posix;
+
+ uuid_utoa_r (fd->inode->gfid, gfid);
+ gf_asprintf (&devpath, "/dev/%s/%s", priv->vg, gfid);
+ BD_VALIDATE_MEM_ALLOC (devpath, ret, out);
+
+ _fd = open (devpath, flags | O_LARGEFILE, 0);
+ if (_fd < 0) {
+ ret = errno;
+ gf_log (this->name, GF_LOG_ERROR, "open on %s: %s", devpath,
+ strerror (ret));
+ goto out;
+ }
+ bd_fd = GF_CALLOC (1, sizeof(bd_fd_t), gf_bd_fd);
+ BD_VALIDATE_MEM_ALLOC (bd_fd, ret, out);
+
+ bd_fd->fd = _fd;
+ bd_fd->flag = flags | O_LARGEFILE;
+
+ if (fd_ctx_set (fd, this, (uint64_t)(long)bd_fd) < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "failed to set the fd context fd=%p", fd);
+ goto out;
+ }
+
+ ret = 0;
+
+posix:
+
+ /* open posix equivalant of this file, fd needed for fd related
+ operations like fsetxattr, ftruncate etc */
+ STACK_WIND (frame, bd_open_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata);
+
+ return 0;
+out:
+ BD_STACK_UNWIND (open, frame, -1, ret, fd, NULL);
+
+ GF_FREE (devpath);
+ if (ret) {
+ close (_fd);
+ GF_FREE (bd_fd);
+ }
+
+ return 0;
+}
+
+/*
+ * call back from posix_setattr after updating iatt to posix file.
+ */
+int
+bd_fsync_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ bd_local_t *local = frame->local;
+ bd_attr_t *bdatt = local->bdatt;
+
+ BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &bdatt->iatt,
+ &bdatt->iatt, NULL);
+ return 0;
+}
+
+int
+bd_do_fsync (int fd, int datasync)
+{
+ int op_errno = 0;
+
+ if (datasync) {
+ if (sys_fdatasync (fd)) {
+ op_errno = errno;
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "fdatasync on fd=%d failed: %s",
+ fd, strerror (errno));
+ }
+
+ } else
+
+ {
+ if (sys_fsync (fd)) {
+ op_errno = errno;
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "fsync on fd=%d failed: %s",
+ fd, strerror (op_errno));
+ }
+ }
+
+ return op_errno;
+}
+
+/*
+ * bd_fsync: Syncs if BD fd, forwards the request to posix
+ * fsync -> posix_setattr -> posix_fsync
+*/
+int32_t
+bd_fsync (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, int32_t datasync, dict_t *xdata)
+{
+ int ret = -1;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ bd_fd_t *bd_fd = NULL;
+ bd_priv_t *priv = NULL;
+ bd_attr_t *bdatt = NULL;
+ bd_local_t *local = NULL;
+ int valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
+ struct iatt prebuf = {0, };
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ ret = bd_inode_ctx_get (fd->inode, this, &bdatt);
+ ret = bd_fd_ctx_get (this, fd, &bd_fd);
+ if (ret < 0 || !bd_fd || !bdatt) {
+ STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->fsync, fd, datasync,
+ xdata);
+ return 0;
+ }
+
+ memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+
+ op_errno = bd_do_fsync (bd_fd->fd, datasync);
+ if (op_errno)
+ goto out;
+
+ /* For BD, Update the a|mtime during full fsync only */
+ if (!datasync) {
+ local = bd_local_init (frame, this);
+ /* In case of mem failure, should posix flush called ? */
+ BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+ local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+ BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+ local->bdatt->type = gf_strdup (bdatt->type);
+ memcpy (&local->bdatt->iatt, &bdatt->iatt, sizeof (struct iatt));
+ bd_update_amtime (&local->bdatt->iatt, valid);
+ uuid_copy (local->loc.gfid, fd->inode->gfid);
+ STACK_WIND (frame, bd_fsync_setattr_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->setattr, &local->loc,
+ &local->bdatt->iatt,
+ valid, NULL);
+ return 0;
+ }
+
+out:
+ BD_STACK_UNWIND (fsync, frame, op_ret, op_errno, &prebuf,
+ &bdatt->iatt, NULL);
+ return 0;
+}
+
+int
+bd_flush_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ BD_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int
+bd_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ int ret = -1;
+ bd_fd_t *bd_fd = NULL;
+ bd_priv_t *priv = NULL;
+ bd_attr_t *bdatt = NULL;
+ int valid = GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME;
+ bd_local_t *local = NULL;
+ int op_errno = EINVAL;
+ loc_t loc = {0, };
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+ VALIDATE_OR_GOTO (this->private, out);
+
+ priv = this->private;
+
+ ret = bd_inode_ctx_get (fd->inode, this, &bdatt);
+ if (!bdatt)
+ goto out;
+
+ ret = bd_fd_ctx_get (this, fd, &bd_fd);
+ if (ret < 0 || !bd_fd || !bdatt) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "bdfd/bdatt is NULL from fd=%p", fd);
+ goto out;
+ }
+
+ local = bd_local_init (frame, this);
+ BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+ local->fd = fd_ref (fd);
+ uuid_copy (loc.gfid, bdatt->iatt.ia_gfid);
+
+ /* Update the a|mtime during flush */
+ STACK_WIND (frame, bd_flush_setattr_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->setattr, &loc, &bdatt->iatt,
+ valid, NULL);
+
+ return 0;
+
+out:
+ STACK_WIND (frame, default_flush_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->flush, fd, xdata);
+
+ return 0;
+}
+
+int32_t
+bd_release (xlator_t *this, fd_t *fd)
+{
+ int ret = -1;
+ bd_fd_t *bd_fd = NULL;
+ uint64_t tmp_bfd = 0;
+ bd_attr_t *bdatt = NULL;
+ bd_priv_t *priv = this->private;
+
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+ VALIDATE_OR_GOTO (priv, out);
+
+ ret = bd_inode_ctx_get (fd->inode, this, &bdatt);
+ if (ret || !bdatt) /* posix file */
+ goto out;
+
+ /* FIXME: Update amtime during release */
+
+ ret = fd_ctx_del (fd, this, &tmp_bfd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "bfd is NULL from fd=%p", fd);
+ goto out;
+ }
+ bd_fd = (bd_fd_t *)(long)tmp_bfd;
+
+ close (bd_fd->fd);
+ GF_FREE (bd_fd);
+out:
+ return 0;
+}
+
+/*
+ * Call back for removexattr after removing BD_XATTR incase of
+ * bd create failure
+ */
+int
+bd_setx_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ bd_local_t *local = frame->local;
+
+ if (local->fd)
+ BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata);
+ else
+ BD_STACK_UNWIND (setxattr, frame, -1, EIO, xdata);
+ return 0;
+
+}
+
+/*
+ * Call back after setting BD_XATTR. Creates BD. If BD creation is a failure
+ * invokes posix_removexattr to remove created BD_XATTR
+ */
+int
+bd_setx_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ bd_local_t *local = frame->local;
+ bd_attr_t *bdatt = NULL;
+
+ if (op_ret < 0)
+ goto next;
+
+ /* Create LV */
+ op_errno = bd_create (local->inode->gfid, local->bdatt->iatt.ia_size,
+ local->bdatt->type, this->private);
+ if (!op_errno)
+ goto out;
+
+ /* LV creation failed, remove BD_XATTR */
+ if (local->fd)
+ STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fremovexattr,
+ local->fd, BD_XATTR, NULL);
+ else
+ STACK_WIND (frame, bd_setx_rm_xattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr,
+ &local->loc, BD_XATTR, NULL);
+
+ return 0;
+out:
+
+ bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+ if (!bdatt) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto next;
+ }
+
+ memcpy (&bdatt->iatt, &local->bdatt->iatt, sizeof (struct iatt));
+ bdatt->type = gf_strdup (local->bdatt->type);
+
+ bd_inode_ctx_set (local->inode, THIS, bdatt);
+
+next:
+ if (local->fd)
+ BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
+ else
+ BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+
+}
+
+/*
+ * Call back from posix_stat
+ */
+int
+bd_setx_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *iatt,
+ dict_t *xdata)
+{
+ char *param = NULL;
+ char *type = NULL;
+ char *s_size = NULL;
+ char *p = NULL;
+ char *copy = NULL;
+ bd_local_t *local = frame->local;
+ bd_priv_t *priv = this->private;
+ char *bd = NULL;
+ uint64_t size = 0;
+
+ if (op_ret < 0)
+ goto out;
+
+ if (!IA_ISREG (iatt->ia_type)) {
+ op_errno = EOPNOTSUPP;
+ goto out;
+ }
+
+ param = copy = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char);
+ BD_VALIDATE_MEM_ALLOC (param, op_errno, out);
+
+ strncpy (param, local->data->data, local->data->len);
+
+ type = strtok_r (param, ":", &p);
+ if (!type) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ if (strcmp (type, BD_LV) && strcmp (type, BD_THIN)) {
+ gf_log (this->name, GF_LOG_WARNING, "Invalid bd type %s given",
+ type);
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ if (!strcmp (type, BD_THIN) && !(priv->caps & BD_CAPS_THIN)) {
+ gf_log (this->name, GF_LOG_WARNING, "THIN lv not supported by "
+ "this volume");
+ op_errno = EOPNOTSUPP;
+ goto out;
+ }
+
+ s_size = strtok_r (NULL, ":", &p);
+
+ /* If size not specified get default size */
+ if (!s_size)
+ size = bd_get_default_extent (priv);
+ else
+ gf_string2bytesize (s_size, &size);
+
+ gf_asprintf (&bd, "%s:%ld", type, size);
+ BD_VALIDATE_MEM_ALLOC (bd, op_errno, out);
+
+ local->dict = dict_new ();
+ BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out);
+
+ local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+ BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+ if (dict_set_dynstr (local->dict, BD_XATTR, bd) < 0) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ local->bdatt->type = gf_strdup (type);
+ memcpy (&local->bdatt->iatt, iatt, sizeof (struct iatt));
+ local->bdatt->iatt.ia_size = size;
+
+ if (local->fd)
+ STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr,
+ local->fd, local->dict, 0, NULL);
+ else
+ STACK_WIND (frame, bd_setx_setx_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setxattr,
+ &local->loc, local->dict, 0, NULL);
+
+ return 0;
+
+out:
+ if (local->fd)
+ BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, xdata);
+ else
+ BD_STACK_UNWIND (setxattr, frame, -1, op_errno, xdata);
+
+ GF_FREE (bd);
+ GF_FREE (copy);
+ return 0;
+}
+
+int
+bd_offload_rm_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ bd_local_t *local = frame->local;
+
+ if (local->fd)
+ BD_STACK_UNWIND (fsetxattr, frame, -1, EIO, NULL);
+ else
+ BD_STACK_UNWIND (setxattr, frame, -1, EIO, NULL);
+
+ return 0;
+}
+
+int
+bd_offload_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ bd_local_t *local = frame->local;
+
+ if (op_ret < 0)
+ goto out;
+
+ if (local->offload == BD_OF_SNAPSHOT)
+ op_ret = bd_snapshot_create (frame->local, this->private);
+ else
+ op_ret = bd_clone (frame->local, this->private);
+
+ if (op_ret) {
+ STACK_WIND (frame, bd_offload_rm_xattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr,
+ local->dloc, BD_XATTR, NULL);
+ return 0;
+ }
+
+out:
+ if (local->fd)
+ BD_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL);
+ else
+ BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL);
+
+ return 0;
+}
+
+int
+bd_offload_getx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ char *bd = NULL;
+ bd_local_t *local = frame->local;
+ char *type = NULL;
+ char *p = NULL;
+
+ if (op_ret < 0)
+ goto out;
+
+ if (dict_get_str (xattr, BD_XATTR, &p)) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ type = gf_strdup (p);
+ BD_VALIDATE_MEM_ALLOC (type, op_errno, out);
+
+ p = strrchr (type, ':');
+ if (!p) {
+ op_errno = EINVAL;
+ gf_log (this->name, GF_LOG_WARNING,
+ "source file xattr %s corrupted?", type);
+ goto out;
+ }
+
+ *p='\0';
+
+ /* For clone size is taken from source LV */
+ if (!local->size) {
+ p++;
+ gf_string2bytesize (p, &local->size);
+ }
+ gf_asprintf (&bd, "%s:%ld", type, local->size);
+ local->bdatt->type = gf_strdup (type);
+ dict_del (local->dict, BD_XATTR);
+ dict_del (local->dict, LINKTO);
+ if (dict_set_dynstr (local->dict, BD_XATTR, bd)) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ STACK_WIND (frame, bd_offload_setx_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setxattr,
+ local->dloc, local->dict, 0, NULL);
+
+ return 0;
+
+out:
+ if (local->fd)
+ BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+ else
+ BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+
+ GF_FREE (type);
+ GF_FREE (bd);
+
+ return 0;
+}
+
+int
+bd_offload_dest_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno,
+ inode_t *inode, struct iatt *iatt,
+ dict_t *xattr, struct iatt *postparent)
+{
+ bd_local_t *local = frame->local;
+ char *bd = NULL;
+ int ret = -1;
+ char *linkto = NULL;
+
+ if (op_ret < 0 && op_errno != ENODATA) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ if (!IA_ISREG (iatt->ia_type)) {
+ op_errno = EINVAL;
+ gf_log (this->name, GF_LOG_WARNING, "destination gfid is not a "
+ "regular file");
+ goto out;
+ }
+
+ ret = dict_get_str (xattr, LINKTO, &linkto);
+ if (linkto) {
+ op_errno = EINVAL;
+ gf_log (this->name, GF_LOG_WARNING, "destination file not "
+ "present in same brick");
+ goto out;
+ }
+
+ ret = dict_get_str (xattr, BD_XATTR, &bd);
+ if (bd) {
+ op_errno = EEXIST;
+ goto out;
+ }
+
+ local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+ BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+ STACK_WIND (frame, bd_offload_getx_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr,
+ &local->loc, BD_XATTR, NULL);
+
+ return 0;
+out:
+ if (local->fd)
+ BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+ else
+ BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+int
+bd_merge_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ /* FIXME: if delete failed, remove xattr */
+
+ BD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL);
+ return 0;
+}
+
+int
+bd_do_merge(call_frame_t *frame, xlator_t *this)
+{
+ bd_local_t *local = frame->local;
+ inode_t *parent = NULL;
+ char *p = NULL;
+ int op_errno = 0;
+
+ op_errno = bd_merge (this->private, local->inode->gfid);
+ if (op_errno)
+ goto out;
+
+ /*
+ * posix_unlink needs loc->pargfid to be valid, but setxattr FOP does
+ * not have loc->pargfid set. Get parent's gfid by getting parents inode
+ */
+ parent = inode_parent (local->inode, NULL, NULL);
+ if (!parent) {
+ /*
+ * FIXME: Snapshot LV already deleted.
+ * remove xattr, instead of returning failure
+ */
+ op_errno = EINVAL;
+ goto out;
+ }
+ uuid_copy (local->loc.pargfid, parent->gfid);
+
+ p = strrchr (local->loc.path, '/');
+ if (p)
+ p++;
+ local->loc.name = p;
+
+ STACK_WIND (frame, bd_merge_unlink_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->unlink,
+ &local->loc, 0, NULL);
+
+ return 0;
+out:
+ BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+
+ return op_errno;
+}
+
+int
+bd_offload (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ fd_t *fd, bd_offload_t offload)
+{
+ char *param = NULL;
+ char *param_copy = NULL;
+ char *p = NULL;
+ char *size = NULL;
+ char *gfid = NULL;
+ int op_errno = 0;
+ bd_local_t *local = frame->local;
+
+ param = GF_CALLOC (1, local->data->len + 1, gf_common_mt_char);
+ BD_VALIDATE_MEM_ALLOC (param, op_errno, out);
+ param_copy = param;
+
+ local->dict = dict_new ();
+ BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out);
+
+ local->dloc = GF_CALLOC (1, sizeof (loc_t), gf_bd_loc_t);
+ BD_VALIDATE_MEM_ALLOC (local->dloc, op_errno, out);
+
+ strncpy (param, local->data->data, local->data->len);
+
+ gfid = strtok_r (param, ":", &p);
+ size = strtok_r (NULL, ":", &p);
+ if (size)
+ gf_string2bytesize (size, &local->size);
+ else if (offload != BD_OF_CLONE)
+ local->size = bd_get_default_extent (this->private);
+
+ if (dict_set_int8 (local->dict, BD_XATTR, 1) < 0) {
+ op_errno = EINVAL;
+ goto out;
+ }
+ if (dict_set_int8 (local->dict, LINKTO, 1) < 0) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ uuid_parse (gfid, local->dloc->gfid);
+ local->offload = offload;
+
+ STACK_WIND (frame, bd_offload_dest_lookup_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->lookup, local->dloc,
+ local->dict);
+
+ return 0;
+
+out:
+ if (fd)
+ BD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+ else
+ BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+
+ GF_FREE (param_copy);
+ return 0;
+}
+
+/*
+ * bd_setxattr: Used to create & map an LV to a posix file using
+ * BD_XATTR xattr
+ * bd_setxattr -> posix_stat -> bd_setx_stat_cbk -> posix_setxattr ->
+ * bd_setx_setx_cbk -> create_lv
+ * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk
+ */
+int
+bd_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int flags, dict_t *xdata)
+{
+ int op_errno = 0;
+ data_t *data = NULL;
+ bd_local_t *local = NULL;
+ bd_attr_t *bdatt = NULL;
+ bd_offload_t cl_type = BD_OF_NONE;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+
+ if ((data = dict_get (dict, BD_XATTR)))
+ cl_type = BD_OF_NONE;
+ else if ((data = dict_get (dict, BD_CLONE)))
+ cl_type = BD_OF_CLONE;
+ else if ((data = dict_get (dict, BD_SNAPSHOT)))
+ cl_type = BD_OF_SNAPSHOT;
+ else if ((data = dict_get (dict, BD_MERGE)))
+ cl_type = BD_OF_MERGE;
+
+ bd_inode_ctx_get (loc->inode, this, &bdatt);
+ if (!cl_type && !data) {
+ STACK_WIND (frame, default_setxattr_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->setxattr, loc, dict,
+ flags, xdata);
+ return 0;
+ }
+
+ local = bd_local_init (frame, this);
+ BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+ local->data = data;
+ loc_copy (&local->loc, loc);
+ local->inode = inode_ref (loc->inode);
+
+ if (cl_type) {
+ /* For cloning/snapshot, source file must be mapped to LV */
+ if (!bdatt) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s not mapped to BD", loc->path);
+ op_errno = EINVAL;
+ goto out;
+ }
+ if (cl_type == BD_OF_MERGE)
+ bd_do_merge (frame, this);
+ else
+ bd_offload (frame, this, loc, NULL, cl_type);
+ } else if (data) {
+ if (bdatt) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "%s already mapped to BD", loc->path);
+ op_errno = EEXIST;
+ goto out;
+ }
+ STACK_WIND (frame, bd_setx_stat_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->stat, loc, xdata);
+ }
+
+ return 0;
+out:
+ if (op_errno)
+ STACK_UNWIND_STRICT (setxattr, frame, -1, op_errno, xdata);
+
+ return 0;
+}
+
+/*
+ * bd_fsetxattr: Used to create/map an LV to a posix file using
+ * BD_XATTR xattr
+ * bd_fsetxattr -> posix_fstat -> bd_setx_stat_cbk -> posix_fsetxattr ->
+ * bd_setx_setx_cbk -> create_lv
+ * if create_lv failed, posix_removexattr -> bd_setx_rm_xattr_cbk
+ * -> bd_fsetxattr_cbk
+ */
+int32_t
+bd_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int flags, dict_t *xdata)
+{
+ int op_errno = 0;
+ data_t *data = NULL;
+ bd_attr_t *bdatt = NULL;
+ bd_local_t *local = NULL;
+ bd_offload_t cl_type = BD_OF_NONE;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (this->private, out);
+ VALIDATE_OR_GOTO (fd, out);
+
+ bd_inode_ctx_get (fd->inode, this, &bdatt);
+
+ data = dict_get (dict, BD_XATTR);
+ if ((data = dict_get (dict, BD_XATTR)))
+ cl_type = BD_OF_NONE;
+ else if ((data = dict_get (dict, BD_CLONE)))
+ cl_type = BD_OF_CLONE;
+ else if ((data = dict_get (dict, BD_SNAPSHOT)))
+ cl_type = BD_OF_SNAPSHOT;
+ else if ((data = dict_get (dict, BD_MERGE))) {
+ /*
+ * bd_merge is not supported for fsetxattr, because snapshot LV
+ * is opened and it causes problem in snapshot merge
+ */
+ op_errno = EOPNOTSUPP;
+ goto out;
+ }
+
+ bd_inode_ctx_get (fd->inode, this, &bdatt);
+
+ if (!cl_type && !data) {
+ /* non bd file object */
+ STACK_WIND (frame, default_fsetxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr,
+ fd, dict, flags, xdata);
+ return 0;
+ }
+
+ local = bd_local_init (frame, this);
+ BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+ local->inode = inode_ref (fd->inode);
+ local->fd = fd_ref (fd);
+ local->data = data;
+
+ if (cl_type) {
+ /* For cloning/snapshot, source file must be mapped to LV */
+ if (!bdatt) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "fd %p not mapped to BD", fd);
+ op_errno = EINVAL;
+ goto out;
+
+ }
+ bd_offload (frame, this, NULL, fd, cl_type);
+ } else if (data) {
+ if (bdatt) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "fd %p already mapped to BD", fd);
+ op_errno = EEXIST;
+ goto out;
+ }
+ STACK_WIND(frame, bd_setx_stat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat, fd, xdata);
+ }
+
+ return 0;
+out:
+
+ BD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+
+ return 0;
+}
+
+int32_t
+bd_removexattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name, dict_t *xdata)
+{
+ if (!strcmp (name, BD_XATTR))
+ goto out;
+
+ STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
+ return 0;
+out:
+ BD_STACK_UNWIND (removexattr, frame, -1, ENODATA, NULL);
+ return 0;
+}
+
+int32_t
+bd_fremovexattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata)
+{
+ if (!strcmp (name, BD_XATTR))
+ goto out;
+
+ STACK_WIND (frame, default_removexattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
+
+ return 0;
+out:
+ BD_STACK_UNWIND (fremovexattr, frame, -1, ENODATA, NULL);
+ return 0;
+}
+
+int
+bd_trunc_setxattr_setx_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ bd_local_t *local = frame->local;
+
+ if (local->fd)
+ BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL);
+ else
+ BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL);
+
+ return 0;
+}
+
+/*
+ * Call back for setxattr after setting BD_XATTR_SIZE.
+ */
+int
+bd_trunc_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ bd_local_t *local = frame->local;
+ bd_attr_t *bdatt = NULL;
+ struct iatt prebuf = {0, };
+ char *bd = NULL;
+
+ if (op_ret < 0)
+ goto out;
+
+ bd_inode_ctx_get (local->inode, this, &bdatt);
+ if (!bdatt)
+ goto revert_xattr;
+
+ op_errno = bd_resize (this->private, local->inode->gfid,
+ local->bdatt->iatt.ia_size);
+ if (op_errno)
+ goto revert_xattr;
+
+ memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+ /* LV resized, update new size in the cache */
+ bdatt->iatt.ia_size = local->bdatt->iatt.ia_size;
+
+ if (local->fd)
+ BD_STACK_UNWIND (ftruncate, frame, 0, 0, &prebuf, &bdatt->iatt,
+ NULL);
+ else
+ BD_STACK_UNWIND (truncate, frame, 0, 0, &prebuf, &bdatt->iatt,
+ NULL);
+
+ return 0;
+
+revert_xattr:
+ /* revert setxattr */
+ op_ret = dict_get_str (local->dict, BD_XATTR, &bd);
+ GF_FREE (bd);
+ gf_asprintf (&bd, "%s:%ld", bdatt->type, bdatt->iatt.ia_size);
+
+ if (local->fd)
+ STACK_WIND (frame, bd_trunc_setxattr_setx_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr,
+ local->fd, local->dict, 0, NULL);
+ else
+ STACK_WIND (frame, bd_trunc_setxattr_setx_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setxattr,
+ &local->loc, local->dict, 0, NULL);
+
+ return 0;
+out:
+ if (local->fd)
+ BD_STACK_UNWIND (ftruncate, frame, -1, EIO, NULL, NULL, NULL);
+ else
+ BD_STACK_UNWIND (truncate, frame, -1, EIO, NULL, NULL, NULL);
+
+ return 0;
+}
+
+/*
+ * call back from posix_[f]truncate_stat
+ * If offset > LV size, it resizes the LV and calls posix_setxattr
+ * to update new LV size in xattr else calls posix_setattr for updating
+ * the posix file so that truncate fop behaves properly
+ */
+int
+bd_trunc_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *buf, dict_t *xdata)
+{
+ char *bd = NULL;
+ bd_local_t *local = frame->local;
+ bd_attr_t *bdatt = NULL;
+
+ if (op_ret < 0)
+ goto out;
+
+ local->dict = dict_new ();
+ BD_VALIDATE_MEM_ALLOC (local->dict, op_errno, out);
+
+ bd_inode_ctx_get (local->inode, this, &bdatt);
+ if (!bdatt) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ gf_asprintf (&bd, "%s:%ld", bdatt->type, local->bdatt->iatt.ia_size);
+ if (dict_set_dynstr (local->dict, BD_XATTR, bd)) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ if (local->fd)
+ STACK_WIND (frame, bd_trunc_setxattr_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fsetxattr,
+ local->fd, local->dict, 0, NULL);
+ else
+ STACK_WIND (frame, bd_trunc_setxattr_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setxattr,
+ &local->loc, local->dict, 0, NULL);
+
+ return 0;
+out:
+ if (local->fd)
+ BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL,
+ NULL);
+ else
+ BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL,
+ NULL);
+ GF_FREE (bd);
+ return 0;
+}
+
+void
+bd_do_trunc (call_frame_t *frame, xlator_t *this, fd_t *fd, loc_t *loc,
+ off_t offset, bd_attr_t *bdatt)
+{
+ bd_local_t *local = NULL;
+ struct iatt prebuf = {0, };
+ int op_errno = 0;
+ int op_ret = -1;
+
+ /* If requested size is less than LV size, return success */
+ if (offset <= bdatt->iatt.ia_size) {
+ memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+ bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+ op_ret = 0;
+ goto out;
+ }
+
+ local = bd_local_init (frame, this);
+ BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+ local->bdatt = GF_CALLOC (1, sizeof (bd_attr_t), gf_bd_attr);
+ BD_VALIDATE_MEM_ALLOC (local->bdatt, op_errno, out);
+
+ if (fd) {
+ local->inode = inode_ref (fd->inode);
+ local->fd = fd_ref (fd);
+ } else {
+ local->inode = inode_ref (loc->inode);
+ loc_copy (&local->loc, loc);
+ }
+
+ local->bdatt->iatt.ia_size =
+ bd_adjust_size (this->private, offset);
+
+ STACK_WIND (frame, bd_trunc_stat_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fstat, fd, NULL);
+
+ return;
+
+out:
+ if (fd)
+ BD_STACK_UNWIND (ftruncate, frame, op_ret, op_errno,
+ &prebuf, &bdatt->iatt, NULL);
+ else
+ BD_STACK_UNWIND (truncate, frame, op_ret, op_errno,
+ &prebuf, &bdatt->iatt, NULL);
+ return;
+}
+
+/*
+ * bd_ftruncate: Resizes a LV if fd belongs to BD.
+ */
+int32_t
+bd_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
+{
+ int op_errno = 0;
+ bd_attr_t *bdatt = NULL;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+
+ if (bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+ STACK_WIND (frame, default_ftruncate_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ftruncate, fd,
+ offset, xdata);
+ return 0;
+ }
+
+ bd_do_trunc (frame, this, fd, NULL, offset, bdatt);
+ return 0;
+out:
+ BD_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+/*
+ * bd_truncate: Resizes a LV if file maps to LV.
+ */
+int32_t
+bd_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
+{
+ int op_errno = 0;
+ bd_attr_t *bdatt = NULL;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (loc, out);
+
+ if (bd_inode_ctx_get (loc->inode, this, &bdatt)) {
+ STACK_WIND (frame, default_truncate_cbk,
+ FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->truncate, loc,
+ offset, xdata);
+ return 0;
+ }
+
+ bd_do_trunc (frame, this, NULL, loc, offset, bdatt);
+ return 0;
+
+out:
+ BD_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int32_t
+__bd_pwritev (int fd, struct iovec *vector, int count, off_t offset,
+ uint64_t bd_size)
+{
+ int index = 0;
+ int retval = 0;
+ off_t internal_offset = 0;
+
+ if (!vector)
+ return -EFAULT;
+
+ retval = pwritev (fd, vector, count, offset);
+ if (retval == -1) {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "base %p, length %ld, offset %ld, message %s",
+ vector[index].iov_base, vector[index].iov_len,
+ internal_offset, strerror (errno));
+ retval = -errno;
+ goto err;
+ }
+/*
+
+
+ internal_offset = offset;
+ for (index = 0; index < count; index++) {
+ if (internal_offset > bd_size) {
+ op_ret = -ENOSPC;
+ goto err;
+ }
+ if (internal_offset + vector[index].iov_len > bd_size) {
+ vector[index].iov_len = bd_size - internal_offset;
+ no_space = 1;
+ }
+ retval = pwritev (fd, vector[index].iov_base,
+ vector[index].iov_len, internal_offset);
+ if (retval == -1) {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "base %p, length %ld, offset %ld, message %s",
+ vector[index].iov_base, vector[index].iov_len,
+ internal_offset, strerror (errno));
+ op_ret = -errno;
+ goto err;
+ }
+ op_ret += retval;
+ internal_offset += retval;
+ if (no_space)
+ break;
+ }
+*/
+err:
+ return retval;
+}
+
+/*
+ * bd_writev: Writes to LV if its BD file or forwards the request to posix_write
+ * bd_writev -> posix_writev -> bd_writev_cbk
+ */
+int
+bd_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdict)
+{
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ int _fd = -1;
+ bd_fd_t *bd_fd = NULL;
+ int ret = -1;
+ uint64_t size = 0;
+ struct iatt prebuf = {0, };
+ bd_attr_t *bdatt = NULL;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+ VALIDATE_OR_GOTO (vector, out);
+
+ ret = bd_fd_ctx_get (this, fd, &bd_fd);
+ if (ret < 0 || !bd_fd) { /* posix fd */
+ STACK_WIND (frame, default_writev_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->writev, fd, vector, count,
+ offset, flags, iobref, xdict);
+ return 0;
+ }
+
+ _fd = bd_fd->fd;
+
+ if (bd_inode_ctx_get (fd->inode, this, &bdatt)) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+ size = bdatt->iatt.ia_size;
+
+ op_ret = __bd_pwritev (_fd, vector, count, offset, size);
+ if (op_ret < 0) {
+ op_errno = -op_ret;
+ op_ret = -1;
+ gf_log (this->name, GF_LOG_ERROR, "write failed: offset %"PRIu64
+ ", %s", offset, strerror (op_errno));
+ goto out;
+ }
+
+ memcpy (&prebuf, &bdatt->iatt, sizeof (struct iatt));
+ bd_update_amtime (&bdatt->iatt, GF_SET_ATTR_MTIME);
+out:
+
+ BD_STACK_UNWIND (writev, frame, op_ret, op_errno, &prebuf,
+ &bdatt->iatt, NULL);
+ return 0;
+}
+
+int
+bd_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
+{
+ bd_attr_t *bdatt = NULL;
+ int *valid = cookie;
+ bd_local_t *local = frame->local;
+
+ if (op_ret < 0 || !valid || !local)
+ goto out;
+
+ if (bd_inode_ctx_get (local->inode, this, &bdatt))
+ goto out;
+
+ if (*valid & GF_SET_ATTR_UID)
+ bdatt->iatt.ia_uid = postbuf->ia_uid;
+ else if (*valid & GF_SET_ATTR_GID)
+ bdatt->iatt.ia_gid = postbuf->ia_gid;
+ else if (*valid & GF_SET_ATTR_MODE) {
+ bdatt->iatt.ia_type = postbuf->ia_type;
+ bdatt->iatt.ia_prot = postbuf->ia_prot;
+ } else if (*valid & GF_SET_ATTR_ATIME) {
+ bdatt->iatt.ia_atime = postbuf->ia_atime;
+ bdatt->iatt.ia_atime_nsec = postbuf->ia_atime_nsec;
+ } else if (*valid & GF_SET_ATTR_MTIME) {
+ bdatt->iatt.ia_mtime = postbuf->ia_mtime;
+ bdatt->iatt.ia_mtime_nsec = postbuf->ia_mtime_nsec;
+ }
+
+ bdatt->iatt.ia_ctime = postbuf->ia_ctime;
+ bdatt->iatt.ia_ctime_nsec = postbuf->ia_ctime_nsec;
+
+ memcpy (postbuf, &bdatt->iatt, sizeof (struct iatt));
+out:
+ GF_FREE (valid);
+ BD_STACK_UNWIND (setattr, frame, op_ret, op_errno, prebuf,
+ postbuf, xdata);
+ return 0;
+}
+
+int
+bd_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata)
+{
+ bd_local_t *local = NULL;
+ bd_attr_t *bdatt = NULL;
+ int *ck_valid = NULL;
+ int op_errno = 0;
+
+ if (bd_inode_ctx_get (loc->inode, this, &bdatt)) {
+ STACK_WIND(frame, default_setattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setattr,
+ loc, stbuf, valid, xdata);
+ return 0;
+ }
+
+ local = bd_local_init (frame, this);
+ BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+ ck_valid = GF_CALLOC (1, sizeof (valid), gf_bd_int32_t);
+ BD_VALIDATE_MEM_ALLOC (ck_valid, op_errno, out);
+
+ local->inode = inode_ref (loc->inode);
+ *ck_valid = valid;
+
+ STACK_WIND_COOKIE (frame, bd_setattr_cbk, ck_valid, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->setattr,
+ loc, stbuf, valid, xdata);
+
+ return 0;
+out:
+ BD_STACK_UNWIND (setattr, frame, -1, ENOMEM, NULL, NULL, xdata);
+ return 0;
+}
+
+int
+bd_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+ bd_attr_t *bdatt = NULL;
+
+ if (op_ret < 0)
+ goto out;
+
+ if (bd_inode_ctx_get (inode, this, &bdatt))
+ goto out;
+
+ bdatt->iatt.ia_ctime = buf->ia_ctime;
+ bdatt->iatt.ia_ctime_nsec = buf->ia_ctime_nsec;
+ bdatt->iatt.ia_nlink = buf->ia_nlink;
+ memcpy (buf, &bdatt->iatt, sizeof (struct iatt));
+
+out:
+ BD_STACK_UNWIND (link, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, NULL);
+ return 0;
+}
+
+int
+bd_link (call_frame_t *frame, xlator_t *this,
+ loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+{
+ STACK_WIND (frame, bd_link_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+ return 0;
+}
+
+int
+bd_handle_special_xattrs (call_frame_t *frame, xlator_t *this, loc_t *loc,
+ fd_t *fd, const char *name, dict_t *xdata)
+{
+ dict_t *xattr = NULL;
+ int op_ret = -1;
+ int op_errno = ENOMEM;;
+ bd_priv_t *priv = this->private;
+
+ xattr = dict_new ();
+ if (!xattr)
+ goto out;
+
+ if (!strcmp (name, VOL_TYPE))
+ op_ret = dict_set_int64 (xattr, (char *)name, 1);
+ else if (!strcmp (name, VOL_CAPS))
+ op_ret = dict_set_int64 (xattr, (char *)name, priv->caps);
+ else
+ op_ret = bd_get_origin (this->private, loc, fd, xattr);
+
+out:
+ if (loc)
+ BD_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr,
+ xdata);
+ else
+ BD_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr,
+ xdata);
+
+ op_ret = dict_reset (xattr);
+ dict_unref (xattr);
+
+ return 0;
+}
+
+int
+bd_fgetxattr (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, const char *name, dict_t *xdata)
+{
+ if (name && (!strcmp (name, VOL_TYPE) || !strcmp (name, VOL_CAPS)
+ || !strcmp (name, BD_ORIGIN)))
+ bd_handle_special_xattrs (frame, this, NULL, fd, name, xdata);
+ else
+ STACK_WIND (frame, default_fgetxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr,
+ fd, name, xdata);
+ return 0;
+}
+
+int
+bd_getxattr (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, const char *name, dict_t *xdata)
+{
+ if (name && (!strcmp (name, VOL_TYPE) || !strcmp (name, VOL_CAPS)
+ || !strcmp (name, BD_ORIGIN)))
+ bd_handle_special_xattrs (frame, this, loc, NULL, name, xdata);
+ else
+ STACK_WIND (frame, default_getxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr,
+ loc, name, xdata);
+
+ return 0;
+}
+
+int
+bd_unlink_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ bd_gfid_t gfid = {0, };
+ bd_local_t *local = frame->local;
+
+ if (buf->ia_nlink > 1)
+ goto posix;
+
+ BD_VALIDATE_LOCAL_OR_GOTO (local, op_errno, out);
+
+ uuid_utoa_r (inode->gfid, gfid);
+ if (bd_delete_lv (this->private, gfid, &op_errno) < 0) {
+ if (op_errno != ENOENT)
+ goto out;
+ }
+
+posix:
+ /* remove posix */
+ STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->unlink,
+ &local->loc, 0, NULL);
+
+ return 0;
+out:
+ BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int
+bd_unlink (call_frame_t *frame, xlator_t *this,
+ loc_t *loc, int xflag, dict_t *xdata)
+{
+ int op_errno = 0;
+ bd_attr_t *bdatt = NULL;
+ bd_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (loc, out);
+
+ if (bd_inode_ctx_get (loc->inode, this, &bdatt)) {
+ STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->unlink,
+ loc, xflag, xdata);
+ return 0;
+ }
+
+ local = bd_local_init (frame, this);
+ BD_VALIDATE_MEM_ALLOC (local, op_errno, out);
+
+ loc_copy (&local->loc, loc);
+
+ STACK_WIND (frame, bd_unlink_lookup_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->lookup, loc, NULL);
+ return 0;
+out:
+ BD_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
+
+int32_t
+bd_priv (xlator_t *this)
+{
+ return 0;
+}
+
+int32_t
+bd_inode (xlator_t *this)
+{
+ return 0;
+}
+
+int32_t
+bd_rchecksum (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ int32_t len, dict_t *xdata)
+{
+ int op_ret = -1;
+ int op_errno = 0;
+ int ret = 0;
+ int _fd = -1;
+ char *alloc_buf = NULL;
+ char *buf = NULL;
+ int32_t weak_checksum = 0;
+ bd_fd_t *bd_fd = NULL;
+ unsigned char strong_checksum[MD5_DIGEST_LENGTH] = {0};
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+
+ ret = bd_fd_ctx_get (this, fd, &bd_fd);
+ if (ret < 0 || !bd_fd) {
+ STACK_WIND (frame, default_rchecksum_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->rchecksum, fd, offset,
+ len, xdata);
+ return 0;
+ }
+
+ memset (strong_checksum, 0, MD5_DIGEST_LENGTH);
+
+ alloc_buf = page_aligned_alloc (len, &buf);
+ if (!alloc_buf) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ _fd = bd_fd->fd;
+
+ LOCK (&fd->lock);
+ {
+ ret = pread (_fd, buf, len, offset);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "pread of %d bytes returned %d (%s)",
+ len, ret, strerror (errno));
+ op_errno = errno;
+ }
+ }
+ UNLOCK (&fd->lock);
+
+ if (ret < 0)
+ goto out;
+
+ weak_checksum = gf_rsync_weak_checksum ((unsigned char *) buf,
+ (size_t) len);
+ gf_rsync_strong_checksum ((unsigned char *) buf, (size_t) len,
+ (unsigned char *) strong_checksum);
+
+ op_ret = 0;
+out:
+ BD_STACK_UNWIND (rchecksum, frame, op_ret, op_errno,
+ weak_checksum, strong_checksum, NULL);
+
+ GF_FREE (alloc_buf);
+
+ return 0;
+}
+
+static int
+bd_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ int32_t ret = 0;
+ struct iatt statpre = {0,};
+ struct iatt statpost = {0,};
+ bd_attr_t *bdatt = NULL;
+
+ /* iatt already cached */
+ if (bd_inode_ctx_get (fd->inode, this, &bdatt) < 0) {
+ STACK_WIND (frame, default_zerofill_cbk, FIRST_CHILD (this),
+ FIRST_CHILD (this)->fops->zerofill,
+ fd, offset, len, xdata);
+ return 0;
+ }
+
+ ret = bd_do_zerofill(frame, this, fd, offset, len,
+ &statpre, &statpost);
+ if (ret)
+ goto err;
+
+ STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &statpre, &statpost, NULL);
+ return 0;
+
+err:
+ STACK_UNWIND_STRICT(zerofill, frame, -1, ret, NULL, NULL, NULL);
+ return 0;
+}
+
+/**
+ * notify - when parent sends PARENT_UP, send CHILD_UP event from here
+ */
+int32_t
+notify (xlator_t *this,
+ int32_t event,
+ void *data,
+ ...)
+{
+ switch (event)
+ {
+ case GF_EVENT_PARENT_UP:
+ {
+ /* Tell the parent that bd xlator is up */
+ default_notify (this, GF_EVENT_CHILD_UP, data);
+ }
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+int32_t
+mem_acct_init (xlator_t *this)
+{
+ int ret = -1;
+
+ if (!this)
+ return ret;
+
+ ret = xlator_mem_acct_init (this, gf_bd_mt_end + 1);
+
+ if (ret != 0)
+ gf_log (this->name, GF_LOG_ERROR, "Memory accounting init"
+ "failed");
+
+ return ret;
+}
+
+int
+reconfigure (xlator_t *this, dict_t *options)
+{
+ int ret = -1;
+ bd_priv_t *priv = this->private;
+
+ GF_OPTION_RECONF ("bd-aio", priv->aio_configured, options,
+ bool, out);
+
+ if (priv->aio_configured)
+ bd_aio_on (this);
+ else
+ bd_aio_off (this);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/**
+ * bd xlator init - Validate configured VG
+ */
+int
+init (xlator_t *this)
+{
+ int ret = 0;
+ char *vg_data = NULL;
+ char *device = NULL;
+ bd_priv_t *_private = NULL;
+
+ if (!this->children) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "FATAL: storage/bd needs posix as subvolume");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "Volume is dangling. Please check the volume file.");
+ }
+
+ GF_OPTION_INIT ("export", vg_data, str, error);
+ GF_OPTION_INIT ("device", device, str, error);
+
+ /* Now we support only LV device */
+ if (strcasecmp (device, BACKEND_VG)) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "FATAL: unknown %s backend %s", BD_XLATOR, device);
+ return -1;
+ }
+
+ this->local_pool = mem_pool_new (bd_local_t, 64);
+ if (!this->local_pool) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "FATAL: Failed to create bd memory pool");
+ return -1;
+ }
+
+ ret = 0;
+ _private = GF_CALLOC (1, sizeof (*_private), gf_bd_private);
+ if (!_private)
+ goto error;
+
+ this->private = _private;
+ _private->vg = gf_strdup (vg_data);
+ if (!_private->vg)
+ goto error;
+
+ _private->handle = lvm_init (NULL);
+ if (!_private->handle) {
+ gf_log (this->name, GF_LOG_CRITICAL, "lvm_init failed");
+ goto error;
+ }
+ _private->caps = BD_CAPS_BD;
+ if (bd_scan_vg (this, _private))
+ goto error;
+
+ _private->aio_init_done = _gf_false;
+ _private->aio_capable = _gf_false;
+
+ GF_OPTION_INIT ("bd-aio", _private->aio_configured, bool, error);
+ if (_private->aio_configured) {
+ if (bd_aio_on (this)) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "BD AIO init failed");
+ ret = -1;
+ goto error;
+ }
+ }
+
+ _private->caps |= BD_CAPS_OFFLOAD_COPY | BD_CAPS_OFFLOAD_SNAPSHOT |
+ BD_CAPS_OFFLOAD_ZERO;
+
+ return 0;
+error:
+ if (_private) {
+ GF_FREE (_private->vg);
+ if (_private->handle)
+ lvm_quit (_private->handle);
+ GF_FREE (_private);
+ }
+
+ mem_pool_destroy (this->local_pool);
+
+ return -1;
+}
+
+void
+fini (xlator_t *this)
+{
+ bd_priv_t *priv = this->private;
+ mem_pool_destroy (this->local_pool);
+ this->local_pool = NULL;
+ if (!priv)
+ return;
+ lvm_quit (priv->handle);
+ GF_FREE (priv->vg);
+ this->private = NULL;
+ GF_FREE (priv);
+ return;
+}
+
+struct xlator_dumpops dumpops = {
+ .priv = bd_priv,
+ .inode = bd_inode,
+};
+
+struct xlator_fops fops = {
+ .readdirp = bd_readdirp,
+ .lookup = bd_lookup,
+ .stat = bd_stat,
+ .statfs = bd_statfs,
+ .open = bd_open,
+ .fstat = bd_fstat,
+ .rchecksum = bd_rchecksum,
+ .readv = bd_readv,
+ .fsync = bd_fsync,
+ .setxattr = bd_setxattr,
+ .fsetxattr = bd_fsetxattr,
+ .removexattr = bd_removexattr,
+ .fremovexattr=bd_fremovexattr,
+ .truncate = bd_truncate,
+ .ftruncate = bd_ftruncate,
+ .writev = bd_writev,
+ .getxattr = bd_getxattr,
+ .fgetxattr = bd_fgetxattr,
+ .unlink = bd_unlink,
+ .link = bd_link,
+ .flush = bd_flush,
+ .setattr = bd_setattr,
+ .discard = bd_discard,
+ .zerofill = bd_zerofill,
+};
+
+struct xlator_cbks cbks = {
+ .release = bd_release,
+ .forget = bd_forget,
+};
+
+struct volume_options options[] = {
+ { .key = {"export"},
+ .type = GF_OPTION_TYPE_STR},
+ { .key = {"device"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = BACKEND_VG},
+ {
+ .key = {"bd-aio"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Support for native Linux AIO"
+ },
+
+ { .key = {NULL} }
+};
diff --git a/xlators/storage/bd/src/bd.h b/xlators/storage/bd/src/bd.h
new file mode 100644
index 000000000..62add16cd
--- /dev/null
+++ b/xlators/storage/bd/src/bd.h
@@ -0,0 +1,173 @@
+/*
+ BD translator - Exports Block devices on server side as regular
+ files to client
+
+ Copyright IBM, Corp. 2012
+
+ This file is part of GlusterFS.
+
+ Author:
+ M. Mohan Kumar <mohan@in.ibm.com>
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _BD_H
+#define _BD_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef HAVE_LIBAIO
+#include <libaio.h>
+#endif
+
+#include "xlator.h"
+#include "mem-types.h"
+
+#define BD_XLATOR "block device mapper xlator"
+#define BACKEND_VG "vg"
+#define GF_XATTR "user.glusterfs"
+#define BD_XATTR GF_XATTR ".bd"
+
+#define BD_LV "lv"
+#define BD_THIN "thin"
+
+#define VOL_TYPE "volume.type"
+#define VOL_CAPS "volume.caps"
+
+#define ALIGN_SIZE 4096
+
+#define BD_CAPS_BD 0x01
+#define BD_CAPS_THIN 0x02
+#define BD_CAPS_OFFLOAD_COPY 0x04
+#define BD_CAPS_OFFLOAD_SNAPSHOT 0x08
+#define BD_CAPS_OFFLOAD_ZERO 0x20
+
+#define BD_CLONE "clone"
+#define BD_SNAPSHOT "snapshot"
+#define BD_MERGE "merge"
+#define BD_ORIGIN "list-origin"
+
+#define IOV_NR 4
+#define IOV_SIZE (64 * 1024)
+
+#define ALIGN_SIZE 4096
+#define LINKTO "trusted.glusterfs.dht.linkto"
+
+#define MAX_NO_VECT 1024
+
+
+#define BD_VALIDATE_MEM_ALLOC(buff, op_errno, label) \
+ if (!buff) { \
+ op_errno = ENOMEM; \
+ gf_log (this->name, GF_LOG_ERROR, "out of memory"); \
+ goto label; \
+ }
+
+#define BD_VALIDATE_LOCAL_OR_GOTO(local, op_errno, label) \
+ if (!local) { \
+ op_errno = EINVAL; \
+ goto label; \
+ }
+
+#define BD_STACK_UNWIND(typ, frame, args ...) do { \
+ bd_local_t *__local = frame->local; \
+ xlator_t *__this = frame->this; \
+ \
+ frame->local = NULL; \
+ STACK_UNWIND_STRICT (typ, frame, args); \
+ if (__local) \
+ bd_local_free (__this, __local); \
+ } while (0)
+
+typedef char bd_gfid_t[GF_UUID_BUF_SIZE];
+
+/**
+ * bd_fd - internal structure
+ */
+typedef struct bd_fd {
+ int fd;
+ int32_t flag;
+ int odirect;
+} bd_fd_t;
+
+typedef struct bd_priv {
+ lvm_t handle;
+ char *vg;
+ char *pool;
+ int caps;
+ gf_boolean_t aio_init_done;
+ gf_boolean_t aio_capable;
+ gf_boolean_t aio_configured;
+#ifdef HAVE_LIBAIO
+ io_context_t ctxp;
+ pthread_t aiothread;
+#endif
+} bd_priv_t;
+
+
+typedef enum bd_type {
+ BD_TYPE_NONE,
+ BD_TYPE_LV,
+} bd_type_t;
+
+typedef struct {
+ struct iatt iatt;
+ char *type;
+} bd_attr_t;
+
+typedef enum {
+ BD_OF_NONE,
+ BD_OF_CLONE,
+ BD_OF_SNAPSHOT,
+ BD_OF_MERGE,
+} bd_offload_t;
+
+typedef struct {
+ dict_t *dict;
+ bd_attr_t *bdatt;
+ inode_t *inode;
+ loc_t loc;
+ fd_t *fd;
+ data_t *data; /* for setxattr */
+ bd_offload_t offload;
+ uint64_t size;
+ loc_t *dloc;
+} bd_local_t;
+
+/* Prototypes */
+int bd_inode_ctx_set (inode_t *inode, xlator_t *this, bd_attr_t *ctx);
+int bd_inode_ctx_get (inode_t *inode, xlator_t *this, bd_attr_t **ctx);
+int bd_scan_vg (xlator_t *this, bd_priv_t *priv);
+bd_local_t *bd_local_init (call_frame_t *frame, xlator_t *this);
+void bd_local_free (xlator_t *this, bd_local_t *local);
+int bd_fd_ctx_get (xlator_t *this, fd_t *fd, bd_fd_t **bdfd);
+char *page_aligned_alloc (size_t size, char **aligned_buf);
+int bd_validate_bd_xattr (xlator_t *this, char *bd, char **type,
+ uint64_t *lv_size, uuid_t uuid);
+uint64_t bd_get_default_extent (bd_priv_t *priv);
+uint64_t bd_adjust_size (bd_priv_t *priv, uint64_t size);
+int bd_create (uuid_t uuid, uint64_t size, char *type, bd_priv_t *priv);
+int bd_resize (bd_priv_t *priv, uuid_t uuid, off_t size);
+int bd_delete_lv (bd_priv_t *priv, const char *lv_name, int *op_errno);
+int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv);
+int bd_clone (bd_local_t *local, bd_priv_t *priv);
+
+int bd_merge (bd_priv_t *priv, uuid_t gfid);
+int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict);
+void bd_update_amtime(struct iatt *iatt, int flag);
+int bd_snapshot_create (bd_local_t *local, bd_priv_t *priv);
+int bd_clone (bd_local_t *local, bd_priv_t *priv);
+int bd_merge (bd_priv_t *priv, uuid_t gfid);
+int bd_get_origin (bd_priv_t *priv, loc_t *loc, fd_t *fd, dict_t *dict);
+int bd_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, off_t len, struct iatt *prebuf,
+ struct iatt *postbuf);
+
+#endif
diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am
index 03623cf04..88efcc784 100644
--- a/xlators/storage/posix/src/Makefile.am
+++ b/xlators/storage/posix/src/Makefile.am
@@ -2,7 +2,7 @@
xlator_LTLIBRARIES = posix.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage
-posix_la_LDFLAGS = -module -avoidversion
+posix_la_LDFLAGS = -module -avoid-version
posix_la_SOURCES = posix.c posix-helpers.c posix-handle.c posix-aio.c
posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(LIBAIO)
diff --git a/xlators/storage/posix/src/posix-aio.c b/xlators/storage/posix/src/posix-aio.c
index 0a776b9bc..c3bbddd67 100644
--- a/xlators/storage/posix/src/posix-aio.c
+++ b/xlators/storage/posix/src/posix-aio.c
@@ -136,11 +136,7 @@ posix_aio_readv_complete (struct posix_aio_cb *paiocb, int res, int res2)
/* Hack to notify higher layers of EOF. */
- if (postbuf.ia_size == 0)
- op_errno = ENOENT;
- else if ((offset + iov.iov_len) == postbuf.ia_size)
- op_errno = ENOENT;
- else if (offset > postbuf.ia_size)
+ if (!postbuf.ia_size || (offset + iov.iov_len) >= postbuf.ia_size)
op_errno = ENOENT;
LOCK (&priv->lock);
@@ -490,8 +486,8 @@ posix_aio_init (xlator_t *this)
goto out;
}
- ret = pthread_create (&priv->aiothread, NULL,
- posix_aio_thread, this);
+ ret = gf_thread_create (&priv->aiothread, NULL,
+ posix_aio_thread, this);
if (ret != 0) {
io_destroy (priv->ctxp);
goto out;
@@ -560,4 +556,14 @@ posix_aio_off (xlator_t *this)
return 0;
}
+void
+__posix_fd_set_odirect (fd_t *fd, struct posix_fd *pfd, int opflags,
+ off_t offset, size_t size)
+{
+ xlator_t *this = THIS;
+ gf_log (this->name, GF_LOG_INFO,
+ "Linux AIO not available at build-time."
+ " Continuing with synchronous IO");
+ return;
+}
#endif
diff --git a/xlators/storage/posix/src/posix-handle.c b/xlators/storage/posix/src/posix-handle.c
index 33bf3db56..adb8acc07 100644
--- a/xlators/storage/posix/src/posix-handle.c
+++ b/xlators/storage/posix/src/posix-handle.c
@@ -26,13 +26,167 @@
#include "xlator.h"
#include "syscall.h"
+inode_t *
+posix_resolve (xlator_t *this, inode_table_t *itable, inode_t *parent,
+ char *bname, struct iatt *iabuf)
+{
+ inode_t *inode = NULL, *linked_inode = NULL;
+ int ret = -1;
-#define HANDLE_PFX ".glusterfs"
-#define TRASH_DIR "landfill"
+ ret = posix_istat (this, parent->gfid, bname, iabuf);
+ if (ret < 0)
+ goto out;
-#define UUID0_STR "00000000-0000-0000-0000-000000000000"
-#define SLEN(str) (sizeof(str) - 1)
+ inode = inode_find (itable, iabuf->ia_gfid);
+ if (inode == NULL) {
+ inode = inode_new (itable);
+ }
+ linked_inode = inode_link (inode, parent, bname, iabuf);
+
+ inode_unref (inode);
+
+out:
+ return linked_inode;
+}
+
+int
+posix_make_ancestral_node (const char *priv_base_path, char *path, int pathsize,
+ gf_dirent_t *head,
+ char *dir_name, struct iatt *iabuf, inode_t *inode,
+ int type, dict_t *xdata)
+{
+ gf_dirent_t *entry = NULL;
+ char real_path[PATH_MAX + 1] = {0, }, len = 0;
+ loc_t loc = {0, };
+ int ret = -1;
+
+ len = strlen (path) + strlen (dir_name) + 1;
+ if (len > pathsize) {
+ goto out;
+ }
+
+ strcat (path, dir_name);
+
+ if (type & POSIX_ANCESTRY_DENTRY) {
+ entry = gf_dirent_for_name (dir_name);
+ if (!entry) {
+ gf_log (THIS->name, GF_LOG_ERROR,
+ "could not create gf_dirent for entry %s: (%s)",
+ dir_name, strerror (errno));
+ goto out;
+ }
+
+ entry->d_stat = *iabuf;
+ entry->inode = inode_ref (inode);
+
+ list_add_tail (&entry->list, &head->list);
+ strcpy (real_path, priv_base_path);
+ strcat (real_path, "/");
+ strcat (real_path, path);
+ loc.inode = inode_ref (inode);
+ uuid_copy (loc.gfid, inode->gfid);
+
+ entry->dict = posix_lookup_xattr_fill (THIS, real_path, &loc,
+ xdata, iabuf);
+ loc_wipe (&loc);
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int
+posix_make_ancestryfromgfid (xlator_t *this, char *path, int pathsize,
+ gf_dirent_t *head, int type, uuid_t gfid,
+ const size_t handle_size,
+ const char *priv_base_path, inode_table_t *itable,
+ inode_t **parent, dict_t *xdata)
+{
+ char *linkname = NULL; /* "../../<gfid[0]>/<gfid[1]/"
+ "<gfidstr>/<NAME_MAX>" */
+ char *dir_handle = NULL;
+ char *dir_name = NULL;
+ char *pgfidstr = NULL;
+ char *saveptr = NULL;
+ ssize_t len = 0;
+ inode_t *inode = NULL;
+ struct iatt iabuf = {0, };
+ int ret = -1;
+ uuid_t tmp_gfid = {0, };
+
+ if (!path || !parent || !priv_base_path || uuid_is_null (gfid)) {
+ goto out;
+ }
+
+ if (__is_root_gfid (gfid)) {
+ if (parent) {
+ if (*parent) {
+ inode_unref (*parent);
+ }
+
+ *parent = inode_ref (itable->root);
+ }
+
+ inode = itable->root;
+
+ memset (&iabuf, 0, sizeof (iabuf));
+ uuid_copy (iabuf.ia_gfid, inode->gfid);
+ iabuf.ia_type = inode->ia_type;
+
+ ret = posix_make_ancestral_node (priv_base_path, path, pathsize,
+ head, "/", &iabuf, inode, type,
+ xdata);
+ return ret;
+ }
+
+ dir_handle = alloca (handle_size);
+ linkname = alloca (PATH_MAX);
+ snprintf (dir_handle, handle_size, "%s/%s/%02x/%02x/%s",
+ priv_base_path, GF_HIDDEN_PATH, gfid[0], gfid[1],
+ uuid_utoa (gfid));
+
+ len = readlink (dir_handle, linkname, PATH_MAX);
+ if (len < 0) {
+ gf_log (this->name, GF_LOG_ERROR, "could not read the link "
+ "from the gfid handle %s (%s)", dir_handle,
+ strerror (errno));
+ goto out;
+ }
+
+ linkname[len] = '\0';
+
+ pgfidstr = strtok_r (linkname + SLEN("../../00/00/"), "/", &saveptr);
+ dir_name = strtok_r (NULL, "/", &saveptr);
+ strcat (dir_name, "/");
+
+ uuid_parse (pgfidstr, tmp_gfid);
+
+ ret = posix_make_ancestryfromgfid (this, path, pathsize, head, type,
+ tmp_gfid, handle_size,
+ priv_base_path, itable, parent,
+ xdata);
+ if (ret < 0) {
+ goto out;
+ }
+
+ memset (&iabuf, 0, sizeof (iabuf));
+
+ inode = posix_resolve (this, itable, *parent, dir_name, &iabuf);
+
+ ret = posix_make_ancestral_node (priv_base_path, path, pathsize, head,
+ dir_name, &iabuf, inode, type, xdata);
+ if (*parent != NULL) {
+ inode_unref (*parent);
+ }
+
+ *parent = inode;
+
+out:
+ return ret;
+}
int
posix_handle_relpath (xlator_t *this, uuid_t gfid, const char *basename,
@@ -189,13 +343,13 @@ posix_handle_path (xlator_t *this, uuid_t gfid, const char *basename,
buf = alloca (maxlen);
}
- base_len = (priv->base_path_length + SLEN(HANDLE_PFX) + 45);
+ base_len = (priv->base_path_length + SLEN(GF_HIDDEN_PATH) + 45);
base_str = alloca (base_len + 1);
base_len = snprintf (base_str, base_len + 1, "%s/%s/%02x/%02x/%s",
- priv->base_path, HANDLE_PFX, gfid[0], gfid[1],
+ priv->base_path, GF_HIDDEN_PATH, gfid[0], gfid[1],
uuid_str);
- pfx_len = priv->base_path_length + 1 + SLEN(HANDLE_PFX) + 1;
+ pfx_len = priv->base_path_length + 1 + SLEN(GF_HIDDEN_PATH) + 1;
if (basename) {
len = snprintf (buf, maxlen, "%s/%s", base_str, basename);
@@ -237,7 +391,7 @@ posix_handle_gfid_path (xlator_t *this, uuid_t gfid, const char *basename,
len = priv->base_path_length /* option directory "/export" */
+ SLEN("/")
- + SLEN(HANDLE_PFX)
+ + SLEN(GF_HIDDEN_PATH)
+ SLEN("/")
+ SLEN("00/")
+ SLEN("00/")
@@ -268,10 +422,10 @@ posix_handle_gfid_path (xlator_t *this, uuid_t gfid, const char *basename,
if (basename) {
len = snprintf (buf, buflen, "%s/%s/%02x/%02x/%s/%s", priv->base_path,
- HANDLE_PFX, gfid[0], gfid[1], uuid_str, basename);
+ GF_HIDDEN_PATH, gfid[0], gfid[1], uuid_str, basename);
} else {
len = snprintf (buf, buflen, "%s/%s/%02x/%02x/%s", priv->base_path,
- HANDLE_PFX, gfid[0], gfid[1], uuid_str);
+ GF_HIDDEN_PATH, gfid[0], gfid[1], uuid_str);
}
out:
return len;
@@ -300,10 +454,10 @@ posix_handle_init (xlator_t *this)
return -1;
}
- handle_pfx = alloca (priv->base_path_length + 1 + strlen (HANDLE_PFX)
+ handle_pfx = alloca (priv->base_path_length + 1 + strlen (GF_HIDDEN_PATH)
+ 1);
- sprintf (handle_pfx, "%s/%s", priv->base_path, HANDLE_PFX);
+ sprintf (handle_pfx, "%s/%s", priv->base_path, GF_HIDDEN_PATH);
ret = stat (handle_pfx, &stbuf);
switch (ret) {
@@ -467,7 +621,7 @@ posix_handle_trash_init (xlator_t *this)
priv = this->private;
priv->trash_path = GF_CALLOC (1, priv->base_path_length + strlen ("/")
- + strlen (HANDLE_PFX) + strlen ("/")
+ + strlen (GF_HIDDEN_PATH) + strlen ("/")
+ strlen (TRASH_DIR) + 1,
gf_posix_mt_trash_path);
@@ -475,7 +629,7 @@ posix_handle_trash_init (xlator_t *this)
goto out;
strncpy (priv->trash_path, priv->base_path, priv->base_path_length);
- strcat (priv->trash_path, "/" HANDLE_PFX "/" TRASH_DIR);
+ strcat (priv->trash_path, "/" GF_HIDDEN_PATH "/" TRASH_DIR);
ret = posix_handle_new_trash_init (this, priv->trash_path);
if (ret)
goto out;
@@ -547,16 +701,8 @@ posix_handle_hard (xlator_t *this, const char *oldpath, uuid_t gfid, struct stat
return -1;
}
-#ifdef HAVE_LINKAT
- /*
- * Use linkat if the target may be a symlink to a directory
- * or without an existing target. See comment about linkat()
- * usage in posix_link() in posix.c for details
- */
- ret = linkat (AT_FDCWD, oldpath, AT_FDCWD, newpath, 0);
-#else
- ret = link (oldpath, newpath);
-#endif
+ ret = sys_link (oldpath, newpath);
+
if (ret) {
gf_log (this->name, GF_LOG_WARNING,
"link %s -> %s failed (%s)",
@@ -573,13 +719,6 @@ posix_handle_hard (xlator_t *this, const char *oldpath, uuid_t gfid, struct stat
}
}
- ret = lstat (newpath, &newbuf);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "lstat on %s failed (%s)", newpath, strerror (errno));
- return -1;
- }
-
if (newbuf.st_ino != oldbuf->st_ino ||
newbuf.st_dev != oldbuf->st_dev) {
gf_log (this->name, GF_LOG_WARNING,
@@ -735,16 +874,7 @@ posix_create_link_if_gfid_exists (xlator_t *this, uuid_t gfid,
MAKE_HANDLE_PATH (newpath, this, gfid, NULL);
ret = lstat (newpath, &stbuf);
if (!ret) {
-#ifdef HAVE_LINKAT
- /*
- * Use linkat if the target may be a symlink to a directory
- * or without an existing target. See comment about linkat()
- * usage in posix_link() in posix.c for details
- */
- ret = linkat (AT_FDCWD, newpath, AT_FDCWD, real_path, 0);
-#else
- ret = link (newpath, real_path);
-#endif
+ ret = sys_link (newpath, real_path);
}
return ret;
diff --git a/xlators/storage/posix/src/posix-handle.h b/xlators/storage/posix/src/posix-handle.h
index f1163b727..31cbf83fd 100644
--- a/xlators/storage/posix/src/posix-handle.h
+++ b/xlators/storage/posix/src/posix-handle.h
@@ -17,9 +17,85 @@
#include <sys/types.h>
#include "xlator.h"
+#include "gf-dirent.h"
+#define TRASH_DIR "landfill"
-#define LOC_HAS_ABSPATH(loc) ((loc) && (loc->path) && (loc->path[0] == '/'))
+#define UUID0_STR "00000000-0000-0000-0000-000000000000"
+#define SLEN(str) (sizeof(str) - 1)
+
+#define LOC_HAS_ABSPATH(loc) (loc && (loc->path) && (loc->path[0] == '/'))
+
+#define MAKE_PGFID_XATTR_KEY(var, prefix, pgfid) do { \
+ var = alloca (strlen (prefix) + UUID_CANONICAL_FORM_LEN + 1); \
+ strcpy (var, prefix); \
+ strcat (var, uuid_utoa (pgfid)); \
+ } while (0)
+
+#define SET_PGFID_XATTR(path, key, value, flags, op_ret, this, label) do { \
+ value = hton32 (value); \
+ op_ret = sys_lsetxattr (path, key, &value, sizeof (value), \
+ flags); \
+ if (op_ret == -1) { \
+ op_errno = errno; \
+ gf_log (this->name, GF_LOG_WARNING, \
+ "setting xattr failed on %s: key = %s (%s)", \
+ path, key, strerror (op_errno)); \
+ goto label; \
+ } \
+ } while (0)
+
+
+#define REMOVE_PGFID_XATTR(path, key, op_ret, this, label) do { \
+ op_ret = sys_lremovexattr (path, key); \
+ if (op_ret == -1) { \
+ op_errno = errno; \
+ gf_log (this->name, GF_LOG_WARNING, "removing xattr " \
+ "failed on %s: key = %s (%s)", path, key, \
+ strerror (op_errno)); \
+ goto label; \
+ } \
+ } while (0)
+
+/* should be invoked holding a lock */
+#define LINK_MODIFY_PGFID_XATTR(path, key, value, flags, op_ret, this, label) do { \
+ op_ret = sys_lgetxattr (path, key, &value, sizeof (value)); \
+ if (op_ret == -1) { \
+ op_errno = errno; \
+ if (op_errno == ENOATTR) { \
+ value = 1; \
+ } else { \
+ gf_log (this->name, GF_LOG_WARNING,"getting xattr " \
+ "failed on %s: key = %s (%s)", path, key, \
+ strerror (op_errno)); \
+ goto label; \
+ } \
+ } else { \
+ value = ntoh32 (value); \
+ value++; \
+ } \
+ SET_PGFID_XATTR (path, key, value, flags, op_ret, this, label); \
+ } while (0)
+
+/* should be invoked holding a lock */
+#define UNLINK_MODIFY_PGFID_XATTR(path, key, value, flags, op_ret, this, label) do { \
+ op_ret = sys_lgetxattr (path, key, &value, sizeof (value)); \
+ if (op_ret == -1) { \
+ op_errno = errno; \
+ gf_log (this->name, GF_LOG_WARNING, "getting xattr failed on " \
+ "%s: key = %s (%s)", path, key, strerror (op_errno)); \
+ goto label; \
+ } else { \
+ value = ntoh32 (value); \
+ value--; \
+ if (value > 0) { \
+ SET_PGFID_XATTR (path, key, value, flags, op_ret, \
+ this, label); \
+ } else { \
+ REMOVE_PGFID_XATTR (path, key, op_ret, this, label); \
+ } \
+ } \
+ } while (0)
#define MAKE_REAL_PATH(var, this, path) do { \
var = alloca (strlen (path) + POSIX_BASE_PATH_LEN(this) + 2); \
@@ -27,7 +103,6 @@
strcpy (&var[POSIX_BASE_PATH_LEN(this)], path); \
} while (0)
-
#define MAKE_HANDLE_PATH(var, this, gfid, base) do { \
int __len; \
__len = posix_handle_path (this, gfid, base, NULL, 0); \
@@ -61,12 +136,12 @@
#define MAKE_INODE_HANDLE(rpath, this, loc, iatt_p) do { \
if (uuid_is_null (loc->gfid)) { \
gf_log (this->name, GF_LOG_ERROR, \
- "null gfid for path %s", loc->path); \
+ "null gfid for path %s", (loc)->path); \
break; \
} \
if (LOC_HAS_ABSPATH (loc)) { \
- MAKE_REAL_PATH (rpath, this, loc->path); \
- op_ret = posix_pstat (this, loc->gfid, rpath, iatt_p); \
+ MAKE_REAL_PATH (rpath, this, (loc)->path); \
+ op_ret = posix_pstat (this, (loc)->gfid, rpath, iatt_p); \
break; \
} \
errno = 0; \
@@ -107,10 +182,20 @@
} while (0)
+#define POSIX_ANCESTRY_PATH (1 << 0)
+#define POSIX_ANCESTRY_DENTRY (1 << 1)
int
posix_handle_path (xlator_t *this, uuid_t gfid, const char *basename, char *buf,
size_t len);
+
+int
+posix_make_ancestryfromgfid (xlator_t *this, char *path, int pathsize,
+ gf_dirent_t *head, int type, uuid_t gfid,
+ const size_t handle_size,
+ const char *priv_base_path,
+ inode_table_t *table, inode_t **parent,
+ dict_t *xdata);
int
posix_handle_path_safe (xlator_t *this, uuid_t gfid, const char *basename,
char *buf, size_t len);
diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c
index c8041b638..ab46f7f7e 100644
--- a/xlators/storage/posix/src/posix-helpers.c
+++ b/xlators/storage/posix/src/posix-helpers.c
@@ -22,6 +22,7 @@
#include <pthread.h>
#include <ftw.h>
#include <sys/stat.h>
+#include <signal.h>
#ifndef GF_BSD_HOST_OS
#include <alloca.h>
@@ -44,25 +45,22 @@
#include "timer.h"
#include "glusterfs3-xdr.h"
#include "hashfn.h"
+#include "glusterfs-acl.h"
#include <fnmatch.h>
-typedef struct {
- xlator_t *this;
- const char *real_path;
- dict_t *xattr;
- struct iatt *stbuf;
- loc_t *loc;
-} posix_xattr_filler_t;
-
char *marker_xattrs[] = {"trusted.glusterfs.quota.*",
"trusted.glusterfs.*.xtime",
NULL};
+char *marker_contri_key = "trusted.*.*.contri";
+
static char* posix_ignore_xattrs[] = {
"gfid-req",
GLUSTERFS_ENTRYLK_COUNT,
GLUSTERFS_INODELK_COUNT,
GLUSTERFS_POSIXLK_COUNT,
+ GLUSTERFS_PARENT_ENTRYLK,
+ GF_GFIDLESS_LOOKUP,
NULL
};
@@ -108,14 +106,142 @@ out:
}
static int
+_posix_xattr_get_set_from_backend (posix_xattr_filler_t *filler, char *key)
+{
+ ssize_t xattr_size = -1;
+ int ret = 0;
+ char *value = NULL;
+
+ xattr_size = sys_lgetxattr (filler->real_path, key, NULL, 0);
+
+ if (xattr_size > 0) {
+ value = GF_CALLOC (1, xattr_size + 1,
+ gf_posix_mt_char);
+ if (!value)
+ goto out;
+
+ xattr_size = sys_lgetxattr (filler->real_path, key, value,
+ xattr_size);
+ if (xattr_size <= 0) {
+ gf_log (filler->this->name, GF_LOG_WARNING,
+ "getxattr failed. path: %s, key: %s",
+ filler->real_path, key);
+ GF_FREE (value);
+ goto out;
+ }
+
+ value[xattr_size] = '\0';
+ ret = dict_set_bin (filler->xattr, key,
+ value, xattr_size);
+ if (ret < 0) {
+ gf_log (filler->this->name, GF_LOG_DEBUG,
+ "dict set failed. path: %s, key: %s",
+ filler->real_path, key);
+ GF_FREE (value);
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+static int gf_posix_xattr_enotsup_log;
+
+static int
+_posix_get_marker_all_contributions (posix_xattr_filler_t *filler)
+{
+ ssize_t size = -1, remaining_size = -1, list_offset = 0;
+ int ret = -1;
+ char *list = NULL, key[4096] = {0, };
+
+ size = sys_llistxattr (filler->real_path, NULL, 0);
+ if (size == -1) {
+ if ((errno == ENOTSUP) || (errno == ENOSYS)) {
+ GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log,
+ THIS->name, GF_LOG_WARNING,
+ "Extended attributes not "
+ "supported (try remounting brick"
+ " with 'user_xattr' flag)");
+
+ } else {
+ gf_log (THIS->name, GF_LOG_WARNING,
+ "listxattr failed on %s: %s",
+ filler->real_path, strerror (errno));
+
+ }
+
+ goto out;
+ }
+
+ if (size == 0) {
+ ret = 0;
+ goto out;
+ }
+
+ list = alloca (size + 1);
+ if (!list) {
+ goto out;
+ }
+
+ size = sys_llistxattr (filler->real_path, list, size);
+ if (size <= 0) {
+ ret = size;
+ goto out;
+ }
+
+ remaining_size = size;
+ list_offset = 0;
+
+ while (remaining_size > 0) {
+ if (*(list + list_offset) == '\0')
+ break;
+ strcpy (key, list + list_offset);
+ if (fnmatch (marker_contri_key, key, 0) == 0) {
+ ret = _posix_xattr_get_set_from_backend (filler, key);
+ }
+
+ remaining_size -= strlen (key) + 1;
+ list_offset += strlen (key) + 1;
+ }
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int
+_posix_get_marker_quota_contributions (posix_xattr_filler_t *filler, char *key)
+{
+ char *saveptr = NULL, *token = NULL, *tmp_key = NULL;
+ char *ptr = NULL;
+ int i = 0, ret = 0;
+
+ tmp_key = ptr = gf_strdup (key);
+ for (i = 0; i < 4; i++) {
+ token = strtok_r (tmp_key, ".", &saveptr);
+ tmp_key = NULL;
+ }
+
+ if (strncmp (token, "contri", strlen ("contri")) == 0) {
+ ret = _posix_get_marker_all_contributions (filler);
+ } else {
+ ret = _posix_xattr_get_set_from_backend (filler, key);
+ }
+
+ GF_FREE (ptr);
+
+ return ret;
+}
+
+static int
_posix_xattr_get_set (dict_t *xattr_req,
char *key,
data_t *data,
void *xattrargs)
{
posix_xattr_filler_t *filler = xattrargs;
- char *value = NULL;
- ssize_t xattr_size = -1;
int ret = -1;
char *databuf = NULL;
int _fd = -1;
@@ -140,6 +266,16 @@ _posix_xattr_get_set (dict_t *xattr_req,
goto err;
}
+ /*
+ * There could be a situation where the ia_size is
+ * zero. GF_CALLOC will return a pointer to the
+ * memory initialized by gf_mem_set_acct_info.
+ * This function adds a header and a footer to
+ * the allocated memory. The returned pointer
+ * points to the memory just after the header, but
+ * when size is zero, there is no space for user
+ * data. The memory can be freed by calling GF_FREE.
+ */
databuf = GF_CALLOC (1, filler->stbuf->ia_size,
gf_posix_mt_char);
if (!databuf) {
@@ -181,48 +317,34 @@ _posix_xattr_get_set (dict_t *xattr_req,
}
} else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) {
loc = filler->loc;
- if (loc && !list_empty (&loc->inode->fd_list)) {
- ret = dict_set_uint32 (filler->xattr, key, 1);
- if (ret < 0)
- gf_log (filler->this->name, GF_LOG_WARNING,
- "Failed to set dictionary value for %s",
- key);
- } else {
- ret = dict_set_uint32 (filler->xattr, key, 0);
+ if (loc) {
+ ret = dict_set_uint32 (filler->xattr, key,
+ loc->inode->fd_count);
if (ret < 0)
gf_log (filler->this->name, GF_LOG_WARNING,
"Failed to set dictionary value for %s",
key);
}
- } else {
- xattr_size = sys_lgetxattr (filler->real_path, key, NULL, 0);
-
- if (xattr_size > 0) {
- value = GF_CALLOC (1, xattr_size + 1,
- gf_posix_mt_char);
- if (!value)
- return -1;
-
- xattr_size = sys_lgetxattr (filler->real_path, key, value,
- xattr_size);
- if (xattr_size <= 0) {
- gf_log (filler->this->name, GF_LOG_WARNING,
- "getxattr failed. path: %s, key: %s",
- filler->real_path, key);
- GF_FREE (value);
- return -1;
- }
+ } else if (!strcmp (key, GET_ANCESTRY_PATH_KEY)) {
+ char *path = NULL;
+ ret = posix_get_ancestry (filler->this, filler->loc->inode,
+ NULL, &path, POSIX_ANCESTRY_PATH,
+ &filler->op_errno, xattr_req);
+ if (ret < 0) {
+ goto out;
+ }
- value[xattr_size] = '\0';
- ret = dict_set_bin (filler->xattr, key,
- value, xattr_size);
- if (ret < 0) {
- gf_log (filler->this->name, GF_LOG_DEBUG,
- "dict set failed. path: %s, key: %s",
- filler->real_path, key);
- GF_FREE (value);
- }
+ ret = dict_set_dynstr (filler->xattr, GET_ANCESTRY_PATH_KEY,
+ path);
+ if (ret < 0) {
+ GF_FREE (path);
+ goto out;
}
+
+ } else if (fnmatch (marker_contri_key, key, 0) == 0) {
+ ret = _posix_get_marker_quota_contributions (filler, key);
+ } else {
+ ret = _posix_xattr_get_set_from_backend (filler, key);
}
out:
return 0;
@@ -281,7 +403,7 @@ posix_fill_ino_from_gfid (xlator_t *this, struct iatt *buf)
goto out;
}
for (i = 15; i > (15 - 8); i--) {
- temp_ino += (uint64_t)(buf->ia_gfid[i]) << j;
+ temp_ino += (uint64_t)(buf->ia_gfid[i]) << j;
j += 8;
}
buf->ia_ino = temp_ino;
@@ -662,6 +784,27 @@ out:
return op_ret;
}
+#ifdef GF_DARWIN_HOST_OS
+static
+void posix_dump_buffer (xlator_t *this, const char *real_path, const char *key,
+ data_t *value, int flags)
+{
+ char buffer[3*value->len+1];
+ int index = 0;
+ buffer[0] = 0;
+ gf_loglevel_t log_level = gf_log_get_loglevel ();
+ if (log_level == GF_LOG_TRACE) {
+ char *data = (char *) value->data;
+ for (index = 0; index < value->len; index++)
+ sprintf(buffer+3*index, " %02x", data[index]);
+ }
+ gf_log (this->name, GF_LOG_DEBUG,
+ "Dump %s: key:%s flags: %u length:%u data:%s ",
+ real_path, key, flags, value->len,
+ (log_level == GF_LOG_TRACE ? buffer : "<skipped in DEBUG>"));
+}
+#endif
+
static int gf_xattr_enotsup_log;
int
@@ -671,14 +814,20 @@ posix_handle_pair (xlator_t *this, const char *real_path,
int sys_ret = -1;
int ret = 0;
- if (ZR_FILE_CONTENT_REQUEST(key)) {
+ if (XATTR_IS_PATHINFO (key)) {
+ ret = -EACCES;
+ goto out;
+ } else if (ZR_FILE_CONTENT_REQUEST(key)) {
ret = posix_set_file_contents (this, real_path, key, value,
flags);
} else {
sys_ret = sys_lsetxattr (real_path, key, value->data,
value->len, flags);
-
+#ifdef GF_DARWIN_HOST_OS
+ posix_dump_buffer(this, real_path, key, value, flags);
+#endif
if (sys_ret < 0) {
+ ret = -errno;
if (errno == ENOTSUP) {
GF_LOG_OCCASIONALLY(gf_xattr_enotsup_log,
this->name,GF_LOG_WARNING,
@@ -699,18 +848,17 @@ posix_handle_pair (xlator_t *this, const char *real_path,
gf_log (this->name,
((errno == EINVAL) ?
GF_LOG_DEBUG : GF_LOG_ERROR),
- "%s: key:%s error:%s",
- real_path, key,
+ "%s: key:%s flags: %u length:%d error:%s",
+ real_path, key, flags, value->len,
strerror (errno));
#else /* ! DARWIN */
gf_log (this->name, GF_LOG_ERROR,
- "%s: key:%s error:%s",
- real_path, key,
+ "%s: key:%s flags: %u length:%d error:%s",
+ real_path, key, flags, value->len,
strerror (errno));
#endif /* DARWIN */
}
- ret = -errno;
goto out;
}
}
@@ -725,10 +873,16 @@ posix_fhandle_pair (xlator_t *this, int fd,
int sys_ret = -1;
int ret = 0;
+ if (XATTR_IS_PATHINFO (key)) {
+ ret = -EACCES;
+ goto out;
+ }
+
sys_ret = sys_fsetxattr (fd, key, value->data,
value->len, flags);
if (sys_ret < 0) {
+ ret = -errno;
if (errno == ENOTSUP) {
GF_LOG_OCCASIONALLY(gf_xattr_enotsup_log,
this->name,GF_LOG_WARNING,
@@ -755,7 +909,6 @@ posix_fhandle_pair (xlator_t *this, int fd,
#endif /* DARWIN */
}
- ret = -errno;
goto out;
}
@@ -896,8 +1049,8 @@ posix_spawn_janitor_thread (xlator_t *this)
LOCK (&priv->lock);
{
if (!priv->janitor_present) {
- ret = pthread_create (&priv->janitor, NULL,
- posix_janitor_thread_proc, this);
+ ret = gf_thread_create (&priv->janitor, NULL,
+ posix_janitor_thread_proc, this);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR,
@@ -913,6 +1066,74 @@ unlock:
UNLOCK (&priv->lock);
}
+static int
+is_fresh_file (struct stat *stat)
+{
+ struct timeval tv;
+
+ gettimeofday (&tv, NULL);
+
+ if ((stat->st_ctime >= (tv.tv_sec - 1))
+ && (stat->st_ctime <= tv.tv_sec))
+ return 1;
+
+ return 0;
+}
+
+
+int
+posix_gfid_heal (xlator_t *this, const char *path, loc_t *loc, dict_t *xattr_req)
+{
+ /* The purpose of this function is to prevent a race
+ where an inode creation FOP (like mkdir/mknod/create etc)
+ races with lookup in the following way:
+
+ {create thread} | {lookup thread}
+ |
+ t0
+ mkdir ("name") |
+ t1
+ | posix_gfid_set ("name", 2);
+ t2
+ posix_gfid_set ("name", 1); |
+ t3
+ lstat ("name"); | lstat ("name");
+
+ In the above case mkdir FOP would have resulted with GFID 2 while
+ it should have been GFID 1. It matters in the case where GFID would
+ have gotten set to 1 on other subvolumes of replciate/distribute
+
+ The "solution" here is that, if we detect lookup is attempting to
+ set a GFID on a file which is created very recently, but does not
+ yet have a GFID (i.e, between t1 and t2), then "fake" it as though
+ posix_gfid_heal was called at t0 instead.
+ */
+
+ uuid_t uuid_curr;
+ int ret = 0;
+ struct stat stat = {0, };
+
+ if (!xattr_req)
+ goto out;
+
+ if (sys_lstat (path, &stat) != 0)
+ goto out;
+
+ ret = sys_lgetxattr (path, GFID_XATTR_KEY, uuid_curr, 16);
+ if (ret != 16) {
+ if (is_fresh_file (&stat)) {
+ ret = -1;
+ errno = ENOENT;
+ goto out;
+ }
+ }
+
+ ret = posix_gfid_set (this, path, loc, xattr_req);
+out:
+ return ret;
+}
+
+
int
posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req)
{
@@ -926,17 +1147,17 @@ posix_acl_xattr_set (xlator_t *this, const char *path, dict_t *xattr_req)
if (sys_lstat (path, &stat) != 0)
goto out;
- data = dict_get (xattr_req, "system.posix_acl_access");
+ data = dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR);
if (data) {
- ret = sys_lsetxattr (path, "system.posix_acl_access",
+ ret = sys_lsetxattr (path, POSIX_ACL_ACCESS_XATTR,
data->data, data->len, 0);
if (ret != 0)
goto out;
}
- data = dict_get (xattr_req, "system.posix_acl_default");
+ data = dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR);
if (data) {
- ret = sys_lsetxattr (path, "system.posix_acl_default",
+ ret = sys_lsetxattr (path, POSIX_ACL_DEFAULT_XATTR,
data->data, data->len, 0);
if (ret != 0)
goto out;
@@ -946,35 +1167,47 @@ out:
return ret;
}
+static int
+_handle_entry_create_keyvalue_pair (dict_t *d, char *k, data_t *v,
+ void *tmp)
+{
+ int ret = -1;
+ posix_xattr_filler_t *filler = NULL;
+
+ filler = tmp;
+
+ if (!strcmp (GFID_XATTR_KEY, k) ||
+ !strcmp ("gfid-req", k) ||
+ !strcmp (POSIX_ACL_DEFAULT_XATTR, k) ||
+ !strcmp (POSIX_ACL_ACCESS_XATTR, k) ||
+ ZR_FILE_CONTENT_REQUEST(k)) {
+ return 0;
+ }
+
+ ret = posix_handle_pair (filler->this, filler->real_path, k, v,
+ XATTR_CREATE);
+ if (ret < 0) {
+ errno = -ret;
+ return -1;
+ }
+ return 0;
+}
+
int
posix_entry_create_xattr_set (xlator_t *this, const char *path,
dict_t *dict)
{
int ret = -1;
+ posix_xattr_filler_t filler = {0,};
+
if (!dict)
goto out;
- int _handle_keyvalue_pair (dict_t *d, char *k, data_t *v,
- void *tmp)
- {
- if (!strcmp (GFID_XATTR_KEY, k) ||
- !strcmp ("gfid-req", k) ||
- !strcmp ("system.posix_acl_default", k) ||
- !strcmp ("system.posix_acl_access", k) ||
- ZR_FILE_CONTENT_REQUEST(k)) {
- return 0;
- }
+ filler.this = this;
+ filler.real_path = path;
- ret = posix_handle_pair (this, path, k, v, XATTR_CREATE);
- if (ret < 0) {
- errno = -ret;
- return -1;
- }
- return 0;
- }
-
- ret = dict_foreach (dict, _handle_keyvalue_pair, NULL);
+ ret = dict_foreach (dict, _handle_entry_create_keyvalue_pair, &filler);
out:
return ret;
@@ -998,7 +1231,7 @@ __posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd_p)
goto out;
}
- if (fd->pid != -1)
+ if (!fd_is_anonymous(fd))
/* anonymous fd */
goto out;
@@ -1064,3 +1297,257 @@ posix_fd_ctx_get (fd_t *fd, xlator_t *this, struct posix_fd **pfd)
return ret;
}
+
+static void *
+posix_health_check_thread_proc (void *data)
+{
+ xlator_t *this = NULL;
+ struct posix_private *priv = NULL;
+ uint32_t interval = 0;
+ int ret = -1;
+ struct stat sb = {0, };
+
+ this = data;
+ priv = this->private;
+
+ /* prevent races when the interval is updated */
+ interval = priv->health_check_interval;
+ if (interval == 0)
+ goto out;
+
+ gf_log (this->name, GF_LOG_DEBUG, "health-check thread started, "
+ "interval = %d seconds", interval);
+
+ while (1) {
+ /* aborting sleep() is a request to exit this thread, sleep()
+ * will normally not return when cancelled */
+ ret = sleep (interval);
+ if (ret > 0)
+ break;
+
+ /* prevent thread errors while doing the health-check(s) */
+ pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+
+ /* Do the health-check, it should be moved to its own function
+ * in case it gets more complex. */
+ ret = stat (priv->base_path, &sb);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING,
+ "stat() on %s returned: %s", priv->base_path,
+ strerror (errno));
+ goto abort;
+ }
+
+ pthread_setcancelstate (PTHREAD_CANCEL_ENABLE, NULL);
+ }
+
+out:
+ gf_log (this->name, GF_LOG_DEBUG, "health-check thread exiting");
+
+ LOCK (&priv->lock);
+ {
+ priv->health_check_active = _gf_false;
+ }
+ UNLOCK (&priv->lock);
+
+ return NULL;
+
+abort:
+ /* health-check failed */
+ gf_log (this->name, GF_LOG_EMERG, "health-check failed, going down");
+ xlator_notify (this->parents->xlator, GF_EVENT_CHILD_DOWN, this);
+
+ ret = sleep (30);
+ if (ret == 0) {
+ gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGTERM");
+ kill (getpid(), SIGTERM);
+ }
+
+ ret = sleep (30);
+ if (ret == 0) {
+ gf_log (this->name, GF_LOG_EMERG, "still alive! -> SIGKILL");
+ kill (getpid(), SIGKILL);
+ }
+
+ return NULL;
+}
+
+void
+posix_spawn_health_check_thread (xlator_t *xl)
+{
+ struct posix_private *priv = NULL;
+ int ret = -1;
+
+ priv = xl->private;
+
+ LOCK (&priv->lock);
+ {
+ /* cancel the running thread */
+ if (priv->health_check_active == _gf_true) {
+ pthread_cancel (priv->health_check);
+ priv->health_check_active = _gf_false;
+ }
+
+ /* prevent scheduling a check in a tight loop */
+ if (priv->health_check_interval == 0)
+ goto unlock;
+
+ ret = gf_thread_create (&priv->health_check, NULL,
+ posix_health_check_thread_proc, xl);
+ if (ret < 0) {
+ priv->health_check_interval = 0;
+ priv->health_check_active = _gf_false;
+ gf_log (xl->name, GF_LOG_ERROR,
+ "unable to setup health-check thread: %s",
+ strerror (errno));
+ goto unlock;
+ }
+
+ /* run the thread detached, resources will be freed on exit */
+ pthread_detach (priv->health_check);
+ priv->health_check_active = _gf_true;
+ }
+unlock:
+ UNLOCK (&priv->lock);
+}
+
+int
+posix_fsyncer_pick (xlator_t *this, struct list_head *head)
+{
+ struct posix_private *priv = NULL;
+ int count = 0;
+
+ priv = this->private;
+ pthread_mutex_lock (&priv->fsync_mutex);
+ {
+ while (list_empty (&priv->fsyncs))
+ pthread_cond_wait (&priv->fsync_cond,
+ &priv->fsync_mutex);
+
+ count = priv->fsync_queue_count;
+ priv->fsync_queue_count = 0;
+ list_splice_init (&priv->fsyncs, head);
+ }
+ pthread_mutex_unlock (&priv->fsync_mutex);
+
+ return count;
+}
+
+
+void
+posix_fsyncer_process (xlator_t *this, call_stub_t *stub, gf_boolean_t do_fsync)
+{
+ struct posix_fd *pfd = NULL;
+ int ret = -1;
+ struct posix_private *priv = NULL;
+
+ priv = this->private;
+
+ ret = posix_fd_ctx_get (stub->args.fd, this, &pfd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not get fdctx for fd(%s)",
+ uuid_utoa (stub->args.fd->inode->gfid));
+ call_unwind_error (stub, -1, EINVAL);
+ return;
+ }
+
+ if (do_fsync) {
+ if (stub->args.datasync)
+ ret = sys_fdatasync (pfd->fd);
+ else
+ ret = sys_fsync (pfd->fd);
+ } else {
+ ret = 0;
+ }
+
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "could not fstat fd(%s)",
+ uuid_utoa (stub->args.fd->inode->gfid));
+ call_unwind_error (stub, -1, errno);
+ return;
+ }
+
+ call_unwind_error (stub, 0, 0);
+}
+
+
+static void
+posix_fsyncer_syncfs (xlator_t *this, struct list_head *head)
+{
+ call_stub_t *stub = NULL;
+ struct posix_fd *pfd = NULL;
+ int ret = -1;
+
+ stub = list_entry (head->prev, call_stub_t, list);
+ ret = posix_fd_ctx_get (stub->args.fd, this, &pfd);
+ if (ret)
+ return;
+
+#ifdef GF_LINUX_HOST_OS
+ /* syncfs() is not "declared" in RHEL's glibc even though
+ the kernel has support.
+ */
+#include <sys/syscall.h>
+#include <unistd.h>
+#ifdef SYS_syncfs
+ syscall (SYS_syncfs, pfd->fd);
+#else
+ sync();
+#endif
+#else
+ sync();
+#endif
+}
+
+
+void *
+posix_fsyncer (void *d)
+{
+ xlator_t *this = d;
+ struct posix_private *priv = NULL;
+ call_stub_t *stub = NULL;
+ call_stub_t *tmp = NULL;
+ struct list_head list;
+ int count = 0;
+ gf_boolean_t do_fsync = _gf_true;
+
+ priv = this->private;
+
+ for (;;) {
+ INIT_LIST_HEAD (&list);
+
+ count = posix_fsyncer_pick (this, &list);
+
+ usleep (priv->batch_fsync_delay_usec);
+
+ gf_log (this->name, GF_LOG_DEBUG,
+ "picked %d fsyncs", count);
+
+ switch (priv->batch_fsync_mode) {
+ case BATCH_NONE:
+ case BATCH_REVERSE_FSYNC:
+ break;
+ case BATCH_SYNCFS:
+ case BATCH_SYNCFS_SINGLE_FSYNC:
+ case BATCH_SYNCFS_REVERSE_FSYNC:
+ posix_fsyncer_syncfs (this, &list);
+ break;
+ }
+
+ if (priv->batch_fsync_mode == BATCH_SYNCFS)
+ do_fsync = _gf_false;
+ else
+ do_fsync = _gf_true;
+
+ list_for_each_entry_safe_reverse (stub, tmp, &list, list) {
+ list_del_init (&stub->list);
+
+ posix_fsyncer_process (this, stub, do_fsync);
+
+ if (priv->batch_fsync_mode == BATCH_SYNCFS_SINGLE_FSYNC)
+ do_fsync = _gf_false;
+ }
+ }
+}
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index 86c35f44f..f7800184e 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -23,6 +23,8 @@
#include <pthread.h>
#include <ftw.h>
#include <sys/stat.h>
+#include <signal.h>
+#include <sys/uio.h>
#ifndef GF_BSD_HOST_OS
#include <alloca.h>
@@ -50,8 +52,10 @@
#include "glusterfs3-xdr.h"
#include "hashfn.h"
#include "posix-aio.h"
+#include "glusterfs-acl.h"
extern char *marker_xattrs[];
+#define ALIGN_SIZE 4096
#undef HAVE_SET_FSID
#ifdef HAVE_SET_FSID
@@ -75,7 +79,6 @@ extern char *marker_xattrs[];
#define SET_TO_OLD_FS_ID()
#endif
-
int
posix_forget (xlator_t *this, inode_t *inode)
{
@@ -101,15 +104,18 @@ posix_lookup (call_frame_t *frame, xlator_t *this,
char * par_path = NULL;
struct iatt postparent = {0,};
int32_t gfidless = 0;
+ struct posix_private *priv = NULL;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (loc, out);
+ priv = this->private;
+
/* The Hidden directory should be for housekeeping purpose and it
should not get any gfid on it */
- if (__is_root_gfid (loc->pargfid) &&
- (strcmp (loc->name, GF_HIDDEN_PATH) == 0)) {
+ if (__is_root_gfid (loc->pargfid) && loc->name
+ && (strcmp (loc->name, GF_HIDDEN_PATH) == 0)) {
gf_log (this->name, GF_LOG_WARNING,
"Lookup issued on %s, which is not permitted",
GF_HIDDEN_PATH);
@@ -120,14 +126,14 @@ posix_lookup (call_frame_t *frame, xlator_t *this,
op_ret = dict_get_int32 (xdata, GF_GFIDLESS_LOOKUP, &gfidless);
op_ret = -1;
- if (uuid_is_null (loc->pargfid)) {
+ if (uuid_is_null (loc->pargfid) || (loc->name == NULL)) {
/* nameless lookup */
MAKE_INODE_HANDLE (real_path, this, loc, &buf);
} else {
MAKE_ENTRY_HANDLE (real_path, par_path, this, loc, &buf);
if (uuid_is_null (loc->inode->gfid)) {
- posix_gfid_set (this, real_path, loc, xdata);
+ posix_gfid_heal (this, real_path, loc, xdata);
MAKE_ENTRY_HANDLE (real_path, par_path, this,
loc, &buf);
}
@@ -159,6 +165,12 @@ parent:
gf_log (this->name, GF_LOG_ERROR,
"post-operation lstat on parent %s failed: %s",
par_path, strerror (op_errno));
+ if (op_errno == ENOENT)
+ /* If parent directory is missing in a lookup,
+ errno should be ESTALE (bad handle) and not
+ ENOENT (missing entry)
+ */
+ op_errno = ESTALE;
goto out;
}
}
@@ -350,23 +362,23 @@ posix_setattr (call_frame_t *frame, xlator_t *this,
goto out;
}
- if (valid & GF_SET_ATTR_MODE) {
- op_ret = posix_do_chmod (this, real_path, stbuf);
+ if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)){
+ op_ret = posix_do_chown (this, real_path, stbuf, valid);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "setattr (chmod) on %s failed: %s", real_path,
+ "setattr (chown) on %s failed: %s", real_path,
strerror (op_errno));
goto out;
}
}
- if (valid & (GF_SET_ATTR_UID | GF_SET_ATTR_GID)){
- op_ret = posix_do_chown (this, real_path, stbuf, valid);
+ if (valid & GF_SET_ATTR_MODE) {
+ op_ret = posix_do_chmod (this, real_path, stbuf);
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
- "setattr (chown) on %s failed: %s", real_path,
+ "setattr (chmod) on %s failed: %s", real_path,
strerror (op_errno));
goto out;
}
@@ -560,6 +572,318 @@ out:
return 0;
}
+#ifdef FALLOC_FL_KEEP_SIZE
+static int32_t
+posix_do_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
+ off_t offset, size_t len, struct iatt *statpre,
+ struct iatt *statpost)
+{
+ struct posix_fd *pfd = NULL;
+ int32_t ret = -1;
+
+ DECLARE_OLD_FS_ID_VAR;
+
+ SET_FS_ID (frame->root->uid, frame->root->gid);
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+
+ ret = posix_fd_ctx_get (fd, this, &pfd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "pfd is NULL from fd=%p", fd);
+ goto out;
+ }
+
+ ret = posix_fdstat (this, pfd->fd, statpre);
+ if (ret == -1) {
+ ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "fallocate (fstat) failed on fd=%p: %s", fd,
+ strerror (errno));
+ goto out;
+ }
+
+ ret = sys_fallocate(pfd->fd, flags, offset, len);
+ if (ret == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ ret = posix_fdstat (this, pfd->fd, statpost);
+ if (ret == -1) {
+ ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "fallocate (fstat) failed on fd=%p: %s", fd,
+ strerror (errno));
+ goto out;
+ }
+
+out:
+ SET_TO_OLD_FS_ID ();
+
+ return ret;
+}
+#endif /* FALLOC_FL_KEEP_SIZE */
+
+char*
+_page_aligned_alloc (size_t size, char **aligned_buf)
+{
+ char *alloc_buf = NULL;
+ char *buf = NULL;
+
+ alloc_buf = GF_CALLOC (1, (size + ALIGN_SIZE), gf_posix_mt_char);
+ if (!alloc_buf)
+ goto out;
+ /* page aligned buffer */
+ buf = GF_ALIGN_BUF (alloc_buf, ALIGN_SIZE);
+ *aligned_buf = buf;
+out:
+ return alloc_buf;
+}
+
+static int32_t
+_posix_do_zerofill(int fd, off_t offset, off_t len, int o_direct)
+{
+ off_t num_vect = 0;
+ off_t num_loop = 1;
+ off_t idx = 0;
+ int32_t op_ret = -1;
+ int32_t vect_size = VECTOR_SIZE;
+ off_t remain = 0;
+ off_t extra = 0;
+ struct iovec *vector = NULL;
+ char *iov_base = NULL;
+ char *alloc_buf = NULL;
+
+ if (len == 0)
+ return 0;
+ if (len < VECTOR_SIZE)
+ vect_size = len;
+
+ num_vect = len / (vect_size);
+ remain = len % vect_size ;
+ if (num_vect > MAX_NO_VECT) {
+ extra = num_vect % MAX_NO_VECT;
+ num_loop = num_vect / MAX_NO_VECT;
+ num_vect = MAX_NO_VECT;
+ }
+
+ vector = GF_CALLOC (num_vect, sizeof(struct iovec),
+ gf_common_mt_iovec);
+ if (!vector)
+ return -1;
+ if (o_direct) {
+ alloc_buf = _page_aligned_alloc(vect_size, &iov_base);
+ if (!alloc_buf) {
+ gf_log ("_posix_do_zerofill", GF_LOG_DEBUG,
+ "memory alloc failed, vect_size %d: %s",
+ vect_size, strerror(errno));
+ GF_FREE(vector);
+ return -1;
+ }
+ } else {
+ iov_base = GF_CALLOC (vect_size, sizeof(char),
+ gf_common_mt_char);
+ if (!iov_base) {
+ GF_FREE(vector);
+ return -1;
+ }
+ }
+
+ for (idx = 0; idx < num_vect; idx++) {
+ vector[idx].iov_base = iov_base;
+ vector[idx].iov_len = vect_size;
+ }
+ if (lseek(fd, offset, SEEK_SET) < 0) {
+ op_ret = -1;
+ goto err;
+ }
+
+ for (idx = 0; idx < num_loop; idx++) {
+ op_ret = writev(fd, vector, num_vect);
+ if (op_ret < 0)
+ goto err;
+ }
+ if (extra) {
+ op_ret = writev(fd, vector, extra);
+ if (op_ret < 0)
+ goto err;
+ }
+ if (remain) {
+ vector[0].iov_len = remain;
+ op_ret = writev(fd, vector , 1);
+ if (op_ret < 0)
+ goto err;
+ }
+err:
+ if (o_direct)
+ GF_FREE(alloc_buf);
+ else
+ GF_FREE(iov_base);
+ GF_FREE(vector);
+ return op_ret;
+}
+
+static int32_t
+posix_do_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ off_t offset, off_t len, struct iatt *statpre,
+ struct iatt *statpost)
+{
+ struct posix_fd *pfd = NULL;
+ int32_t ret = -1;
+
+ DECLARE_OLD_FS_ID_VAR;
+
+ SET_FS_ID (frame->root->uid, frame->root->gid);
+
+ VALIDATE_OR_GOTO (frame, out);
+ VALIDATE_OR_GOTO (this, out);
+ VALIDATE_OR_GOTO (fd, out);
+
+ ret = posix_fd_ctx_get (fd, this, &pfd);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_DEBUG,
+ "pfd is NULL from fd=%p", fd);
+ goto out;
+ }
+
+ ret = posix_fdstat (this, pfd->fd, statpre);
+ if (ret == -1) {
+ ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "pre-operation fstat failed on fd = %p: %s", fd,
+ strerror (errno));
+ goto out;
+ }
+ ret = _posix_do_zerofill(pfd->fd, offset, len, pfd->flags & O_DIRECT);
+ if (ret < 0) {
+ ret = -errno;
+ gf_log(this->name, GF_LOG_ERROR,
+ "zerofill failed on fd %d length %" PRId64 " %s",
+ pfd->fd, len, strerror(errno));
+ goto out;
+ }
+ if (pfd->flags & (O_SYNC|O_DSYNC)) {
+ ret = fsync (pfd->fd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "fsync() in writev on fd %d failed: %s",
+ pfd->fd, strerror (errno));
+ ret = -errno;
+ goto out;
+ }
+ }
+
+ ret = posix_fdstat (this, pfd->fd, statpost);
+ if (ret == -1) {
+ ret = -errno;
+ gf_log (this->name, GF_LOG_ERROR,
+ "post operation fstat failed on fd=%p: %s", fd,
+ strerror (errno));
+ goto out;
+ }
+
+out:
+ SET_TO_OLD_FS_ID ();
+
+ return ret;
+}
+
+static int32_t
+_posix_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t keep_size,
+ off_t offset, size_t len, dict_t *xdata)
+{
+ int32_t ret;
+#ifndef FALLOC_FL_KEEP_SIZE
+ ret = EOPNOTSUPP;
+
+#else /* FALLOC_FL_KEEP_SIZE */
+ int32_t flags = 0;
+ struct iatt statpre = {0,};
+ struct iatt statpost = {0,};
+
+ if (keep_size)
+ flags = FALLOC_FL_KEEP_SIZE;
+
+ ret = posix_do_fallocate(frame, this, fd, flags, offset, len,
+ &statpre, &statpost);
+ if (ret < 0)
+ goto err;
+
+ STACK_UNWIND_STRICT(fallocate, frame, 0, 0, &statpre, &statpost, NULL);
+ return 0;
+
+err:
+#endif /* FALLOC_FL_KEEP_SIZE */
+ STACK_UNWIND_STRICT(fallocate, frame, -1, -ret, NULL, NULL, NULL);
+ return 0;
+}
+
+static int32_t
+posix_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
+{
+ int32_t ret;
+#ifndef FALLOC_FL_KEEP_SIZE
+ ret = EOPNOTSUPP;
+
+#else /* FALLOC_FL_KEEP_SIZE */
+ int32_t flags = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE;
+ struct iatt statpre = {0,};
+ struct iatt statpost = {0,};
+
+ ret = posix_do_fallocate(frame, this, fd, flags, offset, len,
+ &statpre, &statpost);
+ if (ret < 0)
+ goto err;
+
+ STACK_UNWIND_STRICT(discard, frame, 0, 0, &statpre, &statpost, NULL);
+ return 0;
+
+err:
+#endif /* FALLOC_FL_KEEP_SIZE */
+ STACK_UNWIND_STRICT(discard, frame, -1, -ret, NULL, NULL, NULL);
+ return 0;
+}
+
+static int32_t
+posix_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata)
+{
+ int32_t ret = 0;
+ struct iatt statpre = {0,};
+ struct iatt statpost = {0,};
+
+ ret = posix_do_zerofill(frame, this, fd, offset, len,
+ &statpre, &statpost);
+ if (ret < 0)
+ goto err;
+
+ STACK_UNWIND_STRICT(zerofill, frame, 0, 0, &statpre, &statpost, NULL);
+ return 0;
+
+err:
+ STACK_UNWIND_STRICT(zerofill, frame, -1, -ret, NULL, NULL, NULL);
+ return 0;
+
+}
+
+static int32_t
+posix_ipc (call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
+{
+ /*
+ * IPC is for inter-translator communication. If one gets here, it
+ * means somebody sent one that nobody else recognized, which is an
+ * error much like an uncaught exception.
+ */
+ gf_log (this->name, GF_LOG_ERROR, "GF_LOG_IPC(%d) not handled", op);
+ STACK_UNWIND_STRICT (ipc, frame, -1, -EOPNOTSUPP, NULL);
+ return 0;
+
+}
+
int32_t
posix_opendir (call_frame_t *frame, xlator_t *this,
loc_t *loc, fd_t *fd, dict_t *xdata)
@@ -575,7 +899,6 @@ posix_opendir (call_frame_t *frame, xlator_t *this,
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
VALIDATE_OR_GOTO (loc, out);
- VALIDATE_OR_GOTO (loc->path, out);
VALIDATE_OR_GOTO (fd, out);
SET_FS_ID (frame->root->uid, frame->root->gid);
@@ -727,18 +1050,20 @@ int
posix_mknod (call_frame_t *frame, xlator_t *this,
loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata)
{
- int tmp_fd = 0;
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char *real_path = 0;
- char *par_path = 0;
- struct iatt stbuf = { 0, };
- char was_present = 1;
- struct posix_private *priv = NULL;
- gid_t gid = 0;
- struct iatt preparent = {0,};
- struct iatt postparent = {0,};
- void * uuid_req = NULL;
+ int tmp_fd = 0;
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_path = 0;
+ char *par_path = 0;
+ struct iatt stbuf = { 0, };
+ char was_present = 1;
+ struct posix_private *priv = NULL;
+ gid_t gid = 0;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
+ void * uuid_req = NULL;
+ int32_t nlink_samepgfid = 0;
+ char *pgfid_xattr_key = NULL;
DECLARE_OLD_FS_ID_VAR;
@@ -841,6 +1166,16 @@ post_op:
strerror (errno));
}
+ if (priv->update_pgfid_nlinks) {
+ MAKE_PGFID_XATTR_KEY (pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX,
+ loc->pargfid);
+ nlink_samepgfid = 1;
+
+ SET_PGFID_XATTR (real_path, pgfid_xattr_key, nlink_samepgfid,
+ XATTR_CREATE, op_ret, this, ignore);
+ }
+
+ignore:
op_ret = posix_entry_create_xattr_set (this, real_path, xdata);
if (op_ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -969,7 +1304,6 @@ posix_mkdir (call_frame_t *frame, xlator_t *this,
goto out;
}
#endif
-
op_ret = posix_acl_xattr_set (this, real_path, xdata);
if (op_ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -1023,15 +1357,17 @@ int32_t
posix_unlink (call_frame_t *frame, xlator_t *this,
loc_t *loc, int xflag, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char *real_path = NULL;
- char *par_path = NULL;
- int32_t fd = -1;
- struct iatt stbuf = {0,};
- struct posix_private *priv = NULL;
- struct iatt preparent = {0,};
- struct iatt postparent = {0,};
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_path = NULL;
+ char *par_path = NULL;
+ int32_t fd = -1;
+ struct iatt stbuf = {0,};
+ struct posix_private *priv = NULL;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
+ char *pgfid_xattr_key = NULL;
+ int32_t nlink_samepgfid = 0;
DECLARE_OLD_FS_ID_VAR;
@@ -1069,6 +1405,26 @@ posix_unlink (call_frame_t *frame, xlator_t *this,
}
}
+ if (priv->update_pgfid_nlinks && (stbuf.ia_nlink > 1)) {
+ MAKE_PGFID_XATTR_KEY (pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX,
+ loc->pargfid);
+ LOCK (&loc->inode->lock);
+ {
+ UNLINK_MODIFY_PGFID_XATTR (real_path, pgfid_xattr_key,
+ nlink_samepgfid, 0, op_ret,
+ this, unlock);
+ }
+ unlock:
+ UNLOCK (&loc->inode->lock);
+
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING, "modification of "
+ "parent gfid xattr failed (path:%s gfid:%s)",
+ real_path, uuid_utoa (loc->inode->gfid));
+ goto out;
+ }
+ }
+
op_ret = sys_unlink (real_path);
if (op_ret == -1) {
op_errno = errno;
@@ -1210,16 +1566,18 @@ int
posix_symlink (call_frame_t *frame, xlator_t *this,
const char *linkname, loc_t *loc, mode_t umask, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char * real_path = 0;
- char * par_path = 0;
- struct iatt stbuf = { 0, };
- struct posix_private *priv = NULL;
- gid_t gid = 0;
- char was_present = 1;
- struct iatt preparent = {0,};
- struct iatt postparent = {0,};
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char * real_path = 0;
+ char * par_path = 0;
+ struct iatt stbuf = { 0, };
+ struct posix_private *priv = NULL;
+ gid_t gid = 0;
+ char was_present = 1;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
+ char *pgfid_xattr_key = NULL;
+ int32_t nlink_samepgfid = 0;
DECLARE_OLD_FS_ID_VAR;
@@ -1280,7 +1638,6 @@ posix_symlink (call_frame_t *frame, xlator_t *this,
goto out;
}
#endif
-
op_ret = posix_acl_xattr_set (this, real_path, xdata);
if (op_ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -1288,6 +1645,14 @@ posix_symlink (call_frame_t *frame, xlator_t *this,
strerror (errno));
}
+ if (priv->update_pgfid_nlinks) {
+ MAKE_PGFID_XATTR_KEY (pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX,
+ loc->pargfid);
+ nlink_samepgfid = 1;
+ SET_PGFID_XATTR (real_path, pgfid_xattr_key, nlink_samepgfid,
+ XATTR_CREATE, op_ret, this, ignore);
+ }
+ignore:
op_ret = posix_entry_create_xattr_set (this, real_path, xdata);
if (op_ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -1334,24 +1699,26 @@ int
posix_rename (call_frame_t *frame, xlator_t *this,
loc_t *oldloc, loc_t *newloc, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char *real_oldpath = NULL;
- char *real_newpath = NULL;
- char *par_oldpath = NULL;
- char *par_newpath = NULL;
- struct iatt stbuf = {0, };
- struct posix_private *priv = NULL;
- char was_present = 1;
- struct iatt preoldparent = {0, };
- struct iatt postoldparent = {0, };
- struct iatt prenewparent = {0, };
- struct iatt postnewparent = {0, };
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_oldpath = NULL;
+ char *real_newpath = NULL;
+ char *par_oldpath = NULL;
+ char *par_newpath = NULL;
+ struct iatt stbuf = {0, };
+ struct posix_private *priv = NULL;
+ char was_present = 1;
+ struct iatt preoldparent = {0, };
+ struct iatt postoldparent = {0, };
+ struct iatt prenewparent = {0, };
+ struct iatt postnewparent = {0, };
char olddirid[64];
char newdirid[64];
- uuid_t victim = {0};
- int was_dir = 0;
- int nlink = 0;
+ uuid_t victim = {0};
+ int was_dir = 0;
+ int nlink = 0;
+ char *pgfid_xattr_key = NULL;
+ int32_t nlink_samepgfid = 0;
DECLARE_OLD_FS_ID_VAR;
@@ -1416,17 +1783,64 @@ posix_rename (call_frame_t *frame, xlator_t *this,
goto out;
}
- if (IA_ISDIR (oldloc->inode->ia_type)) {
+ if (IA_ISDIR (oldloc->inode->ia_type))
posix_handle_unset (this, oldloc->inode->gfid, NULL);
+
+ LOCK (&oldloc->inode->lock);
+ {
+ if (!IA_ISDIR (oldloc->inode->ia_type)
+ && priv->update_pgfid_nlinks) {
+ MAKE_PGFID_XATTR_KEY (pgfid_xattr_key,
+ PGFID_XATTR_KEY_PREFIX,
+ oldloc->pargfid);
+ UNLINK_MODIFY_PGFID_XATTR (real_oldpath,
+ pgfid_xattr_key,
+ nlink_samepgfid, 0,
+ op_ret,
+ this, unlock);
+ }
+
+ op_ret = sys_rename (real_oldpath, real_newpath);
+ if (op_ret == -1) {
+ op_errno = errno;
+ gf_log (this->name,
+ (op_errno == ENOTEMPTY ? GF_LOG_DEBUG
+ : GF_LOG_ERROR),
+ "rename of %s to %s failed: %s",
+ real_oldpath, real_newpath,
+ strerror (op_errno));
+
+ if (priv->update_pgfid_nlinks
+ && !IA_ISDIR (oldloc->inode->ia_type)) {
+ LINK_MODIFY_PGFID_XATTR (real_oldpath,
+ pgfid_xattr_key,
+ nlink_samepgfid, 0,
+ op_ret,
+ this, unlock);
+ }
+
+ goto unlock;
+ }
+
+ if (!IA_ISDIR (oldloc->inode->ia_type)
+ && priv->update_pgfid_nlinks) {
+ MAKE_PGFID_XATTR_KEY (pgfid_xattr_key,
+ PGFID_XATTR_KEY_PREFIX,
+ newloc->pargfid);
+ LINK_MODIFY_PGFID_XATTR (real_newpath,
+ pgfid_xattr_key,
+ nlink_samepgfid, 0,
+ op_ret,
+ this, unlock);
+ }
}
+unlock:
+ UNLOCK (&oldloc->inode->lock);
- op_ret = sys_rename (real_oldpath, real_newpath);
- if (op_ret == -1) {
- op_errno = errno;
- gf_log (this->name,
- (op_errno == ENOTEMPTY ? GF_LOG_DEBUG : GF_LOG_ERROR),
- "rename of %s to %s failed: %s",
- real_oldpath, real_newpath, strerror (op_errno));
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING, "modification of "
+ "parent gfid xattr failed (gfid:%s)",
+ uuid_utoa (oldloc->inode->gfid));
goto out;
}
@@ -1490,16 +1904,18 @@ int
posix_link (call_frame_t *frame, xlator_t *this,
loc_t *oldloc, loc_t *newloc, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- char *real_oldpath = 0;
- char *real_newpath = 0;
- char *par_newpath = 0;
- struct iatt stbuf = {0, };
- struct posix_private *priv = NULL;
- char was_present = 1;
- struct iatt preparent = {0,};
- struct iatt postparent = {0,};
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ char *real_oldpath = 0;
+ char *real_newpath = 0;
+ char *par_newpath = 0;
+ struct iatt stbuf = {0, };
+ struct posix_private *priv = NULL;
+ char was_present = 1;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
+ int32_t nlink_samepgfid = 0;
+ char *pgfid_xattr_key = NULL;
DECLARE_OLD_FS_ID_VAR;
@@ -1527,18 +1943,9 @@ posix_link (call_frame_t *frame, xlator_t *this,
goto out;
}
-#ifdef HAVE_LINKAT
- /*
- * On most systems (Linux being the notable exception), link(2)
- * first resolves symlinks. If the target is a directory or
- * is nonexistent, it will fail. linkat(2) operates on the
- * symlink instead of its target when the AT_SYMLINK_FOLLOW
- * flag is not supplied.
- */
- op_ret = linkat (AT_FDCWD, real_oldpath, AT_FDCWD, real_newpath, 0);
-#else
- op_ret = link (real_oldpath, real_newpath);
-#endif
+
+ op_ret = sys_link (real_oldpath, real_newpath);
+
if (op_ret == -1) {
op_errno = errno;
gf_log (this->name, GF_LOG_ERROR,
@@ -1564,6 +1971,27 @@ posix_link (call_frame_t *frame, xlator_t *this,
goto out;
}
+ if (priv->update_pgfid_nlinks) {
+ MAKE_PGFID_XATTR_KEY (pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX,
+ newloc->pargfid);
+
+ LOCK (&newloc->inode->lock);
+ {
+ LINK_MODIFY_PGFID_XATTR (real_newpath, pgfid_xattr_key,
+ nlink_samepgfid, 0, op_ret,
+ this, unlock);
+ }
+ unlock:
+ UNLOCK (&newloc->inode->lock);
+
+ if (op_ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING, "modification of "
+ "parent gfid xattr failed (path:%s gfid:%s)",
+ real_newpath, uuid_utoa (newloc->inode->gfid));
+ goto out;
+ }
+ }
+
op_ret = 0;
out:
@@ -1630,7 +2058,6 @@ posix_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
}
op_ret = 0;
-
out:
SET_TO_OLD_FS_ID ();
@@ -1646,20 +2073,23 @@ posix_create (call_frame_t *frame, xlator_t *this,
loc_t *loc, int32_t flags, mode_t mode,
mode_t umask, fd_t *fd, dict_t *xdata)
{
- int32_t op_ret = -1;
- int32_t op_errno = 0;
- int32_t _fd = -1;
- int _flags = 0;
- char * real_path = NULL;
- char * par_path = NULL;
- struct iatt stbuf = {0, };
- struct posix_fd * pfd = NULL;
- struct posix_private * priv = NULL;
- char was_present = 1;
-
- gid_t gid = 0;
- struct iatt preparent = {0,};
- struct iatt postparent = {0,};
+ int32_t op_ret = -1;
+ int32_t op_errno = 0;
+ int32_t _fd = -1;
+ int _flags = 0;
+ char * real_path = NULL;
+ char * par_path = NULL;
+ struct iatt stbuf = {0, };
+ struct posix_fd * pfd = NULL;
+ struct posix_private * priv = NULL;
+ char was_present = 1;
+
+ gid_t gid = 0;
+ struct iatt preparent = {0,};
+ struct iatt postparent = {0,};
+
+ int nlink_samepgfid = 0;
+ char * pgfid_xattr_key = NULL;
DECLARE_OLD_FS_ID_VAR;
@@ -1717,6 +2147,9 @@ posix_create (call_frame_t *frame, xlator_t *this,
goto out;
}
+ if (was_present)
+ goto fill_stat;
+
op_ret = posix_gfid_set (this, real_path, loc, xdata);
if (op_ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -1732,7 +2165,6 @@ posix_create (call_frame_t *frame, xlator_t *this,
real_path, strerror (op_errno));
}
#endif
-
op_ret = posix_acl_xattr_set (this, real_path, xdata);
if (op_ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -1740,6 +2172,14 @@ posix_create (call_frame_t *frame, xlator_t *this,
strerror (errno));
}
+ if (priv->update_pgfid_nlinks) {
+ MAKE_PGFID_XATTR_KEY (pgfid_xattr_key, PGFID_XATTR_KEY_PREFIX,
+ loc->pargfid);
+ nlink_samepgfid = 1;
+ SET_PGFID_XATTR (real_path, pgfid_xattr_key, nlink_samepgfid,
+ XATTR_CREATE, op_ret, this, ignore);
+ }
+ignore:
op_ret = posix_entry_create_xattr_set (this, real_path, xdata);
if (op_ret) {
gf_log (this->name, GF_LOG_ERROR,
@@ -1747,6 +2187,7 @@ posix_create (call_frame_t *frame, xlator_t *this,
strerror (errno));
}
+fill_stat:
op_ret = posix_fdstat (this, _fd, &stbuf);
if (op_ret == -1) {
op_errno = errno;
@@ -1964,11 +2405,7 @@ posix_readv (call_frame_t *frame, xlator_t *this,
}
/* Hack to notify higher layers of EOF. */
- if (stbuf.ia_size == 0)
- op_errno = ENOENT;
- else if ((offset + vec.iov_len) == stbuf.ia_size)
- op_errno = ENOENT;
- else if (offset > stbuf.ia_size)
+ if (!stbuf.ia_size || (offset + vec.iov_len) >= stbuf.ia_size)
op_errno = ENOENT;
op_ret = vec.iov_len;
@@ -2013,14 +2450,12 @@ err:
return op_ret;
}
-
int32_t
__posix_writev (int fd, struct iovec *vector, int count, off_t startoff,
int odirect)
{
int32_t op_ret = 0;
int idx = 0;
- int align = 4096;
int max_buf_size = 0;
int retval = 0;
char *buf = NULL;
@@ -2036,7 +2471,7 @@ __posix_writev (int fd, struct iovec *vector, int count, off_t startoff,
max_buf_size = vector[idx].iov_len;
}
- alloc_buf = GF_MALLOC (1 * (max_buf_size + align), gf_posix_mt_char);
+ alloc_buf = _page_aligned_alloc (max_buf_size, &buf);
if (!alloc_buf) {
op_ret = -errno;
goto err;
@@ -2044,9 +2479,6 @@ __posix_writev (int fd, struct iovec *vector, int count, off_t startoff,
internal_off = startoff;
for (idx = 0; idx < count; idx++) {
- /* page aligned buffer */
- buf = GF_ALIGN_BUF (alloc_buf, align);
-
memcpy (buf, vector[idx].iov_base, vector[idx].iov_len);
/* not sure whether writev works on O_DIRECT'd fd */
@@ -2066,6 +2498,48 @@ err:
return op_ret;
}
+dict_t*
+_fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append)
+{
+ dict_t *rsp_xdata = NULL;
+ int32_t ret = 0;
+ inode_t *inode = NULL;
+
+ if (fd)
+ inode = fd->inode;
+
+ if (!fd || !fd->inode || uuid_is_null (fd->inode->gfid)) {
+ gf_log_callingfn (this->name, GF_LOG_ERROR, "Invalid Args: "
+ "fd: %p inode: %p gfid:%s", fd, inode?inode:0,
+ inode?uuid_utoa(inode->gfid):"N/A");
+ goto out;
+ }
+
+ if (!xdata || !dict_get (xdata, GLUSTERFS_OPEN_FD_COUNT))
+ goto out;
+
+ rsp_xdata = dict_new();
+ if (!rsp_xdata)
+ goto out;
+
+ ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_OPEN_FD_COUNT,
+ fd->inode->fd_count);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set "
+ "dictionary value for %s", uuid_utoa (fd->inode->gfid),
+ GLUSTERFS_OPEN_FD_COUNT);
+ }
+
+ ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND,
+ is_append);
+ if (ret < 0) {
+ gf_log (this->name, GF_LOG_WARNING, "%s: Failed to set "
+ "dictionary value for %s", uuid_utoa (fd->inode->gfid),
+ GLUSTERFS_WRITE_IS_APPEND);
+ }
+out:
+ return rsp_xdata;
+}
int32_t
posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
@@ -2080,6 +2554,9 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
struct iatt preop = {0,};
struct iatt postop = {0,};
int ret = -1;
+ dict_t *rsp_xdata = NULL;
+ int is_append = 0;
+ gf_boolean_t locked = _gf_false;
VALIDATE_OR_GOTO (frame, out);
VALIDATE_OR_GOTO (this, out);
@@ -2101,6 +2578,17 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
_fd = pfd->fd;
+ if (xdata && dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) {
+ /* The write_is_append check and write must happen
+ atomically. Else another write can overtake this
+ write after the check and get written earlier.
+
+ So lock before preop-stat and unlock after write.
+ */
+ locked = _gf_true;
+ LOCK(&fd->inode->lock);
+ }
+
op_ret = posix_fdstat (this, _fd, &preop);
if (op_ret == -1) {
op_errno = errno;
@@ -2110,8 +2598,19 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
goto out;
}
+ if (locked) {
+ if (preop.ia_size == offset || (fd->flags & O_APPEND))
+ is_append = 1;
+ }
+
op_ret = __posix_writev (_fd, vector, count, offset,
(pfd->flags & O_DIRECT));
+
+ if (locked) {
+ UNLOCK (&fd->inode->lock);
+ locked = _gf_false;
+ }
+
if (op_ret < 0) {
op_errno = -op_ret;
op_ret = -1;
@@ -2127,14 +2626,21 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
UNLOCK (&priv->lock);
if (op_ret >= 0) {
+ rsp_xdata = _fill_writev_xdata (fd, xdata, this, is_append);
/* wiretv successful, we also need to get the stat of
* the file we wrote to
*/
- if (pfd->flushwrites) {
- /* NOTE: ignore the error, if one occurs at this
- * point */
- fsync (_fd);
+ if (flags & (O_SYNC|O_DSYNC)) {
+ ret = fsync (_fd);
+ if (ret) {
+ gf_log (this->name, GF_LOG_ERROR,
+ "fsync() in writev on fd %d failed: %s",
+ _fd, strerror (errno));
+ op_ret = -1;
+ op_errno = errno;
+ goto out;
+ }
}
ret = posix_fdstat (this, _fd, &postop);
@@ -2150,9 +2656,16 @@ posix_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
out:
+ if (locked) {
+ UNLOCK (&fd->inode->lock);
+ locked = _gf_false;
+ }
+
STACK_UNWIND_STRICT (writev, frame, op_ret, op_errno, &preop, &postop,
- NULL);
+ rsp_xdata);
+ if (rsp_xdata)
+ dict_unref (rsp_xdata);
return 0;
}
@@ -2279,6 +2792,33 @@ out:
}
+int
+posix_batch_fsync (call_frame_t *frame, xlator_t *this,
+ fd_t *fd, int datasync, dict_t *xdata)
+{
+ call_stub_t *stub = NULL;
+ struct posix_private *priv = NULL;
+
+ priv = this->private;
+
+ stub = fop_fsync_stub (frame, default_fsync, fd, datasync, xdata);
+ if (!stub) {
+ STACK_UNWIND_STRICT (fsync, frame, -1, ENOMEM, 0, 0, 0);
+ return 0;
+ }
+
+ pthread_mutex_lock (&priv->fsync_mutex);
+ {
+ list_add_tail (&stub->list, &priv->fsyncs);
+ priv->fsync_queue_count++;
+ pthread_cond_signal (&priv->fsync_cond);
+ }
+ pthread_mutex_unlock (&priv->fsync_mutex);
+
+ return 0;
+}
+
+
int32_t
posix_fsync (call_frame_t *frame, xlator_t *this,
fd_t *fd, int32_t datasync, dict_t *xdata)
@@ -2290,6 +2830,7 @@ posix_fsync (call_frame_t *frame, xlator_t *this,
int ret = -1;
struct iatt preop = {0,};
struct iatt postop = {0,};
+ struct posix_private *priv = NULL;
DECLARE_OLD_FS_ID_VAR;